From 5463c7904c952aa6b6804dd902c72a5332fa5221 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Mon, 27 Mar 2006 01:17:58 -0800 Subject: [PATCH] dm/md dependency tree in sysfs: md to use bd_claim_by_disk Use bd_claim_by_disk. Following symlinks are created if md0 is built from sda and sdb /sys/block/md0/slaves/sda --> /sys/block/sda /sys/block/md0/slaves/sdb --> /sys/block/sdb /sys/block/sda/holders/md0 --> /sys/block/md0 /sys/block/sdb/holders/md0 --> /sys/block/md0 Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5ed2228745cb..bde3e9682250 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1301,6 +1301,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) else ko = &rdev->bdev->bd_disk->kobj; sysfs_create_link(&rdev->kobj, ko, "block"); + bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk); return 0; } @@ -1311,6 +1312,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } + bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); list_del_init(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; -- cgit v1.2.3 From 89e5c8b5b85d6d46e8a28cdfa076313ae691d35c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:02 -0800 Subject: [PATCH] md: Make sure QUEUE_FLAG_CLUSTER is set properly for md. This flag should be set for a virtual device iff it is set for all underlying devices. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/ll_rw_blk.c | 2 ++ drivers/md/md.c | 1 + 2 files changed, 3 insertions(+) (limited to 'drivers/md/md.c') diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 062067fa7ead..a2e333ad0b64 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -785,6 +785,8 @@ void blk_queue_stack_limits(request_queue_t *t, request_queue_t *b) t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); t->max_segment_size = min(t->max_segment_size,b->max_segment_size); t->hardsect_size = max(t->hardsect_size,b->hardsect_size); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); } EXPORT_SYMBOL(blk_queue_stack_limits); diff --git a/drivers/md/md.c b/drivers/md/md.c index bde3e9682250..3254ff1a5cc9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,6 +266,7 @@ static mddev_t * mddev_find(dev_t unit) kfree(new); return NULL; } + set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); blk_queue_make_request(new->queue, md_fail_request); -- cgit v1.2.3 From c5a10f62c5c496c49db749af103b991873b7e2dc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:03 -0800 Subject: [PATCH] md: Add '4' to the list of levels for which bitmaps are supported I really should make this a function of the personality.... Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3254ff1a5cc9..875bced88e36 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -765,7 +765,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<bitmap_file == NULL) { - if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6 + if (mddev->level != 1 && mddev->level != 4 + && mddev->level != 5 && mddev->level != 6 && mddev->level != 10) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps not supported for this level.\n"); -- cgit v1.2.3 From 1be7892fffb45f6017494a88ff68fe84c6de26b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:03 -0800 Subject: [PATCH] md: Fix the 'failed' count for version-0 superblocks We are counting failed devices twice, once of the device that is failed, and once for the hole that has been left in the array. Remove the former so 'failed' matches 'missing'. Storing these counts in the superblock is a bit silly anyway.... Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 875bced88e36..686314f070a5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -895,10 +895,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ - if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(Faulty, &rdev2->flags)) d->state = (1<flags)) { + else if (test_bit(In_sync, &rdev2->flags)) { d->state = (1<state |= (1< Date: Mon, 27 Mar 2006 01:18:04 -0800 Subject: [PATCH] md: Update status_resync to handle LARGE devices status_resync - used by /proc/mdstat to report the status of a resync, assumes that device sizes will always fit into an 'unsigned long' This is no longer the case... Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 686314f070a5..a3ecaf8ed30a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4044,7 +4044,10 @@ static void status_unused(struct seq_file *seq) static void status_resync(struct seq_file *seq, mddev_t * mddev) { - unsigned long max_blocks, resync, res, dt, db, rt; + sector_t max_blocks, resync, res; + unsigned long dt, db, rt; + int scale; + unsigned int per_milli; resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; @@ -4060,9 +4063,22 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) MD_BUG(); return; } - res = (resync/1024)*1000/(max_blocks/1024 + 1); + /* Pick 'scale' such that (resync>>scale)*1000 will fit + * in a sector_t, and (max_blocks>>scale) will fit in a + * u32, as those are the requirements for sector_div. + * Thus 'scale' must be at least 10 + */ + scale = 10; + if (sizeof(sector_t) > sizeof(unsigned long)) { + while ( max_blocks/2 > (1ULL<<(scale+32))) + scale++; + } + res = (resync>>scale)*1000; + sector_div(res, (u32)((max_blocks>>scale)+1)); + + per_milli = res; { - int i, x = res/50, y = 20-x; + int i, x = per_milli/50, y = 20-x; seq_printf(seq, "["); for (i = 0; i < x; i++) seq_printf(seq, "="); @@ -4071,10 +4087,12 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, "."); seq_printf(seq, "] "); } - seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)", + seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"), - res/10, res % 10, resync, max_blocks); + per_milli/10, per_milli % 10, + (unsigned long long) resync, + (unsigned long long) max_blocks); /* * We do not want to overflow, so the order of operands and @@ -4088,7 +4106,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; db = resync - (mddev->resync_mark_cnt/2); - rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100; seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6); -- cgit v1.2.3 From ad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:07 -0800 Subject: [PATCH] md: Allow stripes to be expanded in preparation for expanding an array Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache data structure. This requires allocating new stripes in a new kmem_cache. If this succeeds, we copy cache pages over and release the old stripes and kmem_cache. We then allocate new pages. If that fails, we leave the stripe cache at it's new size. It isn't worth the effort to shrink it back again. Unfortuanately this means we need two kmem_cache names as we, for a short period of time, we have two kmem_caches. So they are raid5/%s and raid5/%s-alt Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- drivers/md/raid5.c | 131 +++++++++++++++++++++++++++++++++++++++++++-- drivers/md/raid6main.c | 4 +- include/linux/raid/raid5.h | 9 +++- 4 files changed, 137 insertions(+), 9 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index a3ecaf8ed30a..c7b7656f9aa5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2775,7 +2775,6 @@ static void autorun_array(mddev_t *mddev) */ static void autorun_devices(int part) { - struct list_head candidates; struct list_head *tmp; mdk_rdev_t *rdev0, *rdev; mddev_t *mddev; @@ -2784,6 +2783,7 @@ static void autorun_devices(int part) printk(KERN_INFO "md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { dev_t dev; + LIST_HEAD(candidates); rdev0 = list_entry(pending_raid_disks.next, mdk_rdev_t, same_set); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 03f31379cebb..6c20b44509d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -313,20 +313,143 @@ static int grow_stripes(raid5_conf_t *conf, int num) kmem_cache_t *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev)); - - sc = kmem_cache_create(conf->cache_name, + sprintf(conf->cache_name[0], "raid5/%s", mdname(conf->mddev)); + sprintf(conf->cache_name[1], "raid5/%s-alt", mdname(conf->mddev)); + conf->active_name = 0; + sc = kmem_cache_create(conf->cache_name[conf->active_name], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 0, 0, NULL, NULL); if (!sc) return 1; conf->slab_cache = sc; + conf->pool_size = devs; while (num--) { if (!grow_one_stripe(conf)) return 1; } return 0; } +static int resize_stripes(raid5_conf_t *conf, int newsize) +{ + /* Make all the stripes able to hold 'newsize' devices. + * New slots in each stripe get 'page' set to a new page. + * + * This happens in stages: + * 1/ create a new kmem_cache and allocate the required number of + * stripe_heads. + * 2/ gather all the old stripe_heads and tranfer the pages across + * to the new stripe_heads. This will have the side effect of + * freezing the array as once all stripe_heads have been collected, + * no IO will be possible. Old stripe heads are freed once their + * pages have been transferred over, and the old kmem_cache is + * freed when all stripes are done. + * 3/ reallocate conf->disks to be suitable bigger. If this fails, + * we simple return a failre status - no need to clean anything up. + * 4/ allocate new pages for the new slots in the new stripe_heads. + * If this fails, we don't bother trying the shrink the + * stripe_heads down again, we just leave them as they are. + * As each stripe_head is processed the new one is released into + * active service. + * + * Once step2 is started, we cannot afford to wait for a write, + * so we use GFP_NOIO allocations. + */ + struct stripe_head *osh, *nsh; + LIST_HEAD(newstripes); + struct disk_info *ndisks; + int err = 0; + kmem_cache_t *sc; + int i; + + if (newsize <= conf->pool_size) + return 0; /* never bother to shrink */ + + /* Step 1 */ + sc = kmem_cache_create(conf->cache_name[1-conf->active_name], + sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), + 0, 0, NULL, NULL); + if (!sc) + return -ENOMEM; + + for (i = conf->max_nr_stripes; i; i--) { + nsh = kmem_cache_alloc(sc, GFP_KERNEL); + if (!nsh) + break; + + memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); + + nsh->raid_conf = conf; + spin_lock_init(&nsh->lock); + + list_add(&nsh->lru, &newstripes); + } + if (i) { + /* didn't get enough, give up */ + while (!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del(&nsh->lru); + kmem_cache_free(sc, nsh); + } + kmem_cache_destroy(sc); + return -ENOMEM; + } + /* Step 2 - Must use GFP_NOIO now. + * OK, we have enough stripes, start collecting inactive + * stripes and copying them over + */ + list_for_each_entry(nsh, &newstripes, lru) { + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf->inactive_list), + conf->device_lock, + unplug_slaves(conf->mddev); + ); + osh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + atomic_set(&nsh->count, 1); + for(i=0; ipool_size; i++) + nsh->dev[i].page = osh->dev[i].page; + for( ; idev[i].page = NULL; + kmem_cache_free(conf->slab_cache, osh); + } + kmem_cache_destroy(conf->slab_cache); + + /* Step 3. + * At this point, we are holding all the stripes so the array + * is completely stalled, so now is a good time to resize + * conf->disks. + */ + ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); + if (ndisks) { + for (i=0; iraid_disks; i++) + ndisks[i] = conf->disks[i]; + kfree(conf->disks); + conf->disks = ndisks; + } else + err = -ENOMEM; + + /* Step 4, return new stripes to service */ + while(!list_empty(&newstripes)) { + nsh = list_entry(newstripes.next, struct stripe_head, lru); + list_del_init(&nsh->lru); + for (i=conf->raid_disks; i < newsize; i++) + if (nsh->dev[i].page == NULL) { + struct page *p = alloc_page(GFP_NOIO); + nsh->dev[i].page = p; + if (!p) + err = -ENOMEM; + } + release_stripe(nsh); + } + /* critical section pass, GFP_NOIO no longer needed */ + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + conf->pool_size = newsize; + return err; +} + static int drop_one_stripe(raid5_conf_t *conf) { @@ -339,7 +462,7 @@ static int drop_one_stripe(raid5_conf_t *conf) return 0; if (atomic_read(&sh->count)) BUG(); - shrink_buffers(sh, conf->raid_disks); + shrink_buffers(sh, conf->pool_size); kmem_cache_free(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); return 1; diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index c7632f6cc487..6df4930fddec 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -331,9 +331,9 @@ static int grow_stripes(raid6_conf_t *conf, int num) kmem_cache_t *sc; int devs = conf->raid_disks; - sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev)); + sprintf(conf->cache_name[0], "raid6/%s", mdname(conf->mddev)); - sc = kmem_cache_create(conf->cache_name, + sc = kmem_cache_create(conf->cache_name[0], sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 0, 0, NULL, NULL); if (!sc) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 94dbdd406f12..b7b2653af7bb 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -216,7 +216,11 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ - char cache_name[20]; + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; kmem_cache_t *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; @@ -238,7 +242,8 @@ struct raid5_private_data { wait_queue_head_t wait_for_overlap; int inactive_blocked; /* release of inactive stripes blocked, * waiting for 25% to be free - */ + */ + int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; }; -- cgit v1.2.3 From ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:09 -0800 Subject: [PATCH] md: Core of raid5 resize process This patch provides the core of the resize/expand process. sync_request notices if a 'reshape' is happening and acts accordingly. It allocated new stripe_heads for the next chunk-wide-stripe in the target geometry, marking them STRIPE_EXPANDING. Then it finds which stripe heads in the old geometry can provide data needed by these and marks them STRIPE_EXPAND_SOURCE. This causes stripe_handle to read all blocks on those stripes. Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are needed are copied into the corresponding STRIPE_EXPANDING stripe_head. Once a STRIPE_EXPANDING stripe_head is full, it is marks STRIPE_EXPAND_READY and then is written out and released. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 +++- drivers/md/raid5.c | 185 +++++++++++++++++++++++++++++++++++++++------ include/linux/raid/md_k.h | 4 + include/linux/raid/raid5.h | 4 +- 4 files changed, 181 insertions(+), 26 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index c7b7656f9aa5..8e65986bc63f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2165,7 +2165,9 @@ action_show(mddev_t *mddev, char *page) char *type = "idle"; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + type = "reshape"; + else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) @@ -4088,8 +4090,10 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) seq_printf(seq, "] "); } seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", + (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? + "reshape" : (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? - "resync" : "recovery"), + "resync" : "recovery")), per_milli/10, per_milli % 10, (unsigned long long) resync, (unsigned long long) max_blocks); @@ -4543,7 +4547,9 @@ static void md_do_sync(mddev_t *mddev) */ max_sectors = mddev->resync_max_sectors; mddev->resync_mismatches = 0; - } else + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sectors = mddev->size << 1; + else /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; @@ -4679,6 +4685,8 @@ static void md_do_sync(mddev_t *mddev) mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2 && mddev->curr_resync >= mddev->recovery_cp) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7a6df515b008..56cba8d3e398 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -93,11 +93,11 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); } - list_add_tail(&sh->lru, &conf->inactive_list); atomic_dec(&conf->active_stripes); - if (!conf->inactive_blocked || - atomic_read(&conf->active_stripes) < (conf->max_nr_stripes*3/4)) + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + list_add_tail(&sh->lru, &conf->inactive_list); wake_up(&conf->wait_for_stripe); + } } } } @@ -273,9 +273,8 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru)) - BUG(); - list_del_init(&sh->lru); + if (!list_empty(&sh->lru)) + list_del_init(&sh->lru); } } } while (sh == NULL); @@ -1035,6 +1034,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in return 0; } +static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) +{ + int sectors_per_chunk = conf->chunk_size >> 9; + sector_t x = stripe; + int pd_idx, dd_idx; + int chunk_offset = sector_div(x, sectors_per_chunk); + stripe = x; + raid5_compute_sector(stripe*(disks-1)*sectors_per_chunk + + chunk_offset, disks, disks-1, &dd_idx, &pd_idx, conf); + return pd_idx; +} + /* * handle_stripe - do things to a stripe. @@ -1061,7 +1072,7 @@ static void handle_stripe(struct stripe_head *sh) struct bio *return_bi= NULL; struct bio *bi; int i; - int syncing; + int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int non_overwrite = 0; int failed_num=0; @@ -1076,6 +1087,8 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_DELAYED, &sh->state); syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -1268,13 +1281,14 @@ static void handle_stripe(struct stripe_head *sh) * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (syncing && (uptodate < disks))) { + if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || + expanding || (failed && (sh->dev[failed_num].toread || (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags)))) ) @@ -1464,13 +1478,76 @@ static void handle_stripe(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); set_bit(R5_LOCKED, &dev->flags); + locked++; } } + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); + compute_parity(sh, RECONSTRUCT_WRITE); + for (i= conf->raid_disks; i--;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, &sh->state); + } else if (expanded) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (expanding && locked == 0) { + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i=0; i< sh->disks; i++) + if (i != sh->pd_idx) { + int dd_idx, pd_idx, j; + struct stripe_head *sh2; + + sector_t bn = compute_blocknr(sh, i); + sector_t s = raid5_compute_sector(bn, conf->raid_disks, + conf->raid_disks-1, + &dd_idx, &pd_idx, conf); + sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1); + if (sh2 == NULL) + /* so far only the early blocks of this stripe + * have been requested. When later blocks + * get requested, we will try again + */ + continue; + if(!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { + /* must have already done this block */ + release_stripe(sh2); + continue; + } + memcpy(page_address(sh2->dev[dd_idx].page), + page_address(sh->dev[i].page), + STRIPE_SIZE); + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); + for (j=0; jraid_disks; j++) + if (j != sh2->pd_idx && + !test_bit(R5_Expanded, &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + release_stripe(sh2); + } + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -1509,7 +1586,7 @@ static void handle_stripe(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (syncing) + if (syncing || expanding || expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -1757,12 +1834,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; - int sectors_per_chunk = conf->chunk_size >> 9; - sector_t x; - unsigned long stripe; - int chunk_offset; - int dd_idx, pd_idx; - sector_t first_sector; + int pd_idx; + sector_t first_sector, last_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; sector_t max_sector = mddev->size << 1; @@ -1781,6 +1854,80 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } + + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + /* reshaping is quite different to recovery/resync so it is + * handled quite separately ... here. + * + * On each call to sync_request, we gather one chunk worth of + * destination stripes and flag them as expanding. + * Then we find all the source stripes and request reads. + * As the reads complete, handle_stripe will copy the data + * into the destination stripe and release that stripe. + */ + int i; + int dd_idx; + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { + int j; + int skipped = 0; + pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks); + sh = get_active_stripe(conf, sector_nr+i, + conf->raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPANDING, &sh->state); + /* If any of this stripe is beyond the end of the old + * array, then we need to zero those blocks + */ + for (j=sh->disks; j--;) { + sector_t s; + if (j == sh->pd_idx) + continue; + s = compute_blocknr(sh, j); + if (s < (mddev->array_size<<1)) { + skipped = 1; + continue; + } + memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + set_bit(R5_Expanded, &sh->dev[j].flags); + set_bit(R5_UPTODATE, &sh->dev[j].flags); + } + if (!skipped) { + set_bit(STRIPE_EXPAND_READY, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + } + release_stripe(sh); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); + spin_unlock_irq(&conf->device_lock); + /* Ok, those stripe are ready. We can start scheduling + * reads on the source stripes. + * The source stripes are determined by mapping the first and last + * block on the destination stripes. + */ + raid_disks = conf->previous_raid_disks; + data_disks = raid_disks - 1; + first_sector = + raid5_compute_sector(sector_nr*(conf->raid_disks-1), + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + last_sector = + raid5_compute_sector((sector_nr+conf->chunk_size/512) + *(conf->raid_disks-1) -1, + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + if (last_sector >= (mddev->size<<1)) + last_sector = (mddev->size<<1)-1; + while (first_sector <= last_sector) { + pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); + sh = get_active_stripe(conf, first_sector, + conf->previous_raid_disks, pd_idx, 0); + set_bit(STRIPE_EXPAND_SOURCE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + first_sector += STRIPE_SECTORS; + } + return conf->chunk_size>>9; + } /* if there is 1 or more failed drives and we are trying * to resync, then assert that we are finished, because there is * nothing we can do. @@ -1799,13 +1946,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - x = sector_nr; - chunk_offset = sector_div(x, sectors_per_chunk); - stripe = x; - BUG_ON(x != stripe); - - first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk - + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf); + pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks); sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1); if (sh == NULL) { sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 617b9506c760..4e26ef2cacca 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -157,6 +157,9 @@ struct mddev_s * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 @@ -166,6 +169,7 @@ struct mddev_s #define MD_RECOVERY_NEEDED 5 #define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 unsigned long recovery; int in_sync; /* know to not need resync */ diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 6fa274aea2a0..55c738d50508 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -157,6 +157,7 @@ struct stripe_head { #define R5_ReadError 8 /* seen a read error here recently */ #define R5_ReWrite 9 /* have tried to over-write the readerror */ +#define R5_Expanded 10 /* This block now has post-expand data */ /* * Write method */ @@ -176,7 +177,8 @@ struct stripe_head { #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 #define STRIPE_EXPANDING 9 - +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 /* * Plugging: * -- cgit v1.2.3 From 292695531ae4019bb15deedc121b218d1908b648 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:10 -0800 Subject: [PATCH] md: Final stages of raid5 expand code This patch adds raid5_reshape and end_reshape which will start and finish the reshape processes. raid5_reshape is only enabled in CONFIG_MD_RAID5_RESHAPE is set, to discourage accidental use. Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry. and Make sure that you have backups, just in case. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/Kconfig | 26 ++++++++++ drivers/md/md.c | 6 ++- drivers/md/raid5.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/raid/md.h | 3 +- 4 files changed, 154 insertions(+), 4 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ac43f98062fd..fd2aae150ccc 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -127,6 +127,32 @@ config MD_RAID5 If unsure, say Y. +config MD_RAID5_RESHAPE + bool "Support adding drives to a raid-5 array (experimental)" + depends on MD_RAID5 && EXPERIMENTAL + ---help--- + A RAID-5 set can be expanded by adding extra drives. This + requires "restriping" the array which means (almost) every + block must be written to a different place. + + This option allows such restriping to be done while the array + is online. However it is still EXPERIMENTAL code. It should + work, but please be sure that you have backups. + + You will need a version of mdadm newer than 2.3.1. During the + early stage of reshape there is a critical section where live data + is being over-written. A crash during this time needs extra care + for recovery. The newer mdadm takes a copy of the data in the + critical section and will restore it, if necessary, after a crash. + + The mdadm usage is e.g. + mdadm --grow /dev/md1 --raid-disks=6 + to grow '/dev/md1' to having 6 disks. + + Note: The array can only be expanded, not contracted. + There should be enough spares already present to make the new + array workable. + config MD_RAID6 tristate "RAID-6 mode" depends on BLK_DEV_MD diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e65986bc63f..d169bc964676 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -158,11 +158,12 @@ static int start_readonly; */ static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); static atomic_t md_event_count; -static void md_new_event(mddev_t *mddev) +void md_new_event(mddev_t *mddev) { atomic_inc(&md_event_count); wake_up(&md_event_waiters); } +EXPORT_SYMBOL_GPL(md_new_event); /* * Enables to iterate over all existing md arrays @@ -4467,7 +4468,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -static void md_do_sync(mddev_t *mddev) +void md_do_sync(mddev_t *mddev) { mddev_t *mddev2; unsigned int currspeed = 0, @@ -4704,6 +4705,7 @@ static void md_do_sync(mddev_t *mddev) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); } +EXPORT_SYMBOL_GPL(md_do_sync); /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 56cba8d3e398..b29135acb1d9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -331,6 +331,8 @@ static int grow_stripes(raid5_conf_t *conf, int num) } return 0; } + +#ifdef CONFIG_MD_RAID5_RESHAPE static int resize_stripes(raid5_conf_t *conf, int newsize) { /* Make all the stripes able to hold 'newsize' devices. @@ -451,7 +453,7 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) conf->pool_size = newsize; return err; } - +#endif static int drop_one_stripe(raid5_conf_t *conf) { @@ -1034,6 +1036,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in return 0; } +static void end_reshape(raid5_conf_t *conf); + static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) { int sectors_per_chunk = conf->chunk_size >> 9; @@ -1844,6 +1848,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i if (sector_nr >= max_sector) { /* just being told to finish up .. nothing much to do */ unplug_slaves(mddev); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { + end_reshape(conf); + return 0; + } if (mddev->curr_resync < max_sector) /* aborted */ bitmap_end_sync(mddev->bitmap, mddev->curr_resync, @@ -2464,6 +2472,116 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +#ifdef CONFIG_MD_RAID5_RESHAPE +static int raid5_reshape(mddev_t *mddev, int raid_disks) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + int err; + mdk_rdev_t *rdev; + struct list_head *rtmp; + int spares = 0; + int added_devices = 0; + + if (mddev->degraded || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + if (conf->raid_disks > raid_disks) + return -EINVAL; /* Cannot shrink array yet */ + if (conf->raid_disks == raid_disks) + return 0; /* nothing to do */ + + /* Can only proceed if there are plenty of stripe_heads. + * We need a minimum of one full stripe,, and for sensible progress + * it is best to have about 4 times that. + * If we require 4 times, then the default 256 4K stripe_heads will + * allow for chunk sizes up to 256K, which is probably OK. + * If the chunk size is greater, user-space should request more + * stripe_heads first. + */ + if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { + printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", + (mddev->chunk_size / STRIPE_SIZE)*4); + return -ENOSPC; + } + + ITERATE_RDEV(mddev, rdev, rtmp) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) + spares++; + if (conf->raid_disks + spares < raid_disks-1) + /* Not enough devices even to make a degraded array + * of that size + */ + return -EINVAL; + + err = resize_stripes(conf, raid_disks); + if (err) + return err; + + spin_lock_irq(&conf->device_lock); + conf->previous_raid_disks = conf->raid_disks; + mddev->raid_disks = conf->raid_disks = raid_disks; + conf->expand_progress = 0; + spin_unlock_irq(&conf->device_lock); + + /* Add some new drives, as many as will fit. + * We know there are enough to make the newly sized array work. + */ + ITERATE_RDEV(mddev, rdev, rtmp) + if (rdev->raid_disk < 0 && + !test_bit(Faulty, &rdev->flags)) { + if (raid5_add_disk(mddev, rdev)) { + char nm[20]; + set_bit(In_sync, &rdev->flags); + conf->working_disks++; + added_devices++; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } else + break; + } + + mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + if (!mddev->sync_thread) { + mddev->recovery = 0; + spin_lock_irq(&conf->device_lock); + mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + return -EAGAIN; + } + md_wakeup_thread(mddev->sync_thread); + md_new_event(mddev); + return 0; +} +#endif + +static void end_reshape(raid5_conf_t *conf) +{ + struct block_device *bdev; + + conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); +} + static void raid5_quiesce(mddev_t *mddev, int state) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -2502,6 +2620,9 @@ static struct mdk_personality raid5_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .reshape = raid5_reshape, +#endif .quiesce = raid5_quiesce, }; diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b6e0bcad84e1..9c77cde5a795 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, extern void md_super_wait(mddev_t *mddev); extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); - +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } -- cgit v1.2.3 From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:11 -0800 Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape We allow the superblock to record an 'old' and a 'new' geometry, and a position where any conversion is up to. The geometry allows for changing chunksize, layout and level as well as number of devices. When using verion-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse the assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 69 +++++++++++++++++++++- drivers/md/raid1.c | 5 ++ drivers/md/raid5.c | 140 ++++++++++++++++++++++++++++++++++++++------- include/linux/raid/md.h | 2 + include/linux/raid/md_k.h | 8 +++ include/linux/raid/md_p.h | 32 ++++++++++- include/linux/raid/raid5.h | 1 + 7 files changed, 231 insertions(+), 26 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index d169bc964676..b9dfdfccdb78 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -662,7 +662,8 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version } if (sb->major_version != 0 || - sb->minor_version != 90) { + sb->minor_version < 90 || + sb->minor_version > 91) { printk(KERN_WARNING "Bad version number %d.%d on %s\n", sb->major_version, sb->minor_version, b); @@ -747,6 +748,20 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->bitmap_offset = 0; mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + if (mddev->minor_version >= 91) { + mddev->reshape_position = sb->reshape_position; + mddev->delta_disks = sb->delta_disks; + mddev->new_level = sb->new_level; + mddev->new_layout = sb->new_layout; + mddev->new_chunk = sb->new_chunk; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + } + if (sb->state & (1<recovery_cp = MaxSector; else { @@ -841,7 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->md_magic = MD_SB_MAGIC; sb->major_version = mddev->major_version; - sb->minor_version = mddev->minor_version; sb->patch_version = mddev->patch_version; sb->gvalid_words = 0; /* ignored */ memcpy(&sb->set_uuid0, mddev->uuid+0, 4); @@ -860,6 +874,17 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->events_hi = (mddev->events>>32); sb->events_lo = (u32)mddev->events; + if (mddev->reshape_position == MaxSector) + sb->minor_version = 90; + else { + sb->minor_version = 91; + sb->reshape_position = mddev->reshape_position; + sb->new_level = mddev->new_level; + sb->delta_disks = mddev->delta_disks; + sb->new_layout = mddev->new_layout; + sb->new_chunk = mddev->new_chunk; + } + mddev->minor_version = sb->minor_version; if (mddev->in_sync) { sb->recovery_cp = mddev->recovery_cp; @@ -1104,6 +1129,20 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + mddev->reshape_position = le64_to_cpu(sb->reshape_position); + mddev->delta_disks = le32_to_cpu(sb->delta_disks); + mddev->new_level = le32_to_cpu(sb->new_level); + mddev->new_layout = le32_to_cpu(sb->new_layout); + mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9; + } else { + mddev->reshape_position = MaxSector; + mddev->delta_disks = 0; + mddev->new_level = mddev->level; + mddev->new_layout = mddev->layout; + mddev->new_chunk = mddev->chunk_size; + } + } else if (mddev->pers == NULL) { /* Insist of good event counter while assembling */ __u64 ev1 = le64_to_cpu(sb->events); @@ -1175,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } + if (mddev->reshape_position != MaxSector) { + sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->reshape_position = cpu_to_le64(mddev->reshape_position); + sb->new_layout = cpu_to_le32(mddev->new_layout); + sb->delta_disks = cpu_to_le32(mddev->delta_disks); + sb->new_level = cpu_to_le32(mddev->new_level); + sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9); + } max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) @@ -1497,7 +1544,7 @@ static void sync_sbs(mddev_t * mddev) } } -static void md_update_sb(mddev_t * mddev) +void md_update_sb(mddev_t * mddev) { int err; struct list_head *tmp; @@ -1574,6 +1621,7 @@ repeat: wake_up(&mddev->sb_wait); } +EXPORT_SYMBOL_GPL(md_update_sb); /* words written to sysfs files may, or my not, be \n terminated. * We want to accept with case. For this we use cmd_match. @@ -2545,6 +2593,14 @@ static int do_md_run(mddev_t * mddev) mddev->level = pers->level; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + if (mddev->reshape_position != MaxSector && + pers->reshape == NULL) { + /* This personality cannot handle reshaping... */ + mddev->pers = NULL; + module_put(pers->owner); + return -EINVAL; + } + mddev->recovery = 0; mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->barriers_work = 1; @@ -3433,11 +3489,18 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->default_bitmap_offset = MD_SB_BYTES >> 9; mddev->bitmap_offset = 0; + mddev->reshape_position = MaxSector; + /* * Generate a 128 bit UUID */ get_random_bytes(mddev->uuid, 16); + mddev->new_level = mddev->level; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->delta_disks = 0; + return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5d88329e3c7a..b65b8cfbdf30 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1789,6 +1789,11 @@ static int run(mddev_t *mddev) mdname(mddev), mddev->level); goto out; } + if (mddev->reshape_position != MaxSector) { + printk("raid1: %s: reshape_position set but not supported\n", + mdname(mddev)); + goto out; + } /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b29135acb1d9..20ae32d67e21 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPANDING, &sh->state); } else if (expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } @@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ int i; int dd_idx; + + if (sector_nr == 0 && + conf->expand_progress != 0) { + /* restarting in the middle, skip the initial sectors */ + sector_nr = conf->expand_progress; + sector_div(sector_nr, conf->raid_disks-1); + *skipped = 1; + return sector_nr; + } + + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; @@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sh = get_active_stripe(conf, sector_nr+i, conf->raid_disks, pd_idx, 0); set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old * array, then we need to zero those blocks */ @@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev) return -EIO; } + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + + if (mddev->new_level != mddev->level || + mddev->new_layout != mddev->layout || + mddev->new_chunk != mddev->chunk_size) { + printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->delta_disks <= 0) { + printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { + printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); + /* here_old is the first stripe that we might need to read from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } + + mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), + if (mddev->reshape_position == MaxSector) { + conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; + } else { + conf->raid_disks = mddev->raid_disks; + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + } + + conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); if (!conf->disks) goto abort; @@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev) ITERATE_RDEV(mddev,rdev,tmp) { raid_disk = rdev->raid_disk; - if (raid_disk >= mddev->raid_disks + if (raid_disk >= conf->raid_disks || raid_disk < 0) continue; disk = conf->disks + raid_disk; @@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev) } } - conf->raid_disks = mddev->raid_disks; /* * 0 for a fully functional array, 1 for a degraded array. */ @@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev) conf->level = mddev->level; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = MaxSector; + conf->expand_progress = mddev->reshape_position; /* device size must be a multiple of chunk size */ mddev->size &= ~(mddev->chunk_size/1024 -1); @@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev) print_raid5_conf(conf); + if (conf->expand_progress != MaxSector) { + printk("...ok start reshape thread\n"); + atomic_set(&conf->reshape_stripes, 0); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + /* FIXME if md_register_thread fails?? */ + md_wakeup_thread(mddev->sync_thread); + + } + /* read-ahead size must cover two whole stripes, which is * 2 * (n-1) * chunksize where 'n' is the number of raid devices */ @@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); - mddev->array_size = mddev->size * (mddev->raid_disks - 1); return 0; abort: if (conf) { @@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) /* * find the disk ... */ - for (disk=0; disk < mddev->raid_disks; disk++) + for (disk=0; disk < conf->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; @@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) if (err) return err; + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - mddev->raid_disks = conf->raid_disks = raid_disks; + conf->raid_disks = raid_disks; conf->expand_progress = 0; spin_unlock_irq(&conf->device_lock); @@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) } mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + mddev->raid_disks = raid_disks; + mddev->delta_disks = raid_disks - conf->previous_raid_disks; + mddev->reshape_position = 0; + mddev->sb_dirty = 1; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->delta_disks = 0; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; @@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf) { struct block_device *bdev; - conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); + if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + conf->mddev->reshape_position = MaxSector; } - spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; - spin_unlock_irq(&conf->device_lock); } static void raid5_quiesce(mddev_t *mddev, int state) diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 9c77cde5a795..66b44e5e0d6e 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); +extern void md_update_sb(mddev_t * mddev); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4e26ef2cacca..1a6f9f2f6282 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -132,6 +132,14 @@ struct mddev_s char uuid[16]; + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* blocks scheduled */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index c100fa5d4bfa..774e1acfb8c4 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_ERRORS 1 #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +/* + * Notes: + * - if an array is being reshaped (restriped) in order to change the + * the number of active devices in the array, 'raid_disks' will be + * the larger of the old and new numbers. 'delta_disks' will + * be the "new - old". So if +ve, raid_disks is the new value, and + * "raid_disks-delta_disks" is the old. If -ve, raid_disks is the + * old value and "raid_disks+delta_disks" is the new (smaller) value. + */ + + typedef struct mdp_superblock_s { /* * Constant generic information @@ -146,7 +158,13 @@ typedef struct mdp_superblock_s { __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ #endif __u32 recovery_cp; /* 11 recovery checkpoint sector count */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12]; + /* There are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; /* * Personality information @@ -207,7 +225,14 @@ struct mdp_superblock_1 { * NOTE: signed, so bitmap can be before superblock * only meaningful of feature_map[0] is set. */ - __u8 pad1[128-100]; /* set to 0 when written */ + + /* These are only valid with feature bit '4' */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 new_level; /* new level we are reshaping to */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (bytes) */ + __u8 pad1[128-124]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ __u64 data_offset; /* sector start of data, often 0 */ @@ -240,8 +265,9 @@ struct mdp_superblock_1 { /* feature_map bits */ #define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RESHAPE_ACTIVE 4 -#define MD_FEATURE_ALL 1 +#define MD_FEATURE_ALL 5 #endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 55c738d50508..abcdf0d0658a 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -224,6 +224,7 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. */ -- cgit v1.2.3 From 63c70c4f3a30e77e6f445bd16eff7934a031ebd3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:13 -0800 Subject: [PATCH] md: Split reshape handler in check_reshape and start_reshape check_reshape checks validity and does things that can be done instantly - like adding devices to raid1. start_reshape initiates a restriping process to convert the whole array. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 10 ++++---- drivers/md/raid1.c | 19 ++++++++++++--- drivers/md/raid5.c | 60 +++++++++++++++++++++++++++-------------------- include/linux/raid/md_k.h | 3 ++- 4 files changed, 58 insertions(+), 34 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index b9dfdfccdb78..1a637676a930 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2594,7 +2594,7 @@ static int do_md_run(mddev_t * mddev) strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && - pers->reshape == NULL) { + pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ mddev->pers = NULL; module_put(pers->owner); @@ -3556,14 +3556,16 @@ static int update_raid_disks(mddev_t *mddev, int raid_disks) { int rv; /* change the number of raid disks */ - if (mddev->pers->reshape == NULL) + if (mddev->pers->check_reshape == NULL) return -EINVAL; if (raid_disks <= 0 || raid_disks >= mddev->max_disks) return -EINVAL; - if (mddev->sync_thread) + if (mddev->sync_thread || mddev->reshape_position != MaxSector) return -EBUSY; - rv = mddev->pers->reshape(mddev, raid_disks); + mddev->delta_disks = raid_disks - mddev->raid_disks; + + rv = mddev->pers->check_reshape(mddev); return rv; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b65b8cfbdf30..04418e10d1dc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1976,7 +1976,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) return 0; } -static int raid1_reshape(mddev_t *mddev, int raid_disks) +static int raid1_reshape(mddev_t *mddev) { /* We need to: * 1/ resize the r1bio_pool @@ -1993,10 +1993,22 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) struct pool_info *newpoolinfo; mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); - int cnt; + int cnt, raid_disks; int d, d2; + /* Cannot change chunk_size, layout, or level */ + if (mddev->chunk_size != mddev->new_chunk || + mddev->layout != mddev->new_layout || + mddev->level != mddev->new_level) { + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + return -EINVAL; + } + + raid_disks = mddev->raid_disks + mddev->delta_disks; + if (raid_disks < conf->raid_disks) { cnt=0; for (d= 0; d < conf->raid_disks; d++) @@ -2043,6 +2055,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) mddev->degraded += (raid_disks - conf->raid_disks); conf->raid_disks = mddev->raid_disks = raid_disks; + mddev->delta_disks = 0; conf->last_used = 0; /* just make sure it is in-range */ lower_barrier(conf); @@ -2084,7 +2097,7 @@ static struct mdk_personality raid1_personality = .spare_active = raid1_spare_active, .sync_request = sync_request, .resize = raid1_resize, - .reshape = raid1_reshape, + .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, }; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 089a32604305..355dafb98aac 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2590,21 +2590,15 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) } #ifdef CONFIG_MD_RAID5_RESHAPE -static int raid5_reshape(mddev_t *mddev, int raid_disks) +static int raid5_check_reshape(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); int err; - mdk_rdev_t *rdev; - struct list_head *rtmp; - int spares = 0; - int added_devices = 0; - if (mddev->degraded || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; - if (conf->raid_disks > raid_disks) - return -EINVAL; /* Cannot shrink array yet */ - if (conf->raid_disks == raid_disks) + if (mddev->delta_disks < 0 || + mddev->new_level != mddev->level) + return -EINVAL; /* Cannot shrink array or change level yet */ + if (mddev->delta_disks == 0) return 0; /* nothing to do */ /* Can only proceed if there are plenty of stripe_heads. @@ -2615,30 +2609,48 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) * If the chunk size is greater, user-space should request more * stripe_heads first. */ - if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { + if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes || + (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) { printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", (mddev->chunk_size / STRIPE_SIZE)*4); return -ENOSPC; } + err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks); + if (err) + return err; + + /* looks like we might be able to manage this */ + return 0; +} + +static int raid5_start_reshape(mddev_t *mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *rdev; + struct list_head *rtmp; + int spares = 0; + int added_devices = 0; + + if (mddev->degraded || + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return -EBUSY; + ITERATE_RDEV(mddev, rdev, rtmp) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) spares++; - if (conf->raid_disks + spares < raid_disks-1) + + if (spares < mddev->delta_disks-1) /* Not enough devices even to make a degraded array * of that size */ return -EINVAL; - err = resize_stripes(conf, raid_disks); - if (err) - return err; - atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - conf->raid_disks = raid_disks; + conf->raid_disks += mddev->delta_disks; conf->expand_progress = 0; conf->expand_lo = 0; spin_unlock_irq(&conf->device_lock); @@ -2660,12 +2672,8 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) break; } - mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; - mddev->new_chunk = mddev->chunk_size; - mddev->new_layout = mddev->layout; - mddev->new_level = mddev->level; - mddev->raid_disks = raid_disks; - mddev->delta_disks = raid_disks - conf->previous_raid_disks; + mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) - added_devices; + mddev->raid_disks = conf->raid_disks; mddev->reshape_position = 0; mddev->sb_dirty = 1; @@ -2679,7 +2687,6 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; - mddev->delta_disks = 0; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; @@ -2752,7 +2759,8 @@ static struct mdk_personality raid5_personality = .sync_request = sync_request, .resize = raid5_resize, #ifdef CONFIG_MD_RAID5_RESHAPE - .reshape = raid5_reshape, + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, #endif .quiesce = raid5_quiesce, }; diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 1a6f9f2f6282..002ee631fabb 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -261,7 +261,8 @@ struct mdk_personality int (*spare_active) (mddev_t *mddev); sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (mddev_t *mddev, sector_t sectors); - int (*reshape) (mddev_t *mddev, int raid_disks); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); /* quiesce moves between quiescence states * 0 - fully active -- cgit v1.2.3 From 16484bf59634e25d1299761e5ed8bacf22bc6368 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:13 -0800 Subject: [PATCH] md: Make 'reshape' a possible sync_action action This allows reshape to be triggerred via sysfs (which is the only way to start it happening). Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1a637676a930..a79dd33d343d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2247,7 +2247,14 @@ action_store(mddev_t *mddev, const char *page, size_t len) return -EBUSY; else if (cmd_match(page, "resync") || cmd_match(page, "recover")) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - else { + else if (cmd_match(page, "reshape")) { + int err; + if (mddev->pers->start_reshape == NULL) + return -EINVAL; + err = mddev->pers->start_reshape(mddev); + if (err) + return err; + } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); else if (cmd_match(page, "repair")) -- cgit v1.2.3 From e464eafdb4400c6d6576ba3840d8bd40340f8a96 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:14 -0800 Subject: [PATCH] md: Support suspending of IO to regions of an md array This allows user-space to access data safely. This is needed for raid5 reshape as user-space needs to take a backup of the first few stripes before allowing reshape to commence. It will also be useful in cluster-aware raid1 configurations so that all cluster members can leave a section of the array untouched while a resync/recovery happens. A 'start' and 'end' of the suspended range are written to 2 sysfs attributes. Note that only one range can be suspended at a time. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/raid5.c | 14 +++++++++++ include/linux/raid/md_k.h | 4 ++++ 3 files changed, 77 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index a79dd33d343d..92fd0104fa04 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2365,6 +2365,63 @@ sync_completed_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); +static ssize_t +suspend_lo_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); +} + +static ssize_t +suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + if (new >= mddev->suspend_hi || + (new > mddev->suspend_lo && new < mddev->suspend_hi)) { + mddev->suspend_lo = new; + mddev->pers->quiesce(mddev, 2); + return len; + } else + return -EINVAL; +} +static struct md_sysfs_entry md_suspend_lo = +__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); + + +static ssize_t +suspend_hi_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); +} + +static ssize_t +suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long long new = simple_strtoull(buf, &e, 10); + + if (mddev->pers->quiesce == NULL) + return -EINVAL; + if (buf == e || (*e && *e != '\n')) + return -EINVAL; + if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || + (new > mddev->suspend_lo && new > mddev->suspend_hi)) { + mddev->suspend_hi = new; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + return len; + } else + return -EINVAL; +} +static struct md_sysfs_entry md_suspend_hi = +__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); + + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, @@ -2382,6 +2439,8 @@ static struct attribute *md_redundancy_attrs[] = { &md_sync_max.attr, &md_sync_speed.attr, &md_sync_completed.attr, + &md_suspend_lo.attr, + &md_suspend_hi.attr, NULL, }; static struct attribute_group md_redundancy_group = { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 355dafb98aac..bb16ac231a40 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1805,6 +1805,15 @@ static int make_request(request_queue_t *q, struct bio * bi) goto retry; } } + /* FIXME what if we get a false positive because these + * are being updated. + */ + if (logical_sector >= mddev->suspend_lo && + logical_sector < mddev->suspend_hi) { + release_stripe(sh); + schedule(); + goto retry; + } if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { @@ -2725,6 +2734,10 @@ static void raid5_quiesce(mddev_t *mddev, int state) raid5_conf_t *conf = mddev_to_conf(mddev); switch(state) { + case 2: /* resume for a suspend */ + wake_up(&conf->wait_for_overlap); + break; + case 1: /* stop all writes */ spin_lock_irq(&conf->device_lock); conf->quiesce = 1; @@ -2738,6 +2751,7 @@ static void raid5_quiesce(mddev_t *mddev, int state) spin_lock_irq(&conf->device_lock); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); + wake_up(&conf->wait_for_overlap); spin_unlock_irq(&conf->device_lock); break; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 002ee631fabb..c0d3097846a7 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -151,6 +151,10 @@ struct mddev_s sector_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; -- cgit v1.2.3 From 8ddeeae51f2f197b4fafcba117ee8191b49d843e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:18 -0800 Subject: [PATCH] md: Fix md grow/size code to correctly find the maximum available space An md array can be asked to change the amount of each device that it is using, and in particular can be asked to use the maximum available space. This currently only works if the first device is not larger than the rest. As 'size' gets changed and so 'fit' becomes wrong. So check if a 'fit' is required early and don't corrupt it. Signed-off-by: Doug Ledford Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 92fd0104fa04..147efcb1e8ca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3575,6 +3575,7 @@ static int update_size(mddev_t *mddev, unsigned long size) mdk_rdev_t * rdev; int rv; struct list_head *tmp; + int fit = (size == 0); if (mddev->pers->resize == NULL) return -EINVAL; @@ -3592,7 +3593,6 @@ static int update_size(mddev_t *mddev, unsigned long size) return -EBUSY; ITERATE_RDEV(mddev,rdev,tmp) { sector_t avail; - int fit = (size == 0); if (rdev->sb_offset > rdev->data_offset) avail = (rdev->sb_offset*2) - rdev->data_offset; else -- cgit v1.2.3 From 48c9c27b8bcd2a328a06151e2d5c1170db0b701b Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 27 Mar 2006 01:18:20 -0800 Subject: [PATCH] sem2mutex: drivers/md Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/dm-table.c | 11 ++++++----- drivers/md/dm.c | 19 ++++++++++--------- drivers/md/kcopyd.c | 11 ++++++----- drivers/md/md.c | 11 ++++++----- 4 files changed, 28 insertions(+), 24 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 76610a6ac01c..8f56a54cf0ce 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #define MAX_DEPTH 16 @@ -770,14 +771,14 @@ int dm_table_complete(struct dm_table *t) return r; } -static DECLARE_MUTEX(_event_lock); +static DEFINE_MUTEX(_event_lock); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context) { - down(&_event_lock); + mutex_lock(&_event_lock); t->event_fn = fn; t->event_context = context; - up(&_event_lock); + mutex_unlock(&_event_lock); } void dm_table_event(struct dm_table *t) @@ -788,10 +789,10 @@ void dm_table_event(struct dm_table *t) */ BUG_ON(in_interrupt()); - down(&_event_lock); + mutex_lock(&_event_lock); if (t->event_fn) t->event_fn(t->event_context); - up(&_event_lock); + mutex_unlock(&_event_lock); } sector_t dm_table_get_size(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 973e63d530ae..4d710b7a133b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -743,14 +744,14 @@ static int dm_any_congested(void *congested_data, int bdi_bits) /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ -static DECLARE_MUTEX(_minor_lock); +static DEFINE_MUTEX(_minor_lock); static DEFINE_IDR(_minor_idr); static void free_minor(unsigned int minor) { - down(&_minor_lock); + mutex_lock(&_minor_lock); idr_remove(&_minor_idr, minor); - up(&_minor_lock); + mutex_unlock(&_minor_lock); } /* @@ -763,7 +764,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor) if (minor >= (1 << MINORBITS)) return -EINVAL; - down(&_minor_lock); + mutex_lock(&_minor_lock); if (idr_find(&_minor_idr, minor)) { r = -EBUSY; @@ -788,7 +789,7 @@ static int specific_minor(struct mapped_device *md, unsigned int minor) } out: - up(&_minor_lock); + mutex_unlock(&_minor_lock); return r; } @@ -797,7 +798,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor) int r; unsigned int m; - down(&_minor_lock); + mutex_lock(&_minor_lock); r = idr_pre_get(&_minor_idr, GFP_KERNEL); if (!r) { @@ -819,7 +820,7 @@ static int next_free_minor(struct mapped_device *md, unsigned int *minor) *minor = m; out: - up(&_minor_lock); + mutex_unlock(&_minor_lock); return r; } @@ -1014,13 +1015,13 @@ static struct mapped_device *dm_find_md(dev_t dev) if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) return NULL; - down(&_minor_lock); + mutex_lock(&_minor_lock); md = idr_find(&_minor_idr, minor); if (!md || (dm_disk(md)->first_minor != minor)) md = NULL; - up(&_minor_lock); + mutex_unlock(&_minor_lock); return md; } diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c index ed71f3f94620..72480a48d88b 100644 --- a/drivers/md/kcopyd.c +++ b/drivers/md/kcopyd.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "kcopyd.h" @@ -581,21 +582,21 @@ int kcopyd_cancel(struct kcopyd_job *job, int block) /*----------------------------------------------------------------- * Unit setup *---------------------------------------------------------------*/ -static DECLARE_MUTEX(_client_lock); +static DEFINE_MUTEX(_client_lock); static LIST_HEAD(_clients); static void client_add(struct kcopyd_client *kc) { - down(&_client_lock); + mutex_lock(&_client_lock); list_add(&kc->list, &_clients); - up(&_client_lock); + mutex_unlock(&_client_lock); } static void client_del(struct kcopyd_client *kc) { - down(&_client_lock); + mutex_lock(&_client_lock); list_del(&kc->list); - up(&_client_lock); + mutex_unlock(&_client_lock); } static DEFINE_MUTEX(kcopyd_init_lock); diff --git a/drivers/md/md.c b/drivers/md/md.c index 147efcb1e8ca..c9c9c096ad80 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -43,6 +43,7 @@ #include /* for invalidate_bdev */ #include #include +#include #include @@ -2500,7 +2501,7 @@ int mdp_major = 0; static struct kobject *md_probe(dev_t dev, int *part, void *data) { - static DECLARE_MUTEX(disks_sem); + static DEFINE_MUTEX(disks_mutex); mddev_t *mddev = mddev_find(dev); struct gendisk *disk; int partitioned = (MAJOR(dev) != MD_MAJOR); @@ -2510,15 +2511,15 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) if (!mddev) return NULL; - down(&disks_sem); + mutex_lock(&disks_mutex); if (mddev->gendisk) { - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev_put(mddev); return NULL; } disk = alloc_disk(1 << shift); if (!disk) { - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev_put(mddev); return NULL; } @@ -2536,7 +2537,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) disk->queue = mddev->queue; add_disk(disk); mddev->gendisk = disk; - up(&disks_sem); + mutex_unlock(&disks_mutex); mddev->kobj.parent = &disk->kobj; mddev->kobj.k_name = NULL; snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); -- cgit v1.2.3 From df5b89b323b922f56650b4b4d7c41899b937cf19 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:20 -0800 Subject: [PATCH] md: Convert reconfig_sem to reconfig_mutex ... being careful that mutex_trylock is inverted wrt down_trylock Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 +++++++------- include/linux/raid/md_k.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index c9c9c096ad80..039e071c1007 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -255,7 +255,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; - init_MUTEX(&new->reconfig_sem); + mutex_init(&new->reconfig_mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); init_timer(&new->safemode_timer); @@ -277,22 +277,22 @@ static mddev_t * mddev_find(dev_t unit) static inline int mddev_lock(mddev_t * mddev) { - return down_interruptible(&mddev->reconfig_sem); + return mutex_lock_interruptible(&mddev->reconfig_mutex); } static inline void mddev_lock_uninterruptible(mddev_t * mddev) { - down(&mddev->reconfig_sem); + mutex_lock(&mddev->reconfig_mutex); } static inline int mddev_trylock(mddev_t * mddev) { - return down_trylock(&mddev->reconfig_sem); + return mutex_trylock(&mddev->reconfig_mutex); } static inline void mddev_unlock(mddev_t * mddev) { - up(&mddev->reconfig_sem); + mutex_unlock(&mddev->reconfig_mutex); md_wakeup_thread(mddev->thread); } @@ -4893,7 +4893,7 @@ void md_check_recovery(mddev_t *mddev) )) return; - if (mddev_trylock(mddev)==0) { + if (mddev_trylock(mddev)) { int spares =0; spin_lock_irq(&mddev->write_lock); @@ -5029,7 +5029,7 @@ static int md_notify_reboot(struct notifier_block *this, printk(KERN_INFO "md: stopping all md devices.\n"); ITERATE_MDDEV(mddev,tmp) - if (mddev_trylock(mddev)==0) + if (mddev_trylock(mddev)) do_md_stop (mddev, 1); /* * certain more exotic SCSI devices are known to be diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index c0d3097846a7..e2df61f5b09a 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -185,7 +185,7 @@ struct mddev_s unsigned long recovery; int in_sync; /* know to not need resync */ - struct semaphore reconfig_sem; + struct mutex reconfig_mutex; atomic_t active; int changed; /* true if we might need to reread partition info */ -- cgit v1.2.3