From c40f341f1e7fd4eddcfc5881d94cfa8669071ee6 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 08:14:42 +1000 Subject: md-cluster: Use a small window for resync Suspending the entire device for resync could take too long. Resync in small chunks. cluster's resync window (32M) is maintained in r1conf as cluster_sync_low and cluster_sync_high and processed in raid1's sync_request(). If the current resync is outside the cluster resync window: 1. Set the cluster_sync_low to curr_resync_completed. 2. Check if the sync will fit in the new window, if not issue a wait_barrier() and set cluster_sync_low to sector_nr. 3. Set cluster_sync_high to cluster_sync_low + resync_window. 4. Send a message to all nodes so they may add it in their suspension list. bitmap_cond_end_sync is modified to allow to force a sync inorder to get the curr_resync_completed uptodate with the sector passed. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/raid1.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 049df6c4a8cc..1dd13bb52940 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) @@ -2488,6 +2490,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_close_sync(mddev->bitmap); close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* Send zeros to mark end of resync */ + md_cluster_ops->resync_info_update(mddev, 0, 0); + } return 0; } @@ -2508,7 +2517,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp return sync_blocks; } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS + */ + + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf, sector_nr); @@ -2699,6 +2713,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio_full: r1_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* For a user-requested sync, we read all readable devices and do a * compare */ -- cgit v1.2.3 From 70bcecdb1534a7dcd82503b705c27a048d568c9d Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 21 Aug 2015 10:33:39 -0500 Subject: md-cluster: Improve md_reload_sb to be less error prone md_reload_sb is too simplistic and it explicitly needs to determine the changes made by the writing node. However, there are multiple areas where a simple reload could fail. Instead, read the superblock of one of the "good" rdevs and update the necessary information: - read the superblock into a newly allocated page, by temporarily swapping out rdev->sb_page and calling ->load_super. - if that fails return - if it succeeds, call check_sb_changes 1. iterates over list of active devices and checks the matching dev_roles[] value. If that is 'faulty', the device must be marked as faulty - call md_error to mark the device as faulty. Make sure not to set CHANGE_DEVS and wakeup mddev->thread or else it would initiate a resync process, which is the responsibility of the "primary" node. - clear the Blocked bit - Call remove_and_add_spares() to hot remove the device. If the device is 'spare': - call remove_and_add_spares() to get the number of spares added in this operation. - Reduce mddev->degraded to mark the array as not degraded. 2. reset recovery_cp - read the rest of the rdevs to update recovery_offset. If recovery_offset is equal to MaxSector, call spare_active() to set it In_sync This required that recovery_offset be initialized to MaxSector, as opposed to zero so as to communicate the end of sync for a rdev. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 27 ++++++----- drivers/md/md.c | 121 ++++++++++++++++++++++++++++++++++++++++++------ drivers/md/md.h | 2 +- drivers/md/raid1.c | 9 ++++ 4 files changed, 133 insertions(+), 26 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 58eadc06a1b6..2eb3a5019a63 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -427,8 +427,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - - md_reload_sb(mddev); + md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); } @@ -834,11 +833,23 @@ static int metadata_update_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; - int ret; + struct md_rdev *rdev; + int ret = 0; memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); - ret = __sendmsg(cinfo, &cmsg); + cmsg.raid_slot = -1; + /* Pick up a good active device number to send. + */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); + break; + } + if (cmsg.raid_slot >= 0) + ret = __sendmsg(cinfo, &cmsg); + else + pr_warn("md-cluster: No good device id found to send\n"); unlock_comm(cinfo); return ret; } @@ -922,15 +933,9 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) static int add_new_disk_finish(struct mddev *mddev) { - struct cluster_msg cmsg; - struct md_cluster_info *cinfo = mddev->cluster_info; - int ret; /* Write sb and inform others */ md_update_sb(mddev, 1); - cmsg.type = METADATA_UPDATED; - ret = __sendmsg(cinfo, &cmsg); - unlock_comm(cinfo); - return ret; + return metadata_update_finish(mddev); } static int new_disk_ack(struct mddev *mddev, bool ack) diff --git a/drivers/md/md.c b/drivers/md/md.c index e21a2feed826..12cc28ab9a41 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8924,25 +8924,118 @@ err_wq: return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + + /* recovery_cp changed */ + if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp) + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; } + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; + } + + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); diff --git a/drivers/md/md.h b/drivers/md/md.h index ab339571e57f..2ea00356bb23 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -658,7 +658,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); -extern void md_reload_sb(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1dd13bb52940..b54fefc85b66 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1592,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; + for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { -- cgit v1.2.3 From c186b128cda5a246da25f474e4689cb2bfacfcac Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 30 Sep 2015 13:20:35 -0500 Subject: md-cluster: Perform resync/recovery under a DLM lock Resync or recovery must be performed by only one node at a time. A DLM lock resource, resync_lockres provides the mutual exclusion so that only one node performs the recovery/resync at a time. If a node is unable to get the resync_lockres, because recovery is being performed by another node, it set MD_RECOVER_NEEDED so as to schedule recovery in the future. Remove the debug message in resync_info_update() used during development. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 29 ++++++++++++++++++++++++++--- drivers/md/md-cluster.h | 2 ++ drivers/md/md.c | 29 +++++++++++++++++++++++++---- drivers/md/raid1.c | 2 -- 4 files changed, 53 insertions(+), 9 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 2eb3a5019a63..e1ce9c9a0473 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -55,6 +55,7 @@ struct md_cluster_info { struct completion completion; struct mutex sb_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; struct md_thread *recovery_thread; @@ -384,6 +385,8 @@ static void process_suspend_info(struct mddev *mddev, if (!hi) { remove_suspend_info(mddev, slot); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return; } s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); @@ -758,6 +761,10 @@ static int join(struct mddev *mddev, int nodes) goto err; } + cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); + if (!cinfo->resync_lockres) + goto err; + ret = gather_all_resync_info(mddev, nodes); if (ret) goto err; @@ -768,6 +775,7 @@ err: lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) dlm_release_lockspace(cinfo->lockspace, 2); @@ -861,6 +869,13 @@ static int metadata_update_cancel(struct mddev *mddev) return dlm_unlock_sync(cinfo->token_lockres); } +static int resync_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; + return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -870,16 +885,22 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); /* Re-acquire the lock to refresh LVB */ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); - pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, - (unsigned long long)lo, - (unsigned long long)hi); cmsg.type = cpu_to_le32(RESYNCING); cmsg.slot = cpu_to_le32(slot); cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); + return sendmsg(cinfo, &cmsg); } +static int resync_finish(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; + dlm_unlock_sync(cinfo->resync_lockres); + return resync_info_update(mddev, 0, 0); +} + static int area_resyncing(struct mddev *mddev, int direction, sector_t lo, sector_t hi) { @@ -995,6 +1016,8 @@ static struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, + .resync_start = resync_start, + .resync_finish = resync_finish, .resync_info_update = resync_info_update, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index f5bdc0c86eaa..c94172673599 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -16,6 +16,8 @@ struct md_cluster_operations { int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_cancel)(struct mddev *mddev); + int (*resync_start)(struct mddev *mddev); + int (*resync_finish)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); int (*add_new_disk_finish)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 5f0967803dc7..61e897def04f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7657,6 +7657,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7959,7 +7960,11 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality that we are finished */ + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { + md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -7997,6 +8002,11 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -8078,14 +8088,25 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b54fefc85b66..a2d813c9eabd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2503,8 +2503,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp if (mddev_is_clustered(mddev)) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; - /* Send zeros to mark end of resync */ - md_cluster_ops->resync_info_update(mddev, 0, 0); } return 0; } -- cgit v1.2.3