md-cluster: release RESYNC lock after the last resync message

All the RESYNC messages are sent with resync lock held, the only exception is resync_finish which releases resync_lockres before send the last resync message, this should be changed as well. Otherwise, we can see deadlock issue as follows: clustermd2-gqjiang2:~ # cat /proc/mdstat Personalities : [raid10] [raid1] md0 : active raid1 sdg[0] sdf[1] 134144 blocks super 1.2 [2/2] [UU] [===================>.] resync = 99.6% (134144/134144) finish=0.0min speed=26K/sec bitmap: 1/1 pages [4KB], 65536KB chunk unused devices: <none> clustermd2-gqjiang2:~ # ps aux|grep md|grep D root 20497 0.0 0.0 0 0 ? D 16:00 0:00 [md0_raid1] clustermd2-gqjiang2:~ # cat /proc/20497/stack [<ffffffffc05ff51e>] dlm_lock_sync+0x8e/0xc0 [md_cluster] [<ffffffffc05ff7e8>] __sendmsg+0x98/0x130 [md_cluster] [<ffffffffc05ff900>] sendmsg+0x20/0x30 [md_cluster] [<ffffffffc05ffc35>] resync_info_update+0xb5/0xc0 [md_cluster] [<ffffffffc0593e84>] md_reap_sync_thread+0x134/0x170 [md_mod] [<ffffffffc059514c>] md_check_recovery+0x28c/0x510 [md_mod] [<ffffffffc060c882>] raid1d+0x42/0x800 [raid1] [<ffffffffc058ab61>] md_thread+0x121/0x150 [md_mod] [<ffffffff9a0a5b3f>] kthread+0xff/0x140 [<ffffffff9a800235>] ret_from_fork+0x35/0x40 [<ffffffffffffffff>] 0xffffffffffffffff clustermd-gqjiang1:~ # ps aux|grep md|grep D root 20531 0.0 0.0 0 0 ? D 16:00 0:00 [md0_raid1] root 20537 0.0 0.0 0 0 ? D 16:00 0:00 [md0_cluster_rec] root 20676 0.0 0.0 0 0 ? D 16:01 0:00 [md0_resync] clustermd-gqjiang1:~ # cat /proc/mdstat Personalities : [raid10] [raid1] md0 : active raid1 sdf[1] sdg[0] 134144 blocks super 1.2 [2/2] [UU] [===================>.] resync = 97.3% (131072/134144) finish=8076.8min speed=0K/sec bitmap: 1/1 pages [4KB], 65536KB chunk unused devices: <none> clustermd-gqjiang1:~ # cat /proc/20531/stack [<ffffffffc080974d>] metadata_update_start+0xcd/0xd0 [md_cluster] [<ffffffffc079c897>] md_update_sb.part.61+0x97/0x820 [md_mod] [<ffffffffc079f15b>] md_check_recovery+0x29b/0x510 [md_mod] [<ffffffffc0816882>] raid1d+0x42/0x800 [raid1] [<ffffffffc0794b61>] md_thread+0x121/0x150 [md_mod] [<ffffffff9e0a5b3f>] kthread+0xff/0x140 [<ffffffff9e800235>] ret_from_fork+0x35/0x40 [<ffffffffffffffff>] 0xffffffffffffffff clustermd-gqjiang1:~ # cat /proc/20537/stack [<ffffffffc0813222>] freeze_array+0xf2/0x140 [raid1] [<ffffffffc080a56e>] recv_daemon+0x41e/0x580 [md_cluster] [<ffffffffc0794b61>] md_thread+0x121/0x150 [md_mod] [<ffffffff9e0a5b3f>] kthread+0xff/0x140 [<ffffffff9e800235>] ret_from_fork+0x35/0x40 [<ffffffffffffffff>] 0xffffffffffffffff clustermd-gqjiang1:~ # cat /proc/20676/stack [<ffffffffc080951e>] dlm_lock_sync+0x8e/0xc0 [md_cluster] [<ffffffffc080957f>] lock_token+0x2f/0xa0 [md_cluster] [<ffffffffc0809622>] lock_comm+0x32/0x90 [md_cluster] [<ffffffffc08098f5>] sendmsg+0x15/0x30 [md_cluster] [<ffffffffc0809c0a>] resync_info_update+0x8a/0xc0 [md_cluster] [<ffffffffc08130ba>] raid1_sync_request+0xa9a/0xb10 [raid1] [<ffffffffc079b8ea>] md_do_sync+0xbaa/0xf90 [md_mod] [<ffffffffc0794b61>] md_thread+0x121/0x150 [md_mod] [<ffffffff9e0a5b3f>] kthread+0xff/0x140 [<ffffffff9e800235>] ret_from_fork+0x35/0x40 [<ffffffffffffffff>] 0xffffffffffffffff Reviewed-by: NeilBrown <neilb@suse.com> Signed-off-by: Guoqing Jiang <gqjiang@suse.com> Signed-off-by: Shaohua Li <shli@fb.com>
author: Guoqing Jiang <gqjiang@suse.com> 2018-08-31 10:05:57 +0800
committer: Shaohua Li <shli@fb.com> 2018-08-31 17:38:10 -0700
commit: 41a95041126522a921fb73df22cbdd520dfdebad (patch)
tree: 0e87791ff4e604d43523917f37fa71421d01d1c0 /drivers/md
parent: 1d0ffd264204eba1861865560f1f7f7a92919384 (diff)
download: talos-obmc-linux-41a95041126522a921fb73df22cbdd520dfdebad.tar.gz
talos-obmc-linux-41a95041126522a921fb73df22cbdd520dfdebad.zip
1 files changed, 5 insertions, 5 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 94329e03001e..0b2af6e74fc3 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1276,18 +1276,18 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 static int resync_finish(struct mddev *mddev)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int ret = 0;
 
 	clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
-	dlm_unlock_sync(cinfo->resync_lockres);
 
 	/*
 	 * If resync thread is interrupted so we can't say resync is finished,
 	 * another node will launch resync thread to continue.
 	 */
-	if (test_bit(MD_CLOSING, &mddev->flags))
-		return 0;
-	else
-		return resync_info_update(mddev, 0, 0);
+	if (!test_bit(MD_CLOSING, &mddev->flags))
+		ret = resync_info_update(mddev, 0, 0);
+	dlm_unlock_sync(cinfo->resync_lockres);
+	return ret;
 }
 
 static int area_resyncing(struct mddev *mddev, int direction,
author	Guoqing Jiang <gqjiang@suse.com>	2018-08-31 10:05:57 +0800
committer	Shaohua Li <shli@fb.com>	2018-08-31 17:38:10 -0700
commit	41a95041126522a921fb73df22cbdd520dfdebad (patch)
tree	0e87791ff4e604d43523917f37fa71421d01d1c0 /drivers/md
parent	1d0ffd264204eba1861865560f1f7f7a92919384 (diff)
download	talos-obmc-linux-41a95041126522a921fb73df22cbdd520dfdebad.tar.gz talos-obmc-linux-41a95041126522a921fb73df22cbdd520dfdebad.zip