summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorGuoqing Jiang <gqjiang@suse.com>2015-07-10 17:01:17 +0800
committerNeilBrown <neilb@suse.com>2015-08-31 19:41:56 +0200
commitb5ef56789b808a57fcd07271ff509911662fd877 (patch)
tree829be9cdca65a521f8cba1caf765f7acba59310c /drivers/md
parentb83d51c0785c34d552111f38fbecbe00cd58b913 (diff)
downloadtalos-op-linux-b5ef56789b808a57fcd07271ff509911662fd877.tar.gz
talos-op-linux-b5ef56789b808a57fcd07271ff509911662fd877.zip
md-cluster: add the error check if failed to get dlm lock
In complicated cluster environment, it is possible that the dlm lock couldn't be get/convert on purpose, the related err info is added for better debug potential issue. For lockres_free, if the lock is blocking by a lock request or conversion request, then dlm_unlock just put it back to grant queue, so need to ensure the lock is free finally. Signed-off-by: Guoqing Jiang <gqjiang@suse.com> Signed-off-by: NeilBrown <neilb@suse.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/md-cluster.c41
1 files changed, 35 insertions, 6 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 2a57f193b103..b80a689aad04 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -166,10 +166,24 @@ out_err:
static void lockres_free(struct dlm_lock_resource *res)
{
+ int ret;
+
if (!res)
return;
- dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ /* cancel a lock request or a conversion request that is blocked */
+ res->flags |= DLM_LKF_CANCEL;
+retry:
+ ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+ if (unlikely(ret != 0)) {
+ pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
+
+ /* if a lock conversion is cancelled, then the lock is put
+ * back to grant queue, need to ensure it is unlocked */
+ if (ret == -DLM_ECANCEL)
+ goto retry;
+ }
+ res->flags &= ~DLM_LKF_CANCEL;
wait_for_completion(&res->completion);
kfree(res->name);
@@ -474,6 +488,7 @@ static void recv_daemon(struct md_thread *thread)
struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
struct cluster_msg msg;
+ int ret;
/*get CR on Message*/
if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -486,13 +501,21 @@ static void recv_daemon(struct md_thread *thread)
process_recvd_msg(thread->mddev, &msg);
/*release CR on ack_lockres*/
- dlm_unlock_sync(ack_lockres);
+ ret = dlm_unlock_sync(ack_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock ack failed return %d\n", ret);
/*up-convert to PR on message_lockres*/
- dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+ ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+ if (unlikely(ret != 0))
+ pr_info("lock PR on msg failed return %d\n", ret);
/*get CR on ack_lockres again*/
- dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+ if (unlikely(ret != 0))
+ pr_info("lock CR on ack failed return %d\n", ret);
/*release CR on message_lockres*/
- dlm_unlock_sync(message_lockres);
+ ret = dlm_unlock_sync(message_lockres);
+ if (unlikely(ret != 0))
+ pr_info("unlock msg failed return %d\n", ret);
}
/* lock_comm()
@@ -567,7 +590,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
}
failed_ack:
- dlm_unlock_sync(cinfo->message_lockres);
+ error = dlm_unlock_sync(cinfo->message_lockres);
+ if (unlikely(error != 0)) {
+ pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
+ error);
+ /* in case the message can't be released due to some reason */
+ goto failed_ack;
+ }
failed_message:
return error;
}
OpenPOWER on IntegriCloud