summary | refs | log | tree | commit | diff | stats
path: root/src/usr/diag/mdia
diff options
context:
space:
mode:
author: Chris Phan <cphan@us.ibm.com> 2014-05-19 16:33:49 -0500
committer: A. Patrick Williams III <iawillia@us.ibm.com> 2014-06-19 18:22:54 -0500
commit: d4b3ea209d3ed70135a4110d52a9e073b2df9efd (patch)
tree: 0ad3c9da0cb3329d5196137a175316fe6228e835 /src/usr/diag/mdia
parent: e013336ca98f91d0e6bb6cc7b95a79a265884718 (diff)
download: talos-hostboot-d4b3ea209d3ed70135a4110d52a9e073b2df9efd.tar.gz
          talos-hostboot-d4b3ea209d3ed70135a4110d52a9e073b2df9efd.zip
MDIA: reset timeoutCnt for new MBA work requests
Also make SW timeout error an info log with no callout

Change-Id: I72da08eadf0c2a209493680cda46f82c6126faca
CQ: SW262375
RTC: 106213
Backport: release-fips811
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/11170
Tested-by: Jenkins Server
Reviewed-by: Zane Shelley <zshelle@us.ibm.com>
Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com>
Reviewed-by: Prem Shanker Jha <premjha2@in.ibm.com>
Reviewed-by: Bilicon Patil <bilpatil@in.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src/usr/diag/mdia')
-rw-r--r--  src/usr/diag/mdia/mdiasm.C | 65
-rw-r--r--  src/usr/diag/mdia/mdiasm.H |  6
2 files changed, 49 insertions, 22 deletions
diff --git a/src/usr/diag/mdia/mdiasm.C b/src/usr/diag/mdia/mdiasm.C
index 44d52d5b1..bda324767 100644
--- a/src/usr/diag/mdia/mdiasm.C
+++ b/src/usr/diag/mdia/mdiasm.C
@@ -279,27 +279,51 @@ void StateMachine::processCommandTimeout(const MonitorIDs & i_monitorIDs)
// Pending maint cmd complete, reset timer
if(mbaspa & ~mbaspamask)
{
- if((*wit)->timeoutCnt >= MBA_TIMEOUTCNT_MAX)
+ // Commiting an info log to help debug SW timeout
+ if((*wit)->timeoutCnt >= MBA_TIMEOUT_LOG)
{
- MDIA_FAST("sm: work item %d timed out on: %x, "
- "timeoutCnt: %d", *((*wit)->workItem),
- get_huid(target), (*wit)->timeoutCnt);
+ MDIA_FAST("sm: commiting a SW timed out info log "
+ "for %x", get_huid(target));
+
+ /*@
+ * @errortype
+ * @reasoncode MDIA::MAINT_COMMAND_SW_TIMED_OUT
+ * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL
+ * @moduleid MDIA::PROCESS_COMMAND_TIMEOUT
+ * @userData1 Associated memory diag work item
+ * @userData2 Target HUID
+ * @devdesc A maint command SW timed out
+ */
+ err = new ErrlEntry(ERRL_SEV_INFORMATIONAL,
+ PROCESS_COMMAND_TIMEOUT,
+ MAINT_COMMAND_SW_TIMED_OUT,
+ *((*wit)->workItem),
+ get_huid(target));
+
+ // collect ffdc
+
+ addTimeoutFFDC(target, err);
+
+ errlCommit(err, MDIA_COMP_ID);
+
+ // reset for the next logging
+ (*wit)->timeoutCnt = 0;
}
else
{
- MDIA_FAST("sm: work item %d reset timed out on: %x, "
- "timeoutCnt: %d", *((*wit)->workItem),
- get_huid(target), (*wit)->timeoutCnt);
- // register a new timeout monitor
- uint64_t monitorId =
- getMonitor().addMonitor(MBA_TIMEOUT);
- (*wit)->timer = monitorId;
-
// advance timeout counter
(*wit)->timeoutCnt++;
-
- break;
}
+
+ MDIA_FAST("sm: work item %d reset SW timed out on: %x, "
+ "timeoutCnt: %d", *((*wit)->workItem),
+ get_huid(target), (*wit)->timeoutCnt);
+ // register a new timeout monitor
+ uint64_t monitorId =
+ getMonitor().addMonitor(MBA_TIMEOUT);
+ (*wit)->timer = monitorId;
+
+ break;
}
// If maint cmd complete bit is not on, time out
@@ -311,24 +335,26 @@ void StateMachine::processCommandTimeout(const MonitorIDs & i_monitorIDs)
wkflprop = *wit;
// log a timeout event
- MDIA_ERR("sm: command %p: %d timed out on: %x",
+ MDIA_ERR("sm: command %p: %d HW timed out on: %x",
stopCmds.back(),
*((*wit)->workItem),
get_huid(target));
/*@
* @errortype
- * @reasoncode MDIA::MAINT_COMMAND_TIMED_OUT
+ * @reasoncode MDIA::MAINT_COMMAND_HW_TIMED_OUT
* @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE
* @moduleid MDIA::PROCESS_COMMAND_TIMEOUT
* @userData1 Associated memory diag work item
- * @devdesc A maint command timed out
+ * @userData2 Target HUID
+ * @devdesc A maint command HW timed out
*/
err = new ErrlEntry(
ERRL_SEV_UNRECOVERABLE,
PROCESS_COMMAND_TIMEOUT,
- MAINT_COMMAND_TIMED_OUT,
- *((*wit)->workItem), 0);
+ MAINT_COMMAND_HW_TIMED_OUT,
+ *((*wit)->workItem),
+ get_huid(target));
// collect ffdc
@@ -777,6 +803,7 @@ errlHndl_t StateMachine::doMaintCommand(WorkFlowProperties & i_wfp)
uint64_t monitorId = getMonitor().addMonitor(MBA_TIMEOUT);
i_wfp.timer = monitorId;
+ i_wfp.timeoutCnt = 0; // reset for new work item
workItem = *i_wfp.workItem;
restart = i_wfp.restartCommand;
targetMba = getTarget(i_wfp);
diff --git a/src/usr/diag/mdia/mdiasm.H b/src/usr/diag/mdia/mdiasm.H
index 6372ee7a6..9299dd123 100644
--- a/src/usr/diag/mdia/mdiasm.H
+++ b/src/usr/diag/mdia/mdiasm.H
@@ -41,9 +41,9 @@ namespace MDIA
//MBA timeout value - 30 secs
static const uint64_t MBA_TIMEOUT = 30000000000;
-//MBA timeout count max before timing out (~3 min)
-//TODO: RTC: 106213 enhance memdiags timeout detection
-static const uint64_t MBA_TIMEOUTCNT_MAX = 6;
+//Commit an info log for SW timeout every 10 mins
+static const uint64_t MBA_TIMEOUT_LOG =
+ ( 10 * 60000000000 ) / MBA_TIMEOUT;
/**
* @brief work flow phases
OpenPOWER on IntegriCloud