From d4b3ea209d3ed70135a4110d52a9e073b2df9efd Mon Sep 17 00:00:00 2001 From: Chris Phan Date: Mon, 19 May 2014 16:33:49 -0500 Subject: MDIA: reset timeoutCnt for new MBA work requests Also make SW timeout error an info log with no callout Change-Id: I72da08eadf0c2a209493680cda46f82c6126faca CQ: SW262375 RTC: 106213 Backport: release-fips811 Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/11170 Tested-by: Jenkins Server Reviewed-by: Zane Shelley Reviewed-by: Sachin Gupta Reviewed-by: Prem Shanker Jha Reviewed-by: Bilicon Patil Reviewed-by: A. Patrick Williams III --- src/usr/diag/mdia/mdiasm.C | 65 ++++++++++++++++++++++++++++++++-------------- src/usr/diag/mdia/mdiasm.H | 6 ++--- 2 files changed, 49 insertions(+), 22 deletions(-) (limited to 'src/usr/diag') diff --git a/src/usr/diag/mdia/mdiasm.C b/src/usr/diag/mdia/mdiasm.C index 44d52d5b1..bda324767 100644 --- a/src/usr/diag/mdia/mdiasm.C +++ b/src/usr/diag/mdia/mdiasm.C @@ -279,27 +279,51 @@ void StateMachine::processCommandTimeout(const MonitorIDs & i_monitorIDs) // Pending maint cmd complete, reset timer if(mbaspa & ~mbaspamask) { - if((*wit)->timeoutCnt >= MBA_TIMEOUTCNT_MAX) + // Committing an info log to help debug SW timeout + if((*wit)->timeoutCnt >= MBA_TIMEOUT_LOG) { - MDIA_FAST("sm: work item %d timed out on: %x, " - "timeoutCnt: %d", *((*wit)->workItem), - get_huid(target), (*wit)->timeoutCnt); + MDIA_FAST("sm: commiting a SW timed out info log " + "for %x", get_huid(target)); + + /*@ + * @errortype + * @reasoncode MDIA::MAINT_COMMAND_SW_TIMED_OUT + * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL + * @moduleid MDIA::PROCESS_COMMAND_TIMEOUT + * @userData1 Associated memory diag work item + * @userData2 Target HUID + * @devdesc A maint command SW timed out + */ + err = new ErrlEntry(ERRL_SEV_INFORMATIONAL, + PROCESS_COMMAND_TIMEOUT, + MAINT_COMMAND_SW_TIMED_OUT, + *((*wit)->workItem), + get_huid(target)); + + // collect ffdc + + addTimeoutFFDC(target, err); + + errlCommit(err, MDIA_COMP_ID); + + 
// reset for the next logging + (*wit)->timeoutCnt = 0; } else { - MDIA_FAST("sm: work item %d reset timed out on: %x, " - "timeoutCnt: %d", *((*wit)->workItem), - get_huid(target), (*wit)->timeoutCnt); - // register a new timeout monitor - uint64_t monitorId = - getMonitor().addMonitor(MBA_TIMEOUT); - (*wit)->timer = monitorId; - // advance timeout counter (*wit)->timeoutCnt++; - - break; } + + MDIA_FAST("sm: work item %d reset SW timed out on: %x, " + "timeoutCnt: %d", *((*wit)->workItem), + get_huid(target), (*wit)->timeoutCnt); + // register a new timeout monitor + uint64_t monitorId = + getMonitor().addMonitor(MBA_TIMEOUT); + (*wit)->timer = monitorId; + + break; } // If maint cmd complete bit is not on, time out @@ -311,24 +335,26 @@ void StateMachine::processCommandTimeout(const MonitorIDs & i_monitorIDs) wkflprop = *wit; // log a timeout event - MDIA_ERR("sm: command %p: %d timed out on: %x", + MDIA_ERR("sm: command %p: %d HW timed out on: %x", stopCmds.back(), *((*wit)->workItem), get_huid(target)); /*@ * @errortype - * @reasoncode MDIA::MAINT_COMMAND_TIMED_OUT + * @reasoncode MDIA::MAINT_COMMAND_HW_TIMED_OUT * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE * @moduleid MDIA::PROCESS_COMMAND_TIMEOUT * @userData1 Associated memory diag work item - * @devdesc A maint command timed out + * @userData2 Target HUID + * @devdesc A maint command HW timed out */ err = new ErrlEntry( ERRL_SEV_UNRECOVERABLE, PROCESS_COMMAND_TIMEOUT, - MAINT_COMMAND_TIMED_OUT, - *((*wit)->workItem), 0); + MAINT_COMMAND_HW_TIMED_OUT, + *((*wit)->workItem), + get_huid(target)); // collect ffdc @@ -777,6 +803,7 @@ errlHndl_t StateMachine::doMaintCommand(WorkFlowProperties & i_wfp) uint64_t monitorId = getMonitor().addMonitor(MBA_TIMEOUT); i_wfp.timer = monitorId; + i_wfp.timeoutCnt = 0; // reset for new work item workItem = *i_wfp.workItem; restart = i_wfp.restartCommand; targetMba = getTarget(i_wfp); diff --git a/src/usr/diag/mdia/mdiasm.H b/src/usr/diag/mdia/mdiasm.H index 
6372ee7a6..9299dd123 100644 --- a/src/usr/diag/mdia/mdiasm.H +++ b/src/usr/diag/mdia/mdiasm.H @@ -41,9 +41,9 @@ namespace MDIA //MBA timeout value - 30 secs static const uint64_t MBA_TIMEOUT = 30000000000; -//MBA timeout count max before timing out (~3 min) -//TODO: RTC: 106213 enhance memdiags timeout detection -static const uint64_t MBA_TIMEOUTCNT_MAX = 6; +//Commit an info log for SW timeout every 10 mins +static const uint64_t MBA_TIMEOUT_LOG = + ( 10 * 60000000000 ) / MBA_TIMEOUT; /** * @brief work flow phases -- cgit v1.2.1