diff options
author | Chris Phan <cphan@us.ibm.com> | 2014-05-19 16:33:49 -0500 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2014-06-19 18:22:54 -0500 |
commit | d4b3ea209d3ed70135a4110d52a9e073b2df9efd (patch) | |
tree | 0ad3c9da0cb3329d5196137a175316fe6228e835 /src/usr/diag/mdia | |
parent | e013336ca98f91d0e6bb6cc7b95a79a265884718 (diff) | |
download | talos-hostboot-d4b3ea209d3ed70135a4110d52a9e073b2df9efd.tar.gz talos-hostboot-d4b3ea209d3ed70135a4110d52a9e073b2df9efd.zip |
MDIA: reset timeoutCnt for new MBA work requests
Also make SW timeout error an info log with no callout
Change-Id: I72da08eadf0c2a209493680cda46f82c6126faca
CQ: SW262375
RTC: 106213
Backport: release-fips811
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/11170
Tested-by: Jenkins Server
Reviewed-by: Zane Shelley <zshelle@us.ibm.com>
Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com>
Reviewed-by: Prem Shanker Jha <premjha2@in.ibm.com>
Reviewed-by: Bilicon Patil <bilpatil@in.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src/usr/diag/mdia')
-rw-r--r-- | src/usr/diag/mdia/mdiasm.C | 65 | ||||
-rw-r--r-- | src/usr/diag/mdia/mdiasm.H | 6 |
2 files changed, 49 insertions, 22 deletions
diff --git a/src/usr/diag/mdia/mdiasm.C b/src/usr/diag/mdia/mdiasm.C index 44d52d5b1..bda324767 100644 --- a/src/usr/diag/mdia/mdiasm.C +++ b/src/usr/diag/mdia/mdiasm.C @@ -279,27 +279,51 @@ void StateMachine::processCommandTimeout(const MonitorIDs & i_monitorIDs) // Pending maint cmd complete, reset timer if(mbaspa & ~mbaspamask) { - if((*wit)->timeoutCnt >= MBA_TIMEOUTCNT_MAX) + // Commiting an info log to help debug SW timeout + if((*wit)->timeoutCnt >= MBA_TIMEOUT_LOG) { - MDIA_FAST("sm: work item %d timed out on: %x, " - "timeoutCnt: %d", *((*wit)->workItem), - get_huid(target), (*wit)->timeoutCnt); + MDIA_FAST("sm: commiting a SW timed out info log " + "for %x", get_huid(target)); + + /*@ + * @errortype + * @reasoncode MDIA::MAINT_COMMAND_SW_TIMED_OUT + * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL + * @moduleid MDIA::PROCESS_COMMAND_TIMEOUT + * @userData1 Associated memory diag work item + * @userData2 Target HUID + * @devdesc A maint command SW timed out + */ + err = new ErrlEntry(ERRL_SEV_INFORMATIONAL, + PROCESS_COMMAND_TIMEOUT, + MAINT_COMMAND_SW_TIMED_OUT, + *((*wit)->workItem), + get_huid(target)); + + // collect ffdc + + addTimeoutFFDC(target, err); + + errlCommit(err, MDIA_COMP_ID); + + // reset for the next logging + (*wit)->timeoutCnt = 0; } else { - MDIA_FAST("sm: work item %d reset timed out on: %x, " - "timeoutCnt: %d", *((*wit)->workItem), - get_huid(target), (*wit)->timeoutCnt); - // register a new timeout monitor - uint64_t monitorId = - getMonitor().addMonitor(MBA_TIMEOUT); - (*wit)->timer = monitorId; - // advance timeout counter (*wit)->timeoutCnt++; - - break; } + + MDIA_FAST("sm: work item %d reset SW timed out on: %x, " + "timeoutCnt: %d", *((*wit)->workItem), + get_huid(target), (*wit)->timeoutCnt); + // register a new timeout monitor + uint64_t monitorId = + getMonitor().addMonitor(MBA_TIMEOUT); + (*wit)->timer = monitorId; + + break; } // If maint cmd complete bit is not on, time out @@ -311,24 +335,26 @@ void StateMachine::processCommandTimeout(const MonitorIDs & i_monitorIDs) wkflprop = *wit; // log a timeout event - MDIA_ERR("sm: command %p: %d timed out on: %x", + MDIA_ERR("sm: command %p: %d HW timed out on: %x", stopCmds.back(), *((*wit)->workItem), get_huid(target)); /*@ * @errortype - * @reasoncode MDIA::MAINT_COMMAND_TIMED_OUT + * @reasoncode MDIA::MAINT_COMMAND_HW_TIMED_OUT * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE * @moduleid MDIA::PROCESS_COMMAND_TIMEOUT * @userData1 Associated memory diag work item - * @devdesc A maint command timed out + * @userData2 Target HUID + * @devdesc A maint command HW timed out */ err = new ErrlEntry( ERRL_SEV_UNRECOVERABLE, PROCESS_COMMAND_TIMEOUT, - MAINT_COMMAND_TIMED_OUT, - *((*wit)->workItem), 0); + MAINT_COMMAND_HW_TIMED_OUT, + *((*wit)->workItem), + get_huid(target)); // collect ffdc @@ -777,6 +803,7 @@ errlHndl_t StateMachine::doMaintCommand(WorkFlowProperties & i_wfp) uint64_t monitorId = getMonitor().addMonitor(MBA_TIMEOUT); i_wfp.timer = monitorId; + i_wfp.timeoutCnt = 0; // reset for new work item workItem = *i_wfp.workItem; restart = i_wfp.restartCommand; targetMba = getTarget(i_wfp); diff --git a/src/usr/diag/mdia/mdiasm.H b/src/usr/diag/mdia/mdiasm.H index 6372ee7a6..9299dd123 100644 --- a/src/usr/diag/mdia/mdiasm.H +++ b/src/usr/diag/mdia/mdiasm.H @@ -41,9 +41,9 @@ namespace MDIA //MBA timeout value - 30 secs static const uint64_t MBA_TIMEOUT = 30000000000; -//MBA timeout count max before timing out (~3 min) -//TODO: RTC: 106213 enhance memdiags timeout detection -static const uint64_t MBA_TIMEOUTCNT_MAX = 6; +//Commit an info log for SW timeout every 10 mins +static const uint64_t MBA_TIMEOUT_LOG = + ( 10 * 60000000000 ) / MBA_TIMEOUT; /** * @brief work flow phases |