summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Chris Cain <cjcain@us.ibm.com> 2015-09-04 10:18:52 -0500
committer A. Patrick Williams III <iawillia@us.ibm.com> 2015-09-18 20:37:54 -0500
commit 978b2c75e301281fe4882dd30a2e365e6106faa4 (patch)
tree 8897560408b39ba7418ebe775ebafd2037b49e83
parent ef63b1a277b8ad231b308c0c197d663bd4b0eb13 (diff)
download talos-hostboot-978b2c75e301281fe4882dd30a2e365e6106faa4.tar.gz
talos-hostboot-978b2c75e301281fe4882dd30a2e365e6106faa4.zip
Prevent OCC exception from logging duplicate errors
Change-Id: I3262c42c906fc9cfa879e7df0e501e2f7fe1e2f3
CQ:SW320752
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/20578
Tested-by: Jenkins Server
Tested-by: Jenkins OP Build CI
Tested-by: Jenkins OP HW
Tested-by: FSP CI Jenkins
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: SHELDON R. BAILEY <baileysh@us.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
-rw-r--r-- src/include/usr/htmgt/htmgt_reasoncodes.H 1
-rw-r--r-- src/usr/htmgt/htmgt.C 23
-rw-r--r-- src/usr/htmgt/htmgt_cfgdata.C 15
-rw-r--r-- src/usr/htmgt/htmgt_occ.C 63
-rw-r--r-- src/usr/htmgt/htmgt_occ.H 24
-rw-r--r-- src/usr/htmgt/htmgt_occcmd.C 102
-rw-r--r-- src/usr/htmgt/htmgt_poll.C 130
7 files changed, 252 insertions, 106 deletions
diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H
index 5fc1b9638..d6a106949 100644
--- a/src/include/usr/htmgt/htmgt_reasoncodes.H
+++ b/src/include/usr/htmgt/htmgt_reasoncodes.H
@@ -63,6 +63,7 @@ namespace HTMGT
HTMGT_RC_OT_THROTTLE_INVALID_N = HTMGT_COMP_ID | 0x04,
HTMGT_RC_OCC_NOT_READY = HTMGT_COMP_ID | 0x05,
HTMGT_RC_ATTRIBUTE_ERROR = HTMGT_COMP_ID | 0x06,
+ HTMGT_RC_OCC_EXCEPTION = HTMGT_COMP_ID | 0x0E,
HTMGT_RC_NO_SUPPORT = HTMGT_COMP_ID | 0x0F,
HTMGT_RC_OCC_RESET = HTMGT_COMP_ID | 0x15,
HTMGT_RC_OCC_CRIT_FAILURE = HTMGT_COMP_ID | 0x16,
diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C
index 257f2df04..d0ce39a1a 100644
--- a/src/usr/htmgt/htmgt.C
+++ b/src/usr/htmgt/htmgt.C
@@ -102,8 +102,18 @@ namespace HTMGT
l_err = OccManager::sendOccPoll();
if (l_err)
{
- // Continue even if failed (poll will be retried)
- ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
+ if (OccManager::occNeedsReset())
+ {
+ // No need to continue if a reset is required
+ TMGT_ERR("sendOccConfigData(): OCCs need to "
+ "be reset");
+ break;
+ }
+ else
+ {
+ // Continue even if failed (will be retried)
+ ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
+ }
}
// Send ALL config data
@@ -415,17 +425,22 @@ namespace HTMGT
{
// Create an elog so the user knows the cmd failed.
TMGT_ERR("enableOccActuation(): System is in safe mode");
+ uint32_t safeInstance = 0;
+ uint32_t safeRc = OccManager::getSafeModeReason(safeInstance);
/*@
* @errortype
* @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
* @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION
- * @userdata1 OCC activate [1==true][0==false]
+ * @userdata1[0:31] OCC activate [1==true][0==false]
+ * @userdata1[32:63] return code triggering safe mode
+ * @userdata2[0:31] safeMode flag
+ * @userdata2[32:63] OCC instance
* @devdesc Operation not allowed, system is in safe mode
*/
bldErrLog(l_err,
HTMGT_MOD_ENABLE_OCC_ACTUATION,
HTMGT_RC_OCC_CRIT_FAILURE,
- 0, i_occActivation, 0, safeMode,
+ i_occActivation, safeRc, safeMode, safeInstance,
ERRORLOG::ERRL_SEV_UNRECOVERABLE);
}
diff --git a/src/usr/htmgt/htmgt_cfgdata.C b/src/usr/htmgt/htmgt_cfgdata.C
index 3c5ef6b24..5c12a1a76 100644
--- a/src/usr/htmgt/htmgt_cfgdata.C
+++ b/src/usr/htmgt/htmgt_cfgdata.C
@@ -196,7 +196,7 @@ namespace HTMGT
break;
default:
- TMGT_ERR("send_occ_config_data: Unsupported"
+ TMGT_ERR("sendOccConfigData: Unsupported"
" format type 0x%02X",
format);
cmdDataLen = 0;
@@ -204,7 +204,7 @@ namespace HTMGT
if (cmdDataLen > 0)
{
- TMGT_INF("send_occ_config_data: Sending config"
+ TMGT_INF("sendOccConfigData: Sending config"
" 0x%02X to OCC%d",
format, occInstance);
OccCmd cmd(occ, OCC_CMD_SETUP_CFG_DATA,
@@ -212,7 +212,7 @@ namespace HTMGT
errlHndl_t l_err = cmd.sendOccCmd();
if (l_err != NULL)
{
- TMGT_ERR("send_occ_config_data: OCC%d cfg "
+ TMGT_ERR("sendOccConfigData: OCC%d cfg "
"format 0x%02X failed with rc=0x%04X",
occInstance, format,
l_err->reasonCode());
@@ -222,7 +222,7 @@ namespace HTMGT
{
if (OCC_RC_SUCCESS != cmd.getRspStatus())
{
- TMGT_ERR("send_occ_config_data: OCC%d cfg "
+ TMGT_ERR("sendOccConfigData: OCC%d cfg "
"format 0x%02X had bad rsp status"
" 0x%02X for sysConfig",
occInstance, format,
@@ -239,12 +239,17 @@ namespace HTMGT
}
} // if (sendData)
+ if (OccManager::occNeedsReset())
+ {
+ TMGT_ERR("sendOccConfigData(): OCCs need to be reset");
+ }
+
} // for each config format
} // for each OCC
}
- } // end send_occ_config_data()
+ } // end sendOccConfigData()
/** OCC configuration data message versions */
diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C
index 52a27c14b..e23c0360f 100644
--- a/src/usr/htmgt/htmgt_occ.C
+++ b/src/usr/htmgt/htmgt_occ.C
@@ -63,6 +63,8 @@ namespace HTMGT
iv_target(i_target),
iv_lastPollValid(false),
iv_occsPresent(1 << i_instance),
+ iv_resetReason(OCC_RESET_REASON_NONE),
+ iv_exceptionLogged(0),
iv_resetCount(0),
iv_version(0x01)
{
@@ -235,6 +237,7 @@ namespace HTMGT
iv_failed = false;
iv_lastPollValid = false;
iv_resetReason = OCC_RESET_REASON_NONE;
+ iv_exceptionLogged = 0;
}
@@ -622,6 +625,7 @@ namespace HTMGT
{
TMGT_INF("_setOccState: All OCCs have reached state "
"0x%02X", requestedState);
+ iv_state = requestedState;
if (OCC_STATE_ACTIVE == requestedState)
{
@@ -769,13 +773,15 @@ namespace HTMGT
* @errortype
* @moduleid HTMGT_MOD_OCC_RESET
* @reasoncode HTMGT_RC_OCC_RESET_THREHOLD
+ * @userdata1[32:63] return code triggering safe mode
+ * @userdata2[32:63] OCC instance
* @devdesc OCC reset threshold reached.
* Leaving OCCs in reset state
*/
bldErrLog(err,
HTMGT_MOD_OCC_RESET,
HTMGT_RC_OCC_CRIT_FAILURE,
- 0, 0, 0, 0,
+ 0, cv_safeReturnCode, 0, cv_safeOccInstance,
ERRORLOG::ERRL_SEV_UNRECOVERABLE);
}
@@ -910,6 +916,13 @@ namespace HTMGT
}
+ uint32_t OccManager::_getSafeModeReason(uint32_t & o_instance)
+ {
+ o_instance = cv_safeOccInstance;
+ return cv_safeReturnCode;
+ }
+
+
bool OccManager::_occNeedsReset()
{
bool needsReset = false;
@@ -1039,6 +1052,40 @@ namespace HTMGT
return err;
}
+ // Consolidate all OCC states
+ void OccManager::_syncOccStates()
+ {
+ occStateId currentState = OCC_STATE_NO_CHANGE;
+
+ for(occList_t::const_iterator occ_itr = iv_occArray.begin();
+ (occ_itr != iv_occArray.end());
+ ++occ_itr)
+ {
+ Occ * occ = *occ_itr;
+ if (OCC_STATE_NO_CHANGE == currentState)
+ {
+ currentState = occ->getState();
+ }
+ else
+ {
+ if (currentState != occ->getState())
+ {
+ // States do not match yet...
+ currentState = OCC_STATE_NO_CHANGE;
+ break;
+ }
+ }
+ }
+ if (OCC_STATE_NO_CHANGE != currentState)
+ {
+ if (iv_state != currentState)
+ {
+ TMGT_INF("syncOccStates: All OCCs are in 0x%02X", currentState);
+ iv_state = currentState;
+ }
+ }
+ }
+
uint8_t OccManager::getNumOccs()
{
@@ -1084,16 +1131,22 @@ namespace HTMGT
void OccManager::waitForOccCheckpoint()
{
- return Singleton<OccManager>::instance()._waitForOccCheckpoint();
+ Singleton<OccManager>::instance()._waitForOccCheckpoint();
}
void OccManager::updateSafeModeReason(uint32_t i_src,
uint32_t i_instance)
{
- return Singleton<OccManager>::instance().
+ Singleton<OccManager>::instance().
_updateSafeModeReason(i_src, i_instance);
}
+ uint32_t OccManager::getSafeModeReason(uint32_t & o_instance)
+ {
+ return Singleton<OccManager>::instance().
+ _getSafeModeReason(o_instance);
+ }
+
bool OccManager::occNeedsReset()
{
return Singleton<OccManager>::instance()._occNeedsReset();
@@ -1124,6 +1177,10 @@ namespace HTMGT
Singleton<OccManager>::instance()._setPstateTable(i_useNormal);
}
+ void OccManager::syncOccStates()
+ {
+ Singleton<OccManager>::instance()._syncOccStates();
+ }
} // end namespace
diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H
index 267da6896..3e9e87b3f 100644
--- a/src/usr/htmgt/htmgt_occ.H
+++ b/src/usr/htmgt/htmgt_occ.H
@@ -343,6 +343,8 @@ namespace HTMGT
occResetReason iv_resetReason;
+ // Value of last exception committed (to prevent duplicates)
+ uint8_t iv_exceptionLogged;
/**
* @brief Clear flags after OCC has been reset
@@ -491,6 +493,16 @@ namespace HTMGT
/**
+ * @brief Return the reason the system entered safe mode
+ *
+ * @param[out] o_instance OCC instance
+ *
+ * @return SRC which triggered safe mode
+ */
+ static uint32_t getSafeModeReason(uint32_t & o_instance);
+
+
+ /**
* @brief Check if any OCCs need to be reset
*
* @return true if any OCC needs to be reset
@@ -555,6 +567,13 @@ namespace HTMGT
static bool occFailed();
+ /**
+ * @brief Update OCC manager state with consolidated OCC state
+ *
+ */
+ static void syncOccStates();
+
+
private:
typedef std::vector<Occ*> occList_t;
@@ -641,6 +660,9 @@ namespace HTMGT
void _updateSafeModeReason(uint32_t i_src,
uint32_t i_instance);
+ /** See getSafeModeReason() above */
+ uint32_t _getSafeModeReason(uint32_t & o_instance);
+
/** See occNeedsReset() above */
bool _occNeedsReset();
@@ -665,6 +687,8 @@ namespace HTMGT
iv_normalPstateTables = i_useNormal;
};
+ /** See syncOccStates() above */
+ void _syncOccStates();
};
typedef Singleton<OccManager> occMgr;
diff --git a/src/usr/htmgt/htmgt_occcmd.C b/src/usr/htmgt/htmgt_occcmd.C
index 3752bdd92..d842e8265 100644
--- a/src/usr/htmgt/htmgt_occcmd.C
+++ b/src/usr/htmgt/htmgt_occcmd.C
@@ -473,15 +473,46 @@ namespace HTMGT
((false == l_commEstablished) &&
(OCC_CMD_POLL == iv_OccCmd.cmdType)) )
{
- iv_RetryCmd = false;
- do
+ if (0 == iv_Occ->iv_exceptionLogged)
{
- // Send the command and receive the response
- l_errlHndl = writeOccCmd();
-
- processOccResponse(l_errlHndl, cmdTraced);
-
- } while (iv_RetryCmd);
+ iv_RetryCmd = false;
+ do
+ {
+ // Send the command and receive the response
+ l_errlHndl = writeOccCmd();
+
+ // process response if OCC did not hit an exception
+ if (0 == iv_Occ->iv_exceptionLogged)
+ {
+ processOccResponse(l_errlHndl, cmdTraced);
+ }
+
+ // skip retry if an exception was logged
+ } while ((iv_RetryCmd) &&
+ (0 == iv_Occ->iv_exceptionLogged));
+ }
+ else
+ {
+ // OCC has already logged an exception, no need to send
+ TMGT_ERR("Skipping 0x%02X cmd since OCC has already "
+ "logged an exception 0x%04X",
+ iv_OccCmd.cmdType, iv_Occ->iv_exceptionLogged);
+ /*@
+ * @errortype
+ * @reasoncode HTMGT_RC_OCC_EXCEPTION
+ * @moduleid HTMGT_MOD_SEND_OCC_CMD
+ * @userdata1[0:31] OCC command
+ * @userdata1[32:63] comm established
+ * @userdata2[0:31] OCC state
+ * @userdata2[32:63] exception
+ * @devdesc Unable to send cmd to OCC due to exception
+ */
+ bldErrLog(l_errlHndl, HTMGT_MOD_SEND_OCC_CMD,
+ HTMGT_RC_OCC_EXCEPTION,
+ iv_OccCmd.cmdType, l_commEstablished,
+ l_occState, iv_Occ->iv_exceptionLogged,
+ ERRORLOG::ERRL_SEV_INFORMATIONAL);
+ }
}
else
{
@@ -813,29 +844,38 @@ namespace HTMGT
TMGT_BIN("OCC Exception Data (up to 64 bytes)",
sramRspPtr, std::min(exceptionLength,(uint32_t)64));
- /*@
- * @errortype
- * @reasoncode HTMGT_RC_INTERNAL_ERROR
- * @moduleid HTMGT_MOD_HANLDE_OCC_EXCEPTION
- * @userdata1[0-31] rsp status
- * @userdata1[32-63] exception data length
- * @userdata2[0-31] OCC instance
- * @userdata2[32-63] exception data
- * @devdesc OCC reported exception
- */
- errlHndl_t l_excErr = NULL;
- bldErrLog(l_excErr, HTMGT_MOD_HANLDE_OCC_EXCEPTION,
- (htmgtReasonCode)(OCCC_COMP_ID | exceptionType),
- exceptionType, exceptionDataLength,
- iv_Occ->iv_instance, UINT32_GET(&sramRspPtr[5]),
- ERRORLOG::ERRL_SEV_UNRECOVERABLE);
- l_excErr->addFFDC(OCCC_COMP_ID,
- sramRspPtr,
- std::min(exceptionLength,(uint32_t)MAX_FFDC),
- 1, // version
- exceptionType); // subsection
- ERRORLOG::errlCommit(l_excErr, HTMGT_COMP_ID);
-
+ if (iv_Occ->iv_exceptionLogged != exceptionType)
+ {
+ /*@
+ * @errortype
+ * @reasoncode HTMGT_RC_INTERNAL_ERROR
+ * @moduleid HTMGT_MOD_HANLDE_OCC_EXCEPTION
+ * @userdata1[0-31] rsp status
+ * @userdata1[32-63] exception data length
+ * @userdata2[0-31] OCC instance
+ * @userdata2[32-63] exception data
+ * @devdesc OCC reported exception
+ */
+ errlHndl_t l_excErr = NULL;
+ bldErrLog(l_excErr, HTMGT_MOD_HANLDE_OCC_EXCEPTION,
+ (htmgtReasonCode)(OCCC_COMP_ID | exceptionType),
+ exceptionType, exceptionDataLength,
+ iv_Occ->iv_instance, UINT32_GET(&sramRspPtr[5]),
+ ERRORLOG::ERRL_SEV_UNRECOVERABLE);
+ l_excErr->addFFDC(OCCC_COMP_ID,
+ sramRspPtr,
+ std::min(exceptionLength,
+ (uint32_t)MAX_FFDC),
+ 1, // version
+ exceptionType); // subsection
+ ERRORLOG::errlCommit(l_excErr, HTMGT_COMP_ID);
+
+ // Save exception so we don't log it again
+ iv_Occ->iv_exceptionLogged = exceptionType;
+ // This OCC needs to be reset to recover
+ iv_Occ->failed(true);
+ iv_Occ->iv_needsReset = true;
+ }
}
}
#endif
diff --git a/src/usr/htmgt/htmgt_poll.C b/src/usr/htmgt/htmgt_poll.C
index 1b1cc7f1d..2bba2aca3 100644
--- a/src/usr/htmgt/htmgt_poll.C
+++ b/src/usr/htmgt/htmgt_poll.C
@@ -84,87 +84,91 @@ namespace HTMGT
errlHndl_t err = NULL;
uint8_t * poll_rsp = NULL;
- TMGT_INF("sendOccPoll: Polling OCC%d", iv_instance);
- bool continuePolling = false;
- size_t elogCount = 10;
-
- do
+ // Only send poll if OCC has not logged an exception
+ if (0 == iv_exceptionLogged)
{
- // create 1 byte buffer for poll command data
- const uint8_t l_cmdData[1] = { 0x10 /*version*/ };
+ TMGT_INF("sendOccPoll: Polling OCC%d", iv_instance);
+ bool continuePolling = false;
+ size_t elogCount = 10;
- OccCmd cmd(this,
- OCC_CMD_POLL,
- sizeof(l_cmdData),
- l_cmdData);
-
- err = cmd.sendOccCmd();
- if (err != NULL)
+ do
{
- // Poll failed
- TMGT_ERR("sendOccPoll: OCC%d poll failed with rc=0x%04X",
- iv_instance,
- err->reasonCode());
+ // create 1 byte buffer for poll command data
+ const uint8_t l_cmdData[1] = { 0x10 /*version*/ };
- continuePolling = false;
- }
- else
- {
- // Poll succeeded, check response
- uint32_t poll_rsp_size = cmd.getResponseData(poll_rsp);
- if (poll_rsp_size >= OCC_POLL_DATA_MIN_SIZE)
+ OccCmd cmd(this,
+ OCC_CMD_POLL,
+ sizeof(l_cmdData),
+ l_cmdData);
+
+ err = cmd.sendOccCmd();
+ if (err != NULL)
{
- if (i_flushAllErrors)
+ // Poll failed
+ TMGT_ERR("sendOccPoll: OCC%d poll failed with rc=0x%04X",
+ iv_instance,
+ err->reasonCode());
+
+ continuePolling = false;
+ }
+ else
+ {
+ // Poll succeeded, check response
+ uint32_t poll_rsp_size = cmd.getResponseData(poll_rsp);
+ if (poll_rsp_size >= OCC_POLL_DATA_MIN_SIZE)
{
- const occPollRspStruct_t *currentPollRsp =
- (occPollRspStruct_t *) poll_rsp;
- if (currentPollRsp->errorId != 0)
+ if (i_flushAllErrors)
{
- if (--elogCount > 0)
+ const occPollRspStruct_t *currentPollRsp =
+ (occPollRspStruct_t *) poll_rsp;
+ if (currentPollRsp->errorId != 0)
{
- // An error was returned, keep polling OCC
- continuePolling = true;
+ if (--elogCount > 0)
+ {
+ // An error was returned, keep polling OCC
+ continuePolling = true;
+ }
+ else
+ {
+ // Limit number of elogs retrieved so
+ // we do not get stuck in loop
+ TMGT_INF("sendOccPoll: OCC%d still has"
+ "more errors to report.",
+ iv_instance);
+ continuePolling = false;
+ }
}
else
{
- // Limit number of elogs retrieved so
- // we do not get stuck in loop
- TMGT_INF("sendOccPoll: OCC%d still has"
- "more errors to report.",
- iv_instance);
continuePolling = false;
}
}
- else
- {
- continuePolling = false;
- }
+ pollRspHandler(poll_rsp, poll_rsp_size);
+ }
+ else
+ {
+ TMGT_ERR("sendOccPoll: OCC%d poll command response "
+ "failed with invalid data length %d",
+ iv_instance, poll_rsp_size);
+ /*@
+ * @errortype
+ * @reasoncode HTMGT_RC_INVALID_LENGTH
+ * @moduleid HTMGT_MOD_OCC_POLL
+ * @userdata1 OCC instance
+ * @devdesc Invalid POLL response length
+ */
+ bldErrLog(err,
+ HTMGT_MOD_OCC_POLL,
+ HTMGT_RC_INVALID_LENGTH,
+ iv_instance, 0, 0, 0,
+ ERRORLOG::ERRL_SEV_INFORMATIONAL);
+
+ continuePolling = false;
}
- pollRspHandler(poll_rsp, poll_rsp_size);
}
- else
- {
- TMGT_ERR("sendOccPoll: OCC%d poll command response "
- "failed with invalid data length %d",
- iv_instance, poll_rsp_size);
- /*@
- * @errortype
- * @reasoncode HTMGT_RC_INVALID_LENGTH
- * @moduleid HTMGT_MOD_OCC_POLL
- * @userdata1 OCC instance
- * @devdesc Invalid POLL response length
- */
- bldErrLog(err,
- HTMGT_MOD_OCC_POLL,
- HTMGT_RC_INVALID_LENGTH,
- iv_instance, 0, 0, 0,
- ERRORLOG::ERRL_SEV_INFORMATIONAL);
-
- continuePolling = false;
- }
}
+ while (continuePolling);
}
- while (continuePolling);
return err;
}
OpenPOWER on IntegriCloud