diff options
author | Doug Gilbert <dgilbert@us.ibm.com> | 2015-03-03 16:00:29 -0600 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2015-03-10 22:47:21 -0500 |
commit | 2a1d6e44f9dee4b665978be5e0eb0ef47a9e0be0 (patch) | |
tree | f8ecd1df6a7105b9aa6985f51585f65c677f37a8 /src | |
parent | 9f5ce5b51abea9ac2fc04fb231e03d619f942696 (diff) | |
download | talos-hostboot-2a1d6e44f9dee4b665978be5e0eb0ef47a9e0be0.tar.gz talos-hostboot-2a1d6e44f9dee4b665978be5e0eb0ef47a9e0be0.zip |
HTMGT add attempt to reset OCC when OCC Activate fails
Change-Id: I964d2b68216c3ddabae73ce3b851bbc468ec96a7
RTC: 123180
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/16145
Reviewed-by: Christopher Cain <cjcain@us.ibm.com>
Tested-by: Jenkins Server
Reviewed-by: Matt Spinler <spinler@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/include/usr/htmgt/htmgt_reasoncodes.H | 1 |
-rw-r--r-- | src/usr/htmgt/htmgt.C | 123 |
-rw-r--r-- | src/usr/htmgt/htmgt_activate.C | 9 |
-rw-r--r-- | src/usr/htmgt/htmgt_occ.C | 34 |
-rw-r--r-- | src/usr/htmgt/htmgt_occ.H | 1 |
5 files changed, 118 insertions, 50 deletions
diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H index ade192d13..6fe269de8 100644 --- a/src/include/usr/htmgt/htmgt_reasoncodes.H +++ b/src/include/usr/htmgt/htmgt_reasoncodes.H @@ -48,6 +48,7 @@ namespace HTMGT HTMGT_MOD_CHECK_OCC_RSP = 0x92, HTMGT_MOD_PARSE_OCC_RSP = 0x94, HTMGT_MOD_HANLDE_OCC_EXCEPTION = 0xE0, + HTMGT_MOD_ENABLE_OCC_ACTUATION = 0xE1, }; enum htmgtReasonCode diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C index aff25008a..a2f556f94 100644 --- a/src/usr/htmgt/htmgt.C +++ b/src/usr/htmgt/htmgt.C @@ -184,42 +184,30 @@ namespace HTMGT if (NULL != l_err) { - TMGT_ERR("OCCs not all active. System will stay in safe mode"); + TMGT_ERR("OCCs not all active. Attempting OCC Reset"); TMGT_CONSOLE("OCCs are not active (rc=0x%04X). " - "System will remain in safe mode", + "Attempting OCC Reset", l_err->reasonCode()); - TMGT_INF("Calling HBOCC::stopAllOCCs"); - errlHndl_t err2 = HBOCC::stopAllOCCs(); + TMGT_INF("Calling resetOccs"); + errlHndl_t err2 = OccManager::resetOccs(NULL); if(err2) { - TMGT_ERR("stopAllOCCs() failed with 0x%04X", + TMGT_ERR("OccManager:;resetOccs failed with 0x%04X", err2->reasonCode()); - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); - } - - // Update error log to unrecoverable and set SRC - // to indicate the system will remain in safe mode - /*@ - * @errortype - * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE - * @moduleid HTMGT_MOD_LOAD_START_STATUS - * @userdata1[0:7] load/start completed - * @devdesc OCCs did not all reach active state, - * system will be in Safe Mode - */ - bldErrLog(l_err, HTMGT_MOD_LOAD_START_STATUS, - HTMGT_RC_OCC_CRIT_FAILURE, - i_startCompleted, 0, 0, 1, - ERRORLOG::ERRL_SEV_UNRECOVERABLE); - // Add level 2 support callout - l_err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, - HWAS::SRCI_PRIORITY_MED); - // Add HB firmware callout - l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, - HWAS::SRCI_PRIORITY_MED); + // Set original error log as unrecoverable 
and commit + l_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); - ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); + // Commit occReset error + ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); + } + else + { + // retry worked - commit original error as informational + l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); + } } } // end processOccStartStatus() @@ -229,6 +217,19 @@ namespace HTMGT // Notify HTMGT that an OCC has an error to report void processOccError(TARGETING::Target * i_procTarget) { + TARGETING::Target* sys = NULL; + TARGETING::targetService().getTopLevelTarget(sys); + uint8_t safeMode = 0; + + // If the system is in safemode then can't talk to OCCs - + // ignore call to processOccError + if(sys && + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && + safeMode) + { + return; + } + bool polledOneOcc = false; OccManager::buildOccs(); @@ -347,29 +348,57 @@ namespace HTMGT // Set the OCC state errlHndl_t enableOccActuation(bool i_occActivation) { - occStateId targetState = OCC_STATE_ACTIVE; - if (false == i_occActivation) - { - targetState = OCC_STATE_OBSERVATION; - } + errlHndl_t l_err = NULL; + TARGETING::Target* sys = NULL; + + TARGETING::targetService().getTopLevelTarget(sys); + uint8_t safeMode = 0; - // Set state for all OCCs - errlHndl_t l_err = OccManager::setOccState(targetState); - if (NULL == l_err) + // If the system is in safemode then can't talk to OCCs - + // ignore call to enableOccActuation + if(sys && + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && + safeMode) { - TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", - targetState); + /*@ + * @errortype + * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE + * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION + * @userdata1[0:7] OCC activate [1==true][0==false] + * @devdesc Invalid operation when OCCs are in safemode + */ + bldErrLog(l_err, + HTMGT_MOD_ENABLE_OCC_ACTUATION, + 
HTMGT_RC_OCC_CRIT_FAILURE, + i_occActivation, 0, 0, 1, + ERRORLOG::ERRL_SEV_UNRECOVERABLE); } - - if (OccManager::occNeedsReset()) + else { - TMGT_ERR("enableOccActuation(): OCCs need to be reset"); - // Don't pass failed target as OCC should have already - // been marked as failed during the poll. - errlHndl_t err2 = OccManager::resetOccs(NULL); - if(err2) + occStateId targetState = OCC_STATE_ACTIVE; + if (false == i_occActivation) { - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); + targetState = OCC_STATE_OBSERVATION; + } + + // Set state for all OCCs + l_err = OccManager::setOccState(targetState); + if (NULL == l_err) + { + TMGT_INF("enableOccActuation: OCC states updated to 0x%02X", + targetState); + } + + if (OccManager::occNeedsReset()) + { + TMGT_ERR("enableOccActuation(): OCCs need to be reset"); + // Don't pass failed target as OCC should have already + // been marked as failed during the poll. + errlHndl_t err2 = OccManager::resetOccs(NULL); + if(err2) + { + ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); + } } } diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C index 7f54d6dbe..4cb46f033 100644 --- a/src/usr/htmgt/htmgt_activate.C +++ b/src/usr/htmgt/htmgt_activate.C @@ -39,6 +39,7 @@ #include <ipmi/ipmisensor.H> #include <sys/time.h> +#include <console/consoleif.H> using namespace TARGETING; @@ -163,6 +164,14 @@ namespace HTMGT l_err = occ->ipmiSensor(i_activate); if( l_err ) { + TMGT_ERR("setOccActiveSensors failed. (OCC%d state:%d)", + occ->getInstance(), + i_activate); + + TMGT_CONSOLE("setOccActiveSensors failed. 
(OCC%d state:%d)", + occ->getInstance(), + i_activate); + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } } diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C index 8a539f446..6bbc0ae57 100644 --- a/src/usr/htmgt/htmgt_occ.C +++ b/src/usr/htmgt/htmgt_occ.C @@ -248,7 +248,8 @@ namespace HTMGT OccManager::OccManager() :iv_occMaster(NULL), iv_state(OCC_STATE_UNKNOWN), - iv_targetState(OCC_STATE_ACTIVE) + iv_targetState(OCC_STATE_ACTIVE), + iv_resetCount(0) { } @@ -605,6 +606,21 @@ namespace HTMGT } } + if(false == _occNeedsReset()) + { + // No occ target needs reset - increment system reset count + ++iv_resetCount; + + TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d", + iv_resetCount); + + if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD) + { + atThreshold = true; + } + + } + uint64_t retryCount = OCC_RESET_COUNT_THRESHOLD; while(retryCount) { @@ -663,7 +679,7 @@ namespace HTMGT */ bldErrLog(err, HTMTG_MOD_OCC_RESET, - HTMGT_RC_OCC_RESET_THREHOLD, + HTMGT_RC_OCC_CRIT_FAILURE, 0, 0, 0, 0, ERRORLOG::ERRL_SEV_UNRECOVERABLE); } @@ -673,6 +689,13 @@ namespace HTMGT { err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); + // Add level 2 support callout + err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, + HWAS::SRCI_PRIORITY_MED); + // Add HB firmware callout + err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_MED); + TARGETING::Target* sys = NULL; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 1; @@ -683,8 +706,13 @@ namespace HTMGT sys->setAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); } - TMGT_ERR("_resetOccs: Safe Mode RC: 0x%04X (OCC%d)", + TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)", cv_safeReturnCode, cv_safeOccInstance); + + TMGT_CONSOLE("OCCs are not active. 
The system will remain in " + "safe mode (RC: 0x%04x for OCC%d)", + cv_safeReturnCode, + cv_safeOccInstance); } return err; diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H index dec19b883..5ac545ab7 100644 --- a/src/usr/htmgt/htmgt_occ.H +++ b/src/usr/htmgt/htmgt_occ.H @@ -507,6 +507,7 @@ namespace HTMGT occList_t iv_occArray; occStateId iv_state; occStateId iv_targetState; + uint8_t iv_resetCount; /** * @brief SRC that caused system to enter safe mode |