diff options
author | Chris Cain <cjcain@us.ibm.com> | 2015-08-05 14:46:40 -0500 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2015-08-06 16:15:45 -0500 |
commit | 138464217cf3608fa529550e88742e386a5b1d57 (patch) | |
tree | 69a48579049e843ace5de1fd2ff36593a6bb074f /src/usr/htmgt | |
parent | 77fb3e2cb457a1fdd2ea08abf495b54278070af0 (diff) | |
download | talos-hostboot-138464217cf3608fa529550e88742e386a5b1d57.tar.gz talos-hostboot-138464217cf3608fa529550e88742e386a5b1d57.zip |
Fix multi-OCC reset at runtime
Change-Id: I8310a8f16eb19ee955961c3f8824d7aff348ee62
RTC: 132943
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/19586
Tested-by: Jenkins Server
Tested-by: Jenkins OP Build CI
Tested-by: Jenkins OP HW
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: SHELDON R. BAILEY <baileysh@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src/usr/htmgt')
-rw-r--r-- | src/usr/htmgt/htmgt.C | 77 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_activate.C | 7 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_occ.C | 54 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_occ.H | 11 |
4 files changed, 114 insertions, 35 deletions
diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C index 990b9a05e..d92f7a315 100644 --- a/src/usr/htmgt/htmgt.C +++ b/src/usr/htmgt/htmgt.C @@ -57,6 +57,8 @@ namespace HTMGT void processOccStartStatus(const bool i_startCompleted, TARGETING::Target * i_failedOccTarget) { + TMGT_INF(">>processOccStartStatus(%d,0x%p)", + i_startCompleted, i_failedOccTarget); errlHndl_t l_err = NULL; uint32_t l_huid = 0; if (i_failedOccTarget) @@ -184,7 +186,7 @@ namespace HTMGT errlHndl_t err2 = OccManager::resetOccs(NULL); if(err2) { - TMGT_ERR("OccManager:;resetOccs failed with 0x%04X", + TMGT_ERR("OccManager::resetOccs failed with 0x%04X", err2->reasonCode()); // Set original error log as unrecoverable and commit @@ -201,6 +203,7 @@ namespace HTMGT ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } } + TMGT_INF("<<processOccStartStatus()"); } // end processOccStartStatus() @@ -209,6 +212,8 @@ namespace HTMGT // Notify HTMGT that an OCC has an error to report void processOccError(TARGETING::Target * i_procTarget) { + TMGT_INF(">>processOccError(0x%p)", i_procTarget); + TARGETING::Target* sys = NULL; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; @@ -274,6 +279,7 @@ namespace HTMGT TMGT_ERR("processOccError() called, but unable to find OCCs"); ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } + TMGT_INF("<<processOccError()"); } // end processOccError() @@ -282,6 +288,7 @@ namespace HTMGT // Notify HTMGT that an OCC has failed and needs to be reset void processOccReset(TARGETING::Target * i_proc) { + TMGT_INF(">>processOccReset(0x%p)", i_proc); errlHndl_t errl = NULL; TARGETING::Target * failedOccTarget = NULL; @@ -341,6 +348,7 @@ namespace HTMGT { ERRORLOG::errlCommit(errl, HTMGT_COMP_ID); // sets errl to NULL } + TMGT_INF("<<processOccReset()"); } // end processOccReset() @@ -348,32 +356,19 @@ namespace HTMGT // Set the OCC state errlHndl_t enableOccActuation(bool i_occActivation) { + TMGT_INF(">>enableOccActuation(%c)", i_occActivation?'Y':'N'); errlHndl_t 
l_err = NULL; TARGETING::Target* sys = NULL; + // If the system is already in safemode then can't talk to OCCs TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; - - // If the system is in safemode then can't talk to OCCs - - // ignore call to enableOccActuation - if(sys && - sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) && - safeMode) + if(sys) { - /*@ - * @errortype - * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE - * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION - * @userdata1[0:7] OCC activate [1==true][0==false] - * @devdesc Invalid operation when OCCs are in safemode - */ - bldErrLog(l_err, - HTMGT_MOD_ENABLE_OCC_ACTUATION, - HTMGT_RC_OCC_CRIT_FAILURE, - i_occActivation, 0, 0, 1, - ERRORLOG::ERRL_SEV_UNRECOVERABLE); + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); } - else + + if (0 == safeMode) { occStateId targetState = OCC_STATE_ACTIVE; if (false == i_occActivation) @@ -391,17 +386,53 @@ namespace HTMGT if (OccManager::occNeedsReset()) { + if (l_err) + { + // Commit setOccState elog since OCCs will be reset + // and recovery attempted. + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); + } + TMGT_ERR("enableOccActuation(): OCCs need to be reset"); // Don't pass failed target as OCC should have already // been marked as failed during the poll. - errlHndl_t err2 = OccManager::resetOccs(NULL); - if(err2) + l_err = OccManager::resetOccs(NULL); + + // NOTE: If the system exceeded its reset count and ended up + // in safe mode an error may not be returned here (if a + // failure happened after the first reset attempt). + // This is because the resets are recursive: + // HTMGT calls back into HBRT to initiate the reset, then + // HBRT calls into HTMGT when reset completed + // To detect this condition we need to check for safe mode + // after the recovery attempts and return error if in safe. 
+ if(sys) { - ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); + sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode); } } } + if ((NULL == l_err) && safeMode) + { + // Create an elog so the user knows the cmd failed. + TMGT_ERR("enableOccActuation(): System is in safe mode"); + /*@ + * @errortype + * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE + * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION + * @userdata1 OCC activate [1==true][0==false] + * @devdesc Operation not allowed, system is in safe mode + */ + bldErrLog(l_err, + HTMGT_MOD_ENABLE_OCC_ACTUATION, + HTMGT_RC_OCC_CRIT_FAILURE, + 0, i_occActivation, 0, safeMode, + ERRORLOG::ERRL_SEV_UNRECOVERABLE); + } + + TMGT_INF("<<enableOccActuation() returning 0x%04X", + (l_err==NULL) ? 0 : l_err->reasonCode()); return l_err; } // end enableOccActuation() diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C index 4cb46f033..54a3695f4 100644 --- a/src/usr/htmgt/htmgt_activate.C +++ b/src/usr/htmgt/htmgt_activate.C @@ -232,7 +232,7 @@ namespace HTMGT } else { - //The OCC knows it isn't activated by getting a value of 0. + //The OCC knows it isn't active by getting a value of 0. 
limit = 0; } @@ -246,6 +246,11 @@ namespace HTMGT TMGT_INF("sendOccUserPowerCap: Sending power cap %d to OCC %d", limit, occ->getInstance()); + if (limit > 0) + { + TMGT_CONSOLE("User power limit has been set to %dW", + limit); + } OccCmd cmd(occ, OCC_CMD_SET_POWER_CAP, 2, data); diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C index 5534a8c58..7dd5c2d1a 100644 --- a/src/usr/htmgt/htmgt_occ.C +++ b/src/usr/htmgt/htmgt_occ.C @@ -385,6 +385,21 @@ namespace HTMGT ERRORLOG::ERRL_SEV_UNRECOVERABLE); } } + + if (NULL != iv_occMaster) + { + // update master occsPresent bit for each slave OCC + for(occList_t::const_iterator occ = iv_occArray.begin(); + occ != iv_occArray.end(); + ++occ) + { + if((*occ) != iv_occMaster) + { + iv_occMaster-> + updateOccPresentBits((*occ)->getPresentBits()); + } + } + } } // for each processor } else @@ -684,9 +699,9 @@ namespace HTMGT } } - if(false == _occNeedsReset()) + if (false == _occFailed()) { - // No occ target needs reset - increment system reset count + // No OCC has been marked failed, increment system reset count ++iv_resetCount; TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d", @@ -696,8 +711,8 @@ namespace HTMGT { atThreshold = true; } - } + // else the failed OCC reset count will be incremented automatically uint64_t retryCount = OCC_RESET_COUNT_THRESHOLD; while(retryCount) @@ -875,14 +890,6 @@ namespace HTMGT TMGT_ERR("waitForOccCheckpoint OCC%d still NOT ready!", (*pOcc)->getInstance()); } - - if ((OCC_ROLE_MASTER != (*pOcc)->getRole()) && - (NULL != iv_occMaster)) - { - // update master occsPresent bit for each slave OCC - iv_occMaster-> - updateOccPresentBits((*pOcc)->getPresentBits()); - } } } #endif @@ -920,6 +927,26 @@ namespace HTMGT } + // Return true if any OCC has been marked as failed + bool OccManager::_occFailed() + { + bool failed = false; + + for (std::vector<Occ*>::iterator pOcc = iv_occArray.begin(); + pOcc < iv_occArray.end(); + pOcc++) + { + if ((*pOcc)->iv_failed) + { + 
failed = true; + break; + } + } + + return failed; + } + + uint8_t OccManager::getNumOccs() { return Singleton<OccManager>::instance()._getNumOccs(); @@ -979,6 +1006,11 @@ namespace HTMGT return Singleton<OccManager>::instance()._occNeedsReset(); } + bool OccManager::occFailed() + { + return Singleton<OccManager>::instance()._occFailed(); + } + } // end namespace diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H index 4be28b35c..f91cfdb38 100644 --- a/src/usr/htmgt/htmgt_occ.H +++ b/src/usr/htmgt/htmgt_occ.H @@ -507,6 +507,14 @@ namespace HTMGT void updateForSafeMode(errlHndl_t & io_err); + /** + * @brief Check if any OCC has failed + * + * @return true if any OCC has been marked as failed + */ + static bool occFailed(); + + private: typedef std::vector<Occ*> occList_t; @@ -588,6 +596,9 @@ namespace HTMGT /** See occNeedsReset() above */ bool _occNeedsReset(); + + /** See occFailed() above */ + bool _occFailed(); }; typedef Singleton<OccManager> occMgr; |