summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Chris Cain <cjcain@us.ibm.com> 2015-08-05 14:46:40 -0500
committer: A. Patrick Williams III <iawillia@us.ibm.com> 2015-08-06 16:15:45 -0500
commit: 138464217cf3608fa529550e88742e386a5b1d57 (patch)
tree: 69a48579049e843ace5de1fd2ff36593a6bb074f /src
parent: 77fb3e2cb457a1fdd2ea08abf495b54278070af0 (diff)
download: talos-hostboot-138464217cf3608fa529550e88742e386a5b1d57.tar.gz
download: talos-hostboot-138464217cf3608fa529550e88742e386a5b1d57.zip
Fix multi-OCC reset at runtime
Change-Id: I8310a8f16eb19ee955961c3f8824d7aff348ee62
RTC: 132943
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/19586
Tested-by: Jenkins Server
Tested-by: Jenkins OP Build CI
Tested-by: Jenkins OP HW
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: SHELDON R. BAILEY <baileysh@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/usr/htmgt/htmgt.C           77
-rw-r--r--  src/usr/htmgt/htmgt_activate.C   7
-rw-r--r--  src/usr/htmgt/htmgt_occ.C       54
-rw-r--r--  src/usr/htmgt/htmgt_occ.H       11
4 files changed, 114 insertions, 35 deletions
diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C
index 990b9a05e..d92f7a315 100644
--- a/src/usr/htmgt/htmgt.C
+++ b/src/usr/htmgt/htmgt.C
@@ -57,6 +57,8 @@ namespace HTMGT
void processOccStartStatus(const bool i_startCompleted,
TARGETING::Target * i_failedOccTarget)
{
+ TMGT_INF(">>processOccStartStatus(%d,0x%p)",
+ i_startCompleted, i_failedOccTarget);
errlHndl_t l_err = NULL;
uint32_t l_huid = 0;
if (i_failedOccTarget)
@@ -184,7 +186,7 @@ namespace HTMGT
errlHndl_t err2 = OccManager::resetOccs(NULL);
if(err2)
{
- TMGT_ERR("OccManager:;resetOccs failed with 0x%04X",
+ TMGT_ERR("OccManager::resetOccs failed with 0x%04X",
err2->reasonCode());
// Set original error log as unrecoverable and commit
@@ -201,6 +203,7 @@ namespace HTMGT
ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
}
}
+ TMGT_INF("<<processOccStartStatus()");
} // end processOccStartStatus()
@@ -209,6 +212,8 @@ namespace HTMGT
// Notify HTMGT that an OCC has an error to report
void processOccError(TARGETING::Target * i_procTarget)
{
+ TMGT_INF(">>processOccError(0x%p)", i_procTarget);
+
TARGETING::Target* sys = NULL;
TARGETING::targetService().getTopLevelTarget(sys);
uint8_t safeMode = 0;
@@ -274,6 +279,7 @@ namespace HTMGT
TMGT_ERR("processOccError() called, but unable to find OCCs");
ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
}
+ TMGT_INF("<<processOccError()");
} // end processOccError()
@@ -282,6 +288,7 @@ namespace HTMGT
// Notify HTMGT that an OCC has failed and needs to be reset
void processOccReset(TARGETING::Target * i_proc)
{
+ TMGT_INF(">>processOccReset(0x%p)", i_proc);
errlHndl_t errl = NULL;
TARGETING::Target * failedOccTarget = NULL;
@@ -341,6 +348,7 @@ namespace HTMGT
{
ERRORLOG::errlCommit(errl, HTMGT_COMP_ID); // sets errl to NULL
}
+ TMGT_INF("<<processOccReset()");
} // end processOccReset()
@@ -348,32 +356,19 @@ namespace HTMGT
// Set the OCC state
errlHndl_t enableOccActuation(bool i_occActivation)
{
+ TMGT_INF(">>enableOccActuation(%c)", i_occActivation?'Y':'N');
errlHndl_t l_err = NULL;
TARGETING::Target* sys = NULL;
+ // If the system is already in safe mode then we can't talk to the OCCs
TARGETING::targetService().getTopLevelTarget(sys);
uint8_t safeMode = 0;
-
- // If the system is in safemode then can't talk to OCCs -
- // ignore call to enableOccActuation
- if(sys &&
- sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) &&
- safeMode)
+ if(sys)
{
- /*@
- * @errortype
- * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
- * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION
- * @userdata1[0:7] OCC activate [1==true][0==false]
- * @devdesc Invalid operation when OCCs are in safemode
- */
- bldErrLog(l_err,
- HTMGT_MOD_ENABLE_OCC_ACTUATION,
- HTMGT_RC_OCC_CRIT_FAILURE,
- i_occActivation, 0, 0, 1,
- ERRORLOG::ERRL_SEV_UNRECOVERABLE);
+ sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode);
}
- else
+
+ if (0 == safeMode)
{
occStateId targetState = OCC_STATE_ACTIVE;
if (false == i_occActivation)
@@ -391,17 +386,53 @@ namespace HTMGT
if (OccManager::occNeedsReset())
{
+ if (l_err)
+ {
+ // Commit setOccState elog since OCCs will be reset
+ // and recovery attempted.
+ ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID);
+ }
+
TMGT_ERR("enableOccActuation(): OCCs need to be reset");
// Don't pass failed target as OCC should have already
// been marked as failed during the poll.
- errlHndl_t err2 = OccManager::resetOccs(NULL);
- if(err2)
+ l_err = OccManager::resetOccs(NULL);
+
+ // NOTE: If the system exceeded its reset count and ended up
+ // in safe mode an error may not be returned here (if a
+ // failure happened after the first reset attempt).
+ // This is because the resets are recursive:
+ // HTMGT calls back into HBRT to initiate the reset, then
+ // HBRT calls into HTMGT when reset completed
+ // To detect this condition we need to check for safe mode
+ // after the recovery attempts and return error if in safe.
+ if(sys)
{
- ERRORLOG::errlCommit(err2, HTMGT_COMP_ID);
+ sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode);
}
}
}
+ if ((NULL == l_err) && safeMode)
+ {
+ // Create an elog so the user knows the cmd failed.
+ TMGT_ERR("enableOccActuation(): System is in safe mode");
+ /*@
+ * @errortype
+ * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE
+ * @moduleid HTMGT_MOD_ENABLE_OCC_ACTUATION
+ * @userdata1 OCC activate [1==true][0==false]
+ * @devdesc Operation not allowed, system is in safe mode
+ */
+ bldErrLog(l_err,
+ HTMGT_MOD_ENABLE_OCC_ACTUATION,
+ HTMGT_RC_OCC_CRIT_FAILURE,
+ 0, i_occActivation, 0, safeMode,
+ ERRORLOG::ERRL_SEV_UNRECOVERABLE);
+ }
+
+ TMGT_INF("<<enableOccActuation() returning 0x%04X",
+ (l_err==NULL) ? 0 : l_err->reasonCode());
return l_err;
} // end enableOccActuation()
diff --git a/src/usr/htmgt/htmgt_activate.C b/src/usr/htmgt/htmgt_activate.C
index 4cb46f033..54a3695f4 100644
--- a/src/usr/htmgt/htmgt_activate.C
+++ b/src/usr/htmgt/htmgt_activate.C
@@ -232,7 +232,7 @@ namespace HTMGT
}
else
{
- //The OCC knows it isn't activated by getting a value of 0.
+ //The OCC knows it isn't active by getting a value of 0.
limit = 0;
}
@@ -246,6 +246,11 @@ namespace HTMGT
TMGT_INF("sendOccUserPowerCap: Sending power cap %d to OCC %d",
limit, occ->getInstance());
+ if (limit > 0)
+ {
+ TMGT_CONSOLE("User power limit has been set to %dW",
+ limit);
+ }
OccCmd cmd(occ, OCC_CMD_SET_POWER_CAP, 2, data);
diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C
index 5534a8c58..7dd5c2d1a 100644
--- a/src/usr/htmgt/htmgt_occ.C
+++ b/src/usr/htmgt/htmgt_occ.C
@@ -385,6 +385,21 @@ namespace HTMGT
ERRORLOG::ERRL_SEV_UNRECOVERABLE);
}
}
+
+ if (NULL != iv_occMaster)
+ {
+ // update master occsPresent bit for each slave OCC
+ for(occList_t::const_iterator occ = iv_occArray.begin();
+ occ != iv_occArray.end();
+ ++occ)
+ {
+ if((*occ) != iv_occMaster)
+ {
+ iv_occMaster->
+ updateOccPresentBits((*occ)->getPresentBits());
+ }
+ }
+ }
} // for each processor
}
else
@@ -684,9 +699,9 @@ namespace HTMGT
}
}
- if(false == _occNeedsReset())
+ if (false == _occFailed())
{
- // No occ target needs reset - increment system reset count
+ // No OCC has been marked failed, increment system reset count
++iv_resetCount;
TMGT_INF("resetOCCs: Incrementing system OCC reset count to %d",
@@ -696,8 +711,8 @@ namespace HTMGT
{
atThreshold = true;
}
-
}
+ // else the failed OCC reset count will be incremented automatically
uint64_t retryCount = OCC_RESET_COUNT_THRESHOLD;
while(retryCount)
@@ -875,14 +890,6 @@ namespace HTMGT
TMGT_ERR("waitForOccCheckpoint OCC%d still NOT ready!",
(*pOcc)->getInstance());
}
-
- if ((OCC_ROLE_MASTER != (*pOcc)->getRole()) &&
- (NULL != iv_occMaster))
- {
- // update master occsPresent bit for each slave OCC
- iv_occMaster->
- updateOccPresentBits((*pOcc)->getPresentBits());
- }
}
}
#endif
@@ -920,6 +927,26 @@ namespace HTMGT
}
+ // Return true if any OCC has been marked as failed
+ bool OccManager::_occFailed()
+ {
+ bool failed = false;
+
+ for (std::vector<Occ*>::iterator pOcc = iv_occArray.begin();
+ pOcc < iv_occArray.end();
+ pOcc++)
+ {
+ if ((*pOcc)->iv_failed)
+ {
+ failed = true;
+ break;
+ }
+ }
+
+ return failed;
+ }
+
+
uint8_t OccManager::getNumOccs()
{
return Singleton<OccManager>::instance()._getNumOccs();
@@ -979,6 +1006,11 @@ namespace HTMGT
return Singleton<OccManager>::instance()._occNeedsReset();
}
+ bool OccManager::occFailed()
+ {
+ return Singleton<OccManager>::instance()._occFailed();
+ }
+
} // end namespace
diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H
index 4be28b35c..f91cfdb38 100644
--- a/src/usr/htmgt/htmgt_occ.H
+++ b/src/usr/htmgt/htmgt_occ.H
@@ -507,6 +507,14 @@ namespace HTMGT
void updateForSafeMode(errlHndl_t & io_err);
+ /**
+ * @brief Check if any OCC has failed
+ *
+ * @return true if any OCC has been marked as failed
+ */
+ static bool occFailed();
+
+
private:
typedef std::vector<Occ*> occList_t;
@@ -588,6 +596,9 @@ namespace HTMGT
/** See occNeedsReset() above */
bool _occNeedsReset();
+
+ /** See occFailed() above */
+ bool _occFailed();
};
typedef Singleton<OccManager> occMgr;
OpenPOWER on IntegriCloud