summary refs log tree commit diff stats
path: root/src/usr/sbeio/common/sbe_retry_handler.C
diff options
context:
space:
mode:
Diffstat (limited to 'src/usr/sbeio/common/sbe_retry_handler.C')
-rw-r--r-- src/usr/sbeio/common/sbe_retry_handler.C 1295
1 files changed, 733 insertions, 562 deletions
diff --git a/src/usr/sbeio/common/sbe_retry_handler.C b/src/usr/sbeio/common/sbe_retry_handler.C
index e2889bf16..0af3eedb2 100644
--- a/src/usr/sbeio/common/sbe_retry_handler.C
+++ b/src/usr/sbeio/common/sbe_retry_handler.C
@@ -45,7 +45,6 @@
#include <initservice/initserviceif.H>
#include <initservice/istepdispatcherif.H>
#include <errl/errludtarget.H>
-#include <sys/time.h>
#include <util/misc.H>
#include <ipmi/ipmiwatchdog.H>
@@ -92,16 +91,16 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode,
: iv_useSDB(false)
, iv_secureModeDisabled(false) //Per HW team this should always be 0
-, iv_sbeRestarted(false)
-, iv_sbeSide(0)
-, iv_errorLogPLID(0)
, iv_callerErrorLogPLID(i_plid)
, iv_switchSidesCount(0)
, iv_currentAction(P9_EXTRACT_SBE_RC::ERROR_RECOVERED)
-, iv_currentSBEState(SBE_REG_RETURN::SBE_FAILED_TO_BOOT)
-, iv_retriggeredMain(false)
+, iv_currentSBEState(SBE_REG_RETURN::SBE_NOT_AT_RUNTIME)
+, iv_shutdownReturnCode(0)
+, iv_currentSideBootAttempts(1) // It is safe to assume that the current side has attempted to boot
+, iv_ffdcSetAction(false)
, iv_sbeMode(i_sbeMode)
-, iv_sbeRestartMethod(SBE_RESTART_METHOD::START_CBS)
+, iv_sbeRestartMethod(SBE_RESTART_METHOD::HRESET)
+, iv_initialPowerOn(false)
{
SBE_TRACF(ENTER_MRK "SbeRetryHandler::SbeRetryHandler()");
@@ -111,209 +110,380 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode,
SBE_TRACF(EXIT_MRK "SbeRetryHandler::SbeRetryHandler()");
}
-SbeRetryHandler::~SbeRetryHandler()
-{
-
-}
+SbeRetryHandler::~SbeRetryHandler() {}
void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
{
SBE_TRACF(ENTER_MRK "main_sbe_handler()");
-
do
{
- errlHndl_t l_errl = NULL;
+ errlHndl_t l_errl = nullptr;
+ // Only set the secure debug bit (SDB) if we are not using xscom yet
if(!i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>().useXscom)
{
this->iv_useSDB = true;
}
- const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget(
- const_cast<TARGETING::Target*> (i_target));
+ // Get the SBE status register, this will tell us what state
+ // the SBE is in. If the asyncFFDC bit is set on the sbe_reg
+ // then FFDC will be collected at this point in time.
+ // sbe_run_extract_msg_reg will return false if there was an error reading the status
+ if(!this->sbe_run_extract_msg_reg(i_target))
+ {
+ SBE_TRACF("main_sbe_handler(): Failed to get sbe register something is seriously wrong, we should always be able to read that!!");
+ //Error log should have already committed in sbe_run_extract_msg_reg for this issue
+ break;
+ }
+
+ // We will only trust the currState value if we know the SBE has just been booted.
+ // In this case we have been told by the caller that the sbe just powered on
+ // so it is safe to assume that the currState value is legit and we can trust that
+ // the sbe has booted successfully to runtime.
+ if( this->iv_initialPowerOn && (this->iv_sbeRegister.currState == SBE_STATE_RUNTIME))
+ {
+ //We have successfully powered on the SBE
+ SBE_TRACF("main_sbe_handler(): Initial power on of the SBE was a success!!");
+ break;
+ }
- bool l_retry = false;
+ //////******************************************************************
+ // If we have made it this far we can assume that something is wrong w/ the SBE
+ //////******************************************************************
- if(this->iv_sbeMode != INFORMATIONAL_ONLY)
+ // If something is wrong w/ the SBE during IPL time on a FSP based system then
+ // we will always TI and let hwsv deal with the problem. This is a unique path
+ // so we will have it handled in a separate procedure
+#ifndef __HOSTBOOT_RUNTIME
+ if(INITSERVICE::spBaseServicesEnabled())
{
- this->get_sbe_reg(i_target);
+ // This function will TI Hostboot so don't expect to return
+ handleFspIplTimeFail(i_target);
+ SBE_TRACF("main_sbe_handler(): We failed to TI the system when we should have, forcing an assert(0) call");
+ // We should never return from handleFspIplTimeFail
+ assert(0, "We have determined that there was an error with the SBE and should have TI'ed but for some reason we did not.");
+ }
+#endif
- if( (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) &&
- !(this->iv_sbeMode == SBE_ACTION_SET))
- {
- // return, false if no boot is needed, true if boot is needed.
- l_retry = this->sbe_boot_fail_handler(i_target);
- }
- else if(this->iv_sbeMode == SBE_ACTION_SET)
- {
- l_retry = true;
- }
+ // If iv_ffdcSetAction is true, that means that we found ffdc to parse
+ // this indicates that the SBE already determined what went wrong and
+ // reported the error via asyncFFDC so there is no need to
+ // run p9_extract_sbe_rc
+ // Also if the sbe is not booted at all, extract_rc will fail so we don't want to run it
+ if(!this->iv_ffdcSetAction && this->iv_sbeRegister.sbeBooted)
+ {
+ SBE_TRACF("main_sbe_handler(): No async ffdc found and sbe says it has been booted, running run p9_sbe_extract_rc.");
+ // Call the function that runs extract_rc, this needs to run to determine
+ // what broke and what our retry action should be
+ this->sbe_run_extract_rc(i_target);
+ }
+ // If we have determined that the sbe never booted
+ // then set the current action to be "restart sbe"
+ // that way we will attempt to start the sbe again
+ else if(!this->iv_sbeRegister.sbeBooted)
+ {
+ SBE_TRACF("main_sbe_handler(): SBE reports it was never booted, calling p9_sbe_extract_rc will fail. Setting action to be RESTART_SBE");
+ //Maybe commit log here saying initial start_cbs didnt run
+ this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE;
+ }
- while((this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) &&
- l_retry)
- {
+ // If the mode was marked as informational that means the caller did not want
+ // any actions to take place, the caller only wanted information collected
+ if(this->iv_sbeMode == INFORMATIONAL_ONLY)
+ {
+ SBE_TRACF("main_sbe_handler(): Retry handler is being called in INFORMATIONAL mode so we are exiting without attempting any retry actions");
+ break;
+ }
- SBE_TRACF("main_sbe_handler(): current SBE state is %d, retry "
- "is %d current SBE action is %d",
- this->iv_sbeRegister.currState,
- l_retry, this->iv_currentAction);
+ // This do-while loop will continuously look at iv_currentAction, act
+ // accordingly, then read status register and determine next action.
+ // The ideal way to exit the loop is if the SBE makes it up to runtime after
+ // attempting a retry which indicates we have recovered. If the currentAction
+ // says NO_RECOVERY_ACTION then we break out of this loop. Also if we fail
+ // to read the sbe's status register or if we get write fails when trying to switch
+ // seeprom sides. Both the fails mentioned last indicate there is a larger problem
+ do
+ {
+ // We need to handle the following values that currentAction could be,
+ // it is possible that iv_currentAction can be any of these values except there
+ // is currently no path that will set it to be ERROR_RECOVERED
+ // ERROR_RECOVERED = 0,
+ // - We should never hit this, if we have recovered then
+ // currState should be RUNTIME
+ // RESTART_SBE = 1,
+ // RESTART_CBS = 2,
+ // - We will not listen to p9_extract_rc on HOW to restart the
+ // sbe. We will assume iv_sbeRestartMethod is correct and
+ // perform the restart method that iv_sbeRestartMethod says
+ // regardless if currentAction = RESTART_SBE or RESTART_CBS
+ // REIPL_BKP_SEEPROM = 3,
+ // REIPL_UPD_SEEPROM = 4,
+ // - We will switch the seeprom side (if we have not already)
+ // - then attempt to restart the sbe w/ iv_sbeRestartMethod
+ // NO_RECOVERY_ACTION = 5,
+ // - we deconfigure the processor we are retrying and fail out
+ //
+ // Important things to remember, we only want to attempt a single side
+ // a maximum of 2 times, and also we only want to switch sides once
+
+ SBE_TRACF("main_sbe_handler(): iv_sbeRegister.currState: %d , "
+ "iv_currentSideBootAttempts: %d , "
+ "iv_currentAction: %d , ",
+ this->iv_sbeRegister.currState,
+ this->iv_currentSideBootAttempts,
+ this->iv_currentAction);
+ if(this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION)
+ {
+ // There is no action possible. Gard and Callout the proc
/*@
- * @errortype
- * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL
- * @moduleid SBEIO_EXTRACT_RC_HANDLER
- * @reasoncode SBEIO_EXTRACT_RC_ERROR
- * @userdata1 HUID of proc that had the SBE timeout
- * @userdata2 SBE failing code
- *
- * @devdesc SBE did not start, this function is looking at
- * the error to determine next course of action
- *
- * @custdesc The SBE did not start, we will attempt a reboot
- * if possible
- */
+ * @errortype ERRL_SEV_UNRECOVERABLE
+ * @moduleid SBEIO_EXTRACT_RC_HANDLER
+ * @reasoncode SBEIO_NO_RECOVERY_ACTION
+ * @userdata1 SBE current error
+ * @userdata2 HUID of proc
+ * @devdesc There is no recovery action on the SBE.
+ * We're deconfiguring this proc
+ * @custdesc Processor Error
+ */
l_errl = new ERRORLOG::ErrlEntry(
- ERRORLOG::ERRL_SEV_INFORMATIONAL,
- SBEIO_EXTRACT_RC_HANDLER,
- SBEIO_EXTRACT_RC_ERROR,
- TARGETING::get_huid(i_target),
- this->iv_currentAction);
-
- l_errl->collectTrace("ISTEPS_TRACE",256);
+ ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+ SBEIO_EXTRACT_RC_HANDLER,
+ SBEIO_NO_RECOVERY_ACTION,
+ P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION,
+ TARGETING::get_huid(i_target));
+ l_errl->collectTrace( "ISTEPS_TRACE", 256);
+ l_errl->collectTrace( SBEIO_COMP_NAME, 256);
+ l_errl->addHwCallout( i_target,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::DECONFIG,
+ HWAS::GARD_NULL );
// Set the PLID of the error log to caller's PLID,
// if provided
if (iv_callerErrorLogPLID)
{
- l_errl->plid(iv_callerErrorLogPLID);
+ l_errl->plid(iv_callerErrorLogPLID);
}
- // Commit error and continue
errlCommit(l_errl, ISTEP_COMP_ID);
+ this->iv_currentSBEState = SBE_REG_RETURN::PROC_DECONFIG;
+ SBE_TRACF("main_sbe_handler(): We have concluded there are no further recovery actions to take, deconfiguring proc and exiting handler");
+ break;
+ }
- // if no recovery action, fail out.
- if(this->iv_currentAction ==
- P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION)
+ // if the bkp_seeprom or upd_seeprom, attempt to switch sides.
+ // This is also dependent on the iv_switchSideCount.
+ // Note: we do this for upd_seeprom because we don't support
+ // updating the seeprom during IPL time
+ if((this->iv_currentAction ==
+ P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM ||
+ this->iv_currentAction ==
+ P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM))
+ {
+ if(this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT)
{
- // There is no action possible. Gard and Callout the proc
/*@
- * @errortype ERRL_SEV_UNRECOVERABLE
- * @moduleid SBEIO_EXTRACT_RC_HANDLER
- * @reasoncode SBEIO_NO_RECOVERY_ACTION
- * @userdata1 SBE current error
- * @userdata2 HUID of proc
- * @devdesc There is no recovery action on the SBE.
- * We're garding this proc
- */
+ * @errortype ERRL_SEV_PREDICTIVE
+ * @moduleid SBEIO_EXTRACT_RC_HANDLER
+ * @reasoncode SBEIO_EXCEED_MAX_SIDE_SWITCHES
+ * @userdata1 Switch Sides Count
+ * @userdata2 HUID of proc
+ * @devdesc We have already flipped seeprom sides once
+ * and we should not have attempted to flip again
+ * @custdesc Processor Error
+ */
l_errl = new ERRORLOG::ErrlEntry(
- ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+ ERRORLOG::ERRL_SEV_PREDICTIVE,
SBEIO_EXTRACT_RC_HANDLER,
- SBEIO_NO_RECOVERY_ACTION,
- P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION,
+ SBEIO_EXCEED_MAX_SIDE_SWITCHES,
+ this->iv_switchSidesCount,
TARGETING::get_huid(i_target));
- l_errl->collectTrace( "ISTEPS_TRACE", 256);
- l_errl->addHwCallout( i_target,
- HWAS::SRCI_PRIORITY_HIGH,
- HWAS::DECONFIG,
- HWAS::GARD_NULL );
-
- // Cache PLID of error log
- iv_errorLogPLID = l_errl->plid();
+ l_errl->collectTrace( SBEIO_COMP_NAME, 256);
// Set the PLID of the error log to caller's PLID,
// if provided
if (iv_callerErrorLogPLID)
{
- l_errl->plid(iv_callerErrorLogPLID);
+ l_errl->plid(iv_callerErrorLogPLID);
}
-
+ errlCommit(l_errl, SBEIO_COMP_ID);
+ // Break out of loop, something bad happened and we don't want to end
+ // up in an endless loop
+ break;
+ }
+ l_errl = this->switch_sbe_sides(i_target);
+ if(l_errl)
+ {
errlCommit(l_errl, ISTEP_COMP_ID);
-
- SBE_TRACF("main_sbe_handler(): updating return value "
- "to indicate that we have deconfigured the proc");
- this->iv_currentSBEState = SBE_REG_RETURN::PROC_DECONFIG;
-
+ // If any error occurs while we are trying to switch sides
+ // this indicates big problems so we want to break out of the
+ // retry loop
break;
}
+ // Note that we do not want to continue here because we want to
+ // attempt to restart using whatever sbeRestartMethod is set to after
+ // switching seeprom sides
+ }
- // if the bkp_seeprom or upd_seeprom, attempt to switch sides.
- // This is also dependent on the iv_switchSideCount.
- if(this->iv_currentAction ==
- P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM ||
- this->iv_currentAction ==
- P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM)
+ if(this->iv_currentSideBootAttempts >= MAX_SIDE_BOOT_ATTEMPTS)
+ {
+ /*@
+ * @errortype ERRL_SEV_PREDICTIVE
+ * @moduleid SBEIO_EXTRACT_RC_HANDLER
+ * @reasoncode SBEIO_EXCEED_MAX_SIDE_BOOTS
+ * @userdata1 # of boots attempts on this side
+ * @userdata2 HUID of proc
+ * @devdesc We have already done the max attempts for
+ * the current seeprom side. For some reason
+ * we are attempting to do another boot.
+ * @custdesc Processor Error
+ */
+ l_errl = new ERRORLOG::ErrlEntry(
+ ERRORLOG::ERRL_SEV_PREDICTIVE,
+ SBEIO_EXTRACT_RC_HANDLER,
+ SBEIO_EXCEED_MAX_SIDE_BOOTS,
+ this->iv_currentSideBootAttempts,
+ TARGETING::get_huid(i_target));
+
+ l_errl->collectTrace( SBEIO_COMP_NAME, 256);
+
+ // Set the PLID of the error log to caller's PLID,
+ // if provided
+ if (iv_callerErrorLogPLID)
{
- l_errl = this->switch_sbe_sides(i_target);
- if(l_errl)
- {
- errlCommit(l_errl, ISTEP_COMP_ID);
- break;
- }
+ l_errl->plid(iv_callerErrorLogPLID);
}
+ errlCommit(l_errl, SBEIO_COMP_ID);
+ // Break out of loop, something bad happened and we don't want to end
+ // up in an endless loop
+ break;
+ }
+ // Look at the iv_sbeRestartMethod instance variable to determine which method
+ // we will use to attempt the restart. In general during IPL time we will
+ // attempt CBS, during runtime we will want to use HRESET.
+ else if(this->iv_sbeRestartMethod == SBE_RESTART_METHOD::START_CBS)
+ {
+ SBE_TRACF("Invoking p9_start_cbs HWP on processor %.8X", get_huid(i_target));
+ const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP>
+ l_fapi2_proc_target (i_target);
+
+ FAPI_INVOKE_HWP(l_errl, p9_start_cbs,
+ l_fapi2_proc_target, true);
- // Attempt SBE restart
- if(this->iv_sbeRestartMethod == SBE_RESTART_METHOD::START_CBS)
+ //Increment attempt count for this side
+ this->iv_currentSideBootAttempts++;
+
+ if(l_errl)
{
- SBE_TRACF("Invoking p9_start_cbs HWP");
- const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP>
- l_fapi2_proc_target (i_target);
+ SBE_TRACF("ERROR: call p9_start_cbs, PLID=0x%x",
+ l_errl->plid() );
+ l_errl->collectTrace( "ISTEPS_TRACE", 256 );
+ l_errl->collectTrace( SBEIO_COMP_NAME, 256 );
+
+ // Gard the target, when SBE Retry fails
+ l_errl->addHwCallout(i_target,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_Predictive);
- FAPI_INVOKE_HWP(l_errl, p9_start_cbs,
- l_fapi2_proc_target, true);
- if(l_errl)
+ // Set the PLID of the error log to caller's PLID,
+ // if provided
+ if (iv_callerErrorLogPLID)
{
- SBE_TRACF("ERROR: call p9_start_cbs, PLID=0x%x",
- l_errl->plid() );
- l_errl->collectTrace( "ISTEPS_TRACE", 256 );
-
- // Gard the target, when SBE Retry fails
- l_errl->addHwCallout(i_target,
- HWAS::SRCI_PRIORITY_HIGH,
- HWAS::NO_DECONFIG,
- HWAS::GARD_Predictive);
-
- // Set the PLID of the error log to caller's PLID,
- // if provided
- if (iv_callerErrorLogPLID)
- {
- l_errl->plid(iv_callerErrorLogPLID);
- }
-
- errlCommit( l_errl, ISTEP_COMP_ID);
+ l_errl->plid(iv_callerErrorLogPLID);
}
- }else
- {
- //@todo - RTC:180242 - Restart SBE
+
+ errlCommit( l_errl, ISTEP_COMP_ID);
+ // If we got an errlog while attempting start_cbs
+ // we will assume that no future retry actions
+ // will work so we will break out of the retry loop
+ break;
}
+ }else
+ {
+ //@todo RTC:180242 Right now we don't have the support
+ // to perform an hreset, when we do remove this error
+ // log and perform the hreset.
+
+ //Increment attempt count for this side
+ this->iv_currentSideBootAttempts++;
+ /*@
+ * @errortype
+ * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE
+ * @moduleid SBEIO_EXTRACT_RC_HANDLER
+ * @reasoncode SBEIO_UNSUPPORTED_REQUEST
+ * @userdata1 HUID of proc that had the SBE timeout
+ * @userdata2 SBE failing code
+ *
+ * @devdesc SBE did not start and the requested restart method
+ * (HRESET) is not supported yet, see RTC:180242
+ *
+ * @custdesc The SBE did not start, we will attempt a reboot
+ * if possible
+ */
+ l_errl = new ERRORLOG::ErrlEntry(
+ ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+ SBEIO_EXTRACT_RC_HANDLER,
+ SBEIO_UNSUPPORTED_REQUEST,
+ TARGETING::get_huid(i_target),
+ this->iv_currentAction);
+
+ l_errl->collectTrace( SBEIO_COMP_NAME, 256 );
- // Get the sbe register
- this->get_sbe_reg(i_target);
+ // Gard the proc, when SBE Retry fails
+ l_errl->addHwCallout(i_target,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_Predictive);
- if( (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME))
+ // Set the PLID of the error log to caller's PLID,
+ // if provided
+ if (iv_callerErrorLogPLID)
{
- // return, false if no boot is needed.
- l_retry = this->sbe_boot_fail_handler(i_target);
+ l_errl->plid(iv_callerErrorLogPLID);
}
+
+ errlCommit(l_errl, ISTEP_COMP_ID);
+
+ // If we got an errlog while attempting hreset
+ // we will assume that no future retry actions
+ // will work so we will exit
+ break;
}
- }
- else
- {
- // In the informational only mode, we just need enough information
- // to get the SBE RC returned from the HWP. We are running with
- // the knowledge that the SBE has failed already.
- // pass true to have log show up
- this->sbe_boot_fail_handler(i_target, true);
- this->iv_currentSBEState = SBE_FAILED_TO_BOOT;
- }
+ // We have performed the action, so make sure that ffdcSetAction is set back to 0
+ this->iv_ffdcSetAction = 0;
- this->handle_sbe_reg_value(i_target);
+ // Get the sbe register (note that if asyncFFDC bit is set in status register then
+ // we will read it in this call)
+ if(!this->sbe_run_extract_msg_reg(i_target))
+ {
+ // Error log should have already committed in sbe_run_extract_msg_reg for this issue
+ // we need to stop our recovery efforts and bail out of the retry handler
+ break;
+ }
- // if we have started the sbe, and the current action is upd_seeprom
- // or bkp_seeprom, note that we started on an unexpected side
- if(i_target->getAttr<TARGETING::ATTR_SBE_IS_STARTED>() &&
- (this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM ||
- this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) )
+ // If our retry attempt failed, and we didn't see any asyncFFDC afterwards, run extract_rc
+ if (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME)
+ {
+ // Again, if ffdcSetAction is set, that means we have found FFDC
+ // already that the SBE saved away prior to failing so we don't need
+ // to run extract_rc if ffdcSetAction is true
+ if(!this->iv_ffdcSetAction)
+ {
+ SBE_TRACF("main_sbe_handler(): Failed to reach runtime after sbe restart and no asyncFFDC found. Calling p9_sbe_extract_rc.");
+ // Run extract rc to figure out why the sbe did not make it to
+ // runtime state
+ this->sbe_run_extract_rc(i_target);
+ }
+ }
+
+ } while((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME);
+
+ // If we ended up switching sides we want to mark it down as
+ // an informational log
+ if(this->iv_switchSidesCount)
{
/*@
* @errortype ERRL_SEV_INFORMATIONAL
@@ -329,6 +499,7 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
SBEIO_BOOTED_UNEXPECTED_SIDE,
0,TARGETING::get_huid(i_target));
l_errl->collectTrace("ISTEPS_TRACE",256);
+ l_errl->collectTrace(SBEIO_COMP_NAME,256);
// Set the PLID of the error log to caller's PLID,
// if provided
@@ -345,212 +516,106 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
SBE_TRACF(EXIT_MRK "main_sbe_handler()");
}
-void SbeRetryHandler::get_sbe_reg(TARGETING::Target * i_target)
+bool SbeRetryHandler::sbe_run_extract_msg_reg(TARGETING::Target * i_target)
{
- SBE_TRACF(ENTER_MRK "get_sbe_reg()");
+ SBE_TRACF(ENTER_MRK "sbe_run_extract_msg_reg()");
errlHndl_t l_errl = nullptr;
- do
+ //Assume that reading the status succeeded
+ bool l_statusReadSuccess = true;
+
+ // This function will poll the status register for 60 seconds
+ // waiting for the SBE to reach runtime
+ // we will exit the polling before 60 seconds if we either reach
+ // runtime, or get an error reading the status reg, or if the asyncFFDC
+ // bit is set
+ l_errl = this->sbe_poll_status_reg(i_target);
+
+ // If there is no error getting the status register, and the SBE
+ // did not make it to runtime AND the asyncFFDC bit is set, we will
+ // use the FFDC to decide our actions rather than using p9_extract_sbe_rc
+ if((!l_errl) &&
+ (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) &&
+ this->iv_sbeRegister.asyncFFDC)
{
- l_errl = this->sbe_timeout_handler(i_target);
-
- if((!l_errl) && (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME))
- {
- // See if async FFDC bit is set in SBE register
- if(this->iv_sbeRegister.asyncFFDC)
- {
- bool l_flowCtrl = this->sbe_get_ffdc_handler(i_target);
-
- if(l_flowCtrl)
- {
- break;
- }
- }
- }
- else if (l_errl)
- {
- SBE_TRACF("ERROR: call get_sbe_reg, PLID=0x%x", l_errl->plid() );
-
- // capture the target data in the elog
- ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl );
+ SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X . "
+ "There was asyncFFDC found though so we will run the FFDC parser",
+ TARGETING::get_huid(i_target));
+ // The SBE has responded to an asynchronous request that hostboot
+ // made with FFDC indicating an error has occurred.
+ // This should be the path we hit when we are waiting to see
+ // if the sbe boots
+ this->sbe_get_ffdc_handler(i_target);
+ }
+ // If there was an error log that means that we failed to read the
+ // cfam register to get the SBE status, something is seriously wrong
+ // if we hit this
+ else if (l_errl)
+ {
+ l_statusReadSuccess = false;
+ SBE_TRACF("ERROR: call sbe_run_extract_msg_reg, PLID=0x%x", l_errl->plid() );
- // Commit error log
- errlCommit( l_errl, HWPF_COMP_ID );
- }
- // No error and still functional
- else if(i_target->getAttr<TARGETING::ATTR_HWAS_STATE>().functional)
+ l_errl->collectTrace(SBEIO_COMP_NAME,256);
+ // Set the PLID of the error log to caller's PLID,
+ // if provided
+ if (iv_callerErrorLogPLID)
{
- // Set attribute indicating that SBE is started
- i_target->setAttr<TARGETING::ATTR_SBE_IS_STARTED>(1);
- this->iv_sbeRestarted = true;
-
- SBE_TRACF("SUCCESS: get_sbe_reg completed okay for proc 0x%.8X",
- TARGETING::get_huid(i_target));
+ l_errl->plid(iv_callerErrorLogPLID);
}
- //@TODO-RTC:100963 - this should match the logic in
- //call_proc_check_slave_sbe_seeprom.C
- } while(0);
- SBE_TRACF(EXIT_MRK "get_sbe_reg()");
-
-}
-
-void SbeRetryHandler::handle_sbe_reg_value(TARGETING::Target * i_target)
-{
- errlHndl_t l_errl = NULL;
-
- SBE_TRACF(ENTER_MRK "handle_sbe_reg_value()");
-
- const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP>
- l_fapi2_proc_target(i_target);
+ // capture the target data in the elog
+ ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl );
- switch(this->iv_currentSBEState)
+ // Commit error log
+ errlCommit( l_errl, HWPF_COMP_ID );
+ }
+ // No error, able to read the sbe status register okay
+ // No guarantees that the SBE made it to runtime
+ else
{
- case SbeRetryHandler::SBE_REG_RETURN::HWP_ERROR:
- {
- SBE_TRACF("handle_sbe_reg_value(): case FUNCTION_ERROR");
- // There has been a failure getting the SBE register
- // We cannot continue any further, return failure.
- this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
- break;
- }
- case SbeRetryHandler::SBE_REG_RETURN::SBE_AT_RUNTIME:
- {
- SBE_TRACF("handle_sbe_reg_value(): case SBE_AT_RUNTIME");
- // The SBE has successfully booted at runtime
- this->iv_currentAction = P9_EXTRACT_SBE_RC::ERROR_RECOVERED;
- break;
- }
- case SbeRetryHandler::SBE_REG_RETURN::SBE_FAILED_TO_BOOT:
- {
- SBE_TRACF("handle_sbe_reg_value(): case SBE_FAILED_TO_BOOT");
- if((this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM)
- && (!iv_retriggeredMain))
-
- {
- iv_retriggeredMain = true;
-
-#ifndef __HOSTBOOT_RUNTIME
- // This could potentially take awhile, reset watchdog
- INITSERVICE::sendProgressCode();
-#endif
- SBE_TRACF("handle_sbe_reg_value(): Attempting "
- "REIPL_UPD_SEEPROM failed. Recalling with BKP_SEEPROM");
- // If we were trying to reipl and hit the error, we need
- // to start with a new seeprom before hitting the threshold
- this->iv_currentAction =
- P9_EXTRACT_SBE_RC::RETURN_ACTION::REIPL_BKP_SEEPROM;
- this->iv_sbeMode = SBE_MODE_OF_OPERATION::SBE_ACTION_SET;
- main_sbe_handler(i_target);
- break;
- }
-
- // Failed to boot, setting the final action for debugging.
- SBE_TRACF("Inside handle_sbe_reg_value, calling "
- "p9_extract_sbe_rc HWP");
- // Get SBE extract rc
- P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction =
- P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM;
- FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc,
- l_fapi2_proc_target, l_rcAction);
- this->iv_currentAction = l_rcAction;
-
- SBE_TRACF("handle_sbe_reg_value(): SBE failed to boot. Final "
- "action is %llx", l_rcAction);
-
- if(l_errl)
- {
- SBE_TRACF("ERROR : p9_extract_sbe_rc HWP returning errorlog "
- "PLID-0x%x", l_errl->plid());
-
- // capture the target data in the elog
- ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl);
+ SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X",
+ TARGETING::get_huid(i_target));
+ }
- // Cache PLID of error log
- iv_errorLogPLID = l_errl->plid();
+ SBE_TRACF(EXIT_MRK "sbe_run_extract_msg_reg()");
- // Set the PLID of the error log to caller's PLID,
- // if provided
- if (iv_callerErrorLogPLID)
- {
- l_errl->plid(iv_callerErrorLogPLID);
- }
+ return l_statusReadSuccess;
- // Commit error log
- errlCommit( l_errl, HWPF_COMP_ID );
- }
-
- break;
- }
- default:
- {
- // This should never happened
- // error out, unexpected enum value returned.
- //return P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
- /*@
- * @errortype ERRL_SEV_PREDICTIVE
- * @moduleid SBEIO_HANDLE_SBE_REG_VALUE
- * @reasoncode SBEIO_INCORRECT_FCN_CALL
- * @userdata1 HUID of target
- * @userdata2 SBE current state
- * @devdesc This function was called incorrectly or
- * there is a new enum that is not handled yet.
- */
- l_errl = new ERRORLOG::ErrlEntry(
- ERRORLOG::ERRL_SEV_PREDICTIVE,
- SBEIO_HANDLE_SBE_REG_VALUE,
- SBEIO_INCORRECT_FCN_CALL,
- get_huid(i_target),this->iv_currentSBEState);
- l_errl->collectTrace("ISTEPS_TRACE",256);
-
- // Set the PLID of the error log to caller's PLID,
- // if provided
- if (iv_callerErrorLogPLID)
- {
- l_errl->plid(iv_callerErrorLogPLID);
- }
-
- errlCommit(l_errl, ISTEP_COMP_ID);
- this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
- break;
- }
- }
- SBE_TRACF(EXIT_MRK "handle_sbe_reg_value()");
}
-errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target)
+errlHndl_t SbeRetryHandler::sbe_poll_status_reg(TARGETING::Target * i_target)
{
- SBE_TRACF(ENTER_MRK "sbe_timeout_handler()");
+ SBE_TRACF(ENTER_MRK "sbe_poll_status_reg()");
- errlHndl_t l_errl = NULL;
+ errlHndl_t l_errl = nullptr;
this->iv_currentSBEState =
- SbeRetryHandler::SBE_REG_RETURN::SBE_FAILED_TO_BOOT;
+ SbeRetryHandler::SBE_REG_RETURN::SBE_NOT_AT_RUNTIME;
const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP>
l_fapi2_proc_target(i_target);
- // Each slave sbe gets 60s to respond with the fact that it's
+ // Each sbe gets 60s to respond with the fact that it's
// booted and at runtime (stable state)
- uint64_t SBE_TIMEOUT_NSEC = 60*NS_PER_SEC; //60 sec
+ uint64_t l_sbeTimeout = SBE_RETRY_TIMEOUT_HW; // 60 seconds
// Bump this up really high for simics, things are slow there
if( Util::isSimicsRunning() )
{
- SBE_TIMEOUT_NSEC *= 10;
+ l_sbeTimeout = SBE_RETRY_TIMEOUT_SIMICS; // 600 seconds
}
- const uint64_t SBE_NUM_LOOPS = 100;
- const uint64_t SBE_WAIT_SLEEP = (SBE_TIMEOUT_NSEC/SBE_NUM_LOOPS);
+
+ const uint64_t SBE_WAIT_SLEEP = (l_sbeTimeout/SBE_RETRY_NUM_LOOPS);
SBE_TRACF("Running p9_get_sbe_msg_register HWP on proc target %.8X",
TARGETING::get_huid(i_target));
- for( uint64_t l_loops = 0; l_loops < SBE_NUM_LOOPS; l_loops++ )
+ for( uint64_t l_loops = 0; l_loops < SBE_RETRY_NUM_LOOPS; l_loops++ )
{
sbeMsgReg_t l_reg;
FAPI_INVOKE_HWP(l_errl, p9_get_sbe_msg_register,
l_fapi2_proc_target, l_reg);
- this->iv_sbeRegister = l_reg;
+ this->iv_sbeRegister.reg = l_reg.reg;
if (l_errl)
{
SBE_TRACF("ERROR : call p9_get_sbe_msg_register, PLID=0x%x, "
@@ -558,7 +623,7 @@ errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target)
l_errl->plid(),
l_loops );
this->iv_currentSBEState =
- SbeRetryHandler::SBE_REG_RETURN::HWP_ERROR;
+ SbeRetryHandler::SBE_REG_RETURN::FAILED_COLLECTING_REG;
break;
}
else if ((this->iv_sbeRegister).currState == SBE_STATE_RUNTIME)
@@ -591,46 +656,74 @@ errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target)
(this->iv_sbeRegister).reg);
}
l_loops++;
+#ifndef __HOSTBOOT_RUNTIME
+ // reset watchdog before performing the nanosleep
+ INITSERVICE::sendProgressCode();
+#endif
nanosleep(0,SBE_WAIT_SLEEP);
}
}
if ((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME)
{
- // Switch to using FSI SCOM
+ // Switch to using FSI SCOM if we are not using xscom
TARGETING::ScomSwitches l_switches =
i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>();
TARGETING::ScomSwitches l_switches_before = l_switches;
- // Turn off SBE SCOM and turn on FSI SCOM.
- l_switches.useFsiScom = 1;
- l_switches.useSbeScom = 0;
-
- SBE_TRACF("sbe_timeout_handler: changing SCOM switches from 0x%.2X "
- "to 0x%.2X for proc 0x%.8X",
- l_switches_before,
- l_switches,
- TARGETING::get_huid(i_target));
- i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches);
+ if(!l_switches.useXscom)
+ {
+ // Turn off SBE SCOM and turn on FSI SCOM.
+ l_switches.useFsiScom = 1;
+ l_switches.useSbeScom = 0;
+
+ SBE_TRACF("sbe_poll_status_reg: changing SCOM switches from 0x%.2X "
+ "to 0x%.2X for proc 0x%.8X",
+ l_switches_before,
+ l_switches,
+ TARGETING::get_huid(i_target));
+ i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches);
+ }
}
- // Set the PLID of the error log to caller's PLID,
- // if provided
- if (l_errl && iv_callerErrorLogPLID)
+ SBE_TRACF(EXIT_MRK "sbe_poll_status_reg()");
+ return l_errl;
+}
+
+#ifndef __HOSTBOOT_RUNTIME
+void SbeRetryHandler::handleFspIplTimeFail(TARGETING::Target * i_target)
+{
+ // If we found that there was async FFDC available we need to notify hwsv of this
+ // even if we did not find anything useful in the ffdc for us, its possible hwsv
+ // will be able to use it.
+ if ((this->iv_sbeRegister).asyncFFDC)
{
- l_errl->plid(iv_callerErrorLogPLID);
+ iv_shutdownReturnCode = SBEIO_HWSV_COLLECT_SBE_RC;
}
-
- SBE_TRACF(EXIT_MRK "sbe_timeout_handler()");
- return l_errl;
+ // If the asyncFFDC bit is not set on the sbeRegister
+ // then we need to pass the DEAD_SBE RC to hwsv when we
+ // TI
+ else
+ {
+ this->iv_shutdownReturnCode = SBEIO_DEAD_SBE;
+ }
+ SBE_TRACF("handleFspIplTimeFail(): During IPL time on FSP system hostboot will TI so that HWSV can handle the error. "
+ "Shutting down w/ the error code %s" ,
+ this->iv_sbeRegister.asyncFFDC ? "SBEIO_HWSV_COLLECT_SBE_RC" : "SBEIO_DEAD_SBE" );
+
+ // On FSP systems if we failed to recover the SBE then we should shutdown w/ the
+ // correct error so that HWSV will know what FFDC to collect
+ INITSERVICE::doShutdownWithError(this->iv_shutdownReturnCode,
+ TARGETING::get_huid(i_target));
}
+#endif
-P9_EXTRACT_SBE_RC::RETURN_ACTION SbeRetryHandler::action_for_ffdc_rc(
+uint32_t SbeRetryHandler::action_for_ffdc_rc(
uint32_t i_rc)
{
SBE_TRACF(ENTER_MRK "action_for_ffdc_rc()");
- P9_EXTRACT_SBE_RC::RETURN_ACTION l_action;
+ uint32_t l_action;
switch(i_rc)
{
@@ -675,22 +768,22 @@ P9_EXTRACT_SBE_RC::RETURN_ACTION SbeRetryHandler::action_for_ffdc_rc(
case fapi2::RC_EXTRACT_SBE_RC_BRANCH_TO_SEEPROM_FAIL:
case fapi2::RC_EXTRACT_SBE_RC_UNEXPECTED_OTPROM_HALT:
case fapi2::RC_EXTRACT_SBE_RC_OTP_ECC_ERR:
- default:
l_action = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
break;
+ default:
+
+ l_action = NO_ACTION_FOUND_FOR_THIS_RC;
}
SBE_TRACF(EXIT_MRK "action_for_ffdc_rc()");
return l_action;
}
-bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
+void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
{
SBE_TRACF(ENTER_MRK "sbe_get_ffdc_handler()");
-
- bool l_flowCtrl = false;
uint32_t l_responseSize = SbeFifoRespBuffer::MSG_BUFFER_SIZE;
uint32_t *l_pFifoResponse =
reinterpret_cast<uint32_t *>(malloc(l_responseSize));
@@ -715,12 +808,43 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
else
{
// Parse the FFDC package(s) in the response
- SbeFFDCParser * l_ffdc_parser =
- new SbeFFDCParser();
+ auto l_ffdc_parser = std::make_shared<SbeFFDCParser>();
l_ffdc_parser->parseFFDCData(reinterpret_cast<void *>(l_pFifoResponse));
uint8_t l_pkgs = l_ffdc_parser->getTotalPackages();
- P9_EXTRACT_SBE_RC::RETURN_ACTION l_action;
+
+        // Currently we expect a maximum of 2 FFDC packets. These packets would be
+ // a HWP FFDC packet which we will look at to determine what our retry action
+ // should be. The other type of packet we might see would be details on the
+ // internal SBE fail. For internal SBE fail packets we will just add the FFDC
+ // to the error log and move on.
+ //
+ // Note: If we exceed MAX_EXPECTED_FFDC_PACKAGES, commit an informational log.
+ // It shouldn't break anything but this could help us understand if something odd
+ // is happening
+ if(l_pkgs > MAX_EXPECTED_FFDC_PACKAGES)
+ {
+ /*@
+ * @errortype
+ * @moduleid SBEIO_GET_FFDC_HANDLER
+ * @reasoncode SBEIO_MORE_FFDC_THAN_EXPECTED
+ * @userdata1 Maximum expected packages
+ * @userdata2 Number of FFDC packages
+ * @devdesc Unexpected number of FFDC packages in buffer
+ * @custdesc Extra FFDC gathered, marked information event
+ */
+ l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL,
+ SBEIO_GET_FFDC_HANDLER,
+ SBEIO_MORE_FFDC_THAN_EXPECTED,
+ MAX_EXPECTED_FFDC_PACKAGES,
+ l_pkgs);
+
+ l_errl->collectTrace( SBEIO_COMP_NAME, 256);
+
+ // Also log the failing proc as FFDC
+ ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl);
+ errlCommit(l_errl, SBEIO_COMP_ID);
+ }
// If there are FFDC packages, make a log for FFDC from SBE
if(l_pkgs > 0)
@@ -742,35 +866,47 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
// Also log the failing proc as FFDC
ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl);
- }
- // Process each FFDC package
- for(auto i=0; i<l_pkgs; i++)
- {
- // Add each package to the log
- l_errl->addFFDC( SBEIO_COMP_ID,
- l_ffdc_parser->getFFDCPackage(i),
- l_ffdc_parser->getPackageLength(i),
- 0,
- SBEIO_UDT_PARAMETERS,
- false );
-
- // Get the RC from the FFDC package
- uint32_t l_rc = l_ffdc_parser->getPackageRC(i);
-
- // Determine an action for the RC
- l_action = action_for_ffdc_rc(l_rc);
-
- // Handle that action
- this->iv_currentAction = l_action;
- this->iv_retriggeredMain = true;
- this->iv_sbeMode = SBE_MODE_OF_OPERATION::SBE_ACTION_SET;
- main_sbe_handler(i_target);
- }
- // If there are FFDC packages, commit the log
- if(l_pkgs > 0)
- {
+ // Process each FFDC package
+ for(auto i=0; i<l_pkgs; i++)
+ {
+ // Add each package to the log
+ l_errl->addFFDC( SBEIO_COMP_ID,
+ l_ffdc_parser->getFFDCPackage(i),
+ l_ffdc_parser->getPackageLength(i),
+ 0,
+ SBEIO_UDT_PARAMETERS,
+ false );
+
+ // Get the RC from the FFDC package
+ uint32_t l_rc = l_ffdc_parser->getPackageRC(i);
+
+ // Determine an action for the RC
+ P9_EXTRACT_SBE_RC::RETURN_ACTION l_action =
+ static_cast<P9_EXTRACT_SBE_RC::RETURN_ACTION>(action_for_ffdc_rc(l_rc));
+
+ if(l_action != NO_ACTION_FOUND_FOR_THIS_RC)
+ {
+ // Set the action associated with the RC that we found
+ this->iv_currentAction = l_action;
+
+ // This call will look at what action_for_ffdc_rc had set the return action to
+ // checks on how many times we have attempted to boot this side,
+ // and if we have already tried switching sides
+ //
+ //
+ // Note this call is important, if this is not called we could end up in a
+ // endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS
+ this->bestEffortCheck();
+
+ // Set the instance variable ffdcSetAction to let us
+ // know that the current action was set from what we
+ // found in the asyncFFDC
+ this->iv_ffdcSetAction = true;
+ }
+ }
+
l_errl->collectTrace( SBEIO_COMP_NAME, KILOBYTE/4);
l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4);
@@ -783,11 +919,6 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
errlCommit(l_errl, ISTEP_COMP_ID);
}
-
- delete l_ffdc_parser;
- l_ffdc_parser = nullptr;
-
- l_flowCtrl = true;
}
#endif
@@ -795,155 +926,60 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
l_pFifoResponse = nullptr;
SBE_TRACF(EXIT_MRK "sbe_get_ffdc_handler()");
- return l_flowCtrl;
}
-//By default we want to call the 2 param version of the func w/ "true"
-//passed in to tell the function we want to hide the mandatory errlog
-bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target)
-{
- return SbeRetryHandler::sbe_boot_fail_handler(i_target, false);
-}
-bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target,
- bool i_exposeLog)
+void SbeRetryHandler::sbe_run_extract_rc(TARGETING::Target * i_target)
{
- SBE_TRACF(ENTER_MRK "sbe_boot_fail_handler()");
+ SBE_TRACF(ENTER_MRK "sbe_run_extract_rc()");
errlHndl_t l_errl = nullptr;
fapi2::ReturnCode l_rc;
- bool o_needRetry = false;
-
- SBE_TRACF("SBE 0x%.8X never started, sbeReg=0x%.8X",
- TARGETING::get_huid(i_target),(this->iv_sbeRegister).reg );
- /*@
- * @errortype
- * @reasoncode SBEIO_SLAVE_TIMEOUT
- * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL
- * @moduleid SBEIO_EXTRACT_RC_HANDLER
- * @userdata1 HUID of proc which had SBE timeout
- * @userdata2 SBE MSG Register
- *
- * @devdesc Slave SBE did not get to ready state within
- * allotted time
- *
- * @custdesc A processor in the system has failed to initialize
- */
- l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL,
- SBEIO_EXTRACT_RC_HANDLER,
- SBEIO_SLAVE_TIMEOUT,
- TARGETING::get_huid(i_target),
- (this->iv_sbeRegister).reg);
-
- l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4);
-
- // Set the PLID of the error log to caller's PLID,
- // if provided
- if (iv_callerErrorLogPLID)
- {
- l_errl->plid(iv_callerErrorLogPLID);
- }
-
- if(i_exposeLog)
- {
- l_errl->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
-
- }
- // Commit error and continue, this is not terminating since
- // we can still at least boot with master proc
- errlCommit(l_errl,ISTEP_COMP_ID);
-
- SBE_TRACF("Inside sbe_boot_fail_handler, calling p9_extract_sbe_rc HWP");
+ SBE_TRACF("Inside sbe_run_extract_rc, calling p9_extract_sbe_rc HWP");
// Setup for the HWP
const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget(
const_cast<TARGETING::Target*> (i_target));
+ // Default the return action to be NO_RECOVERY , if something goes
+ // wrong in p9_extract_sbe_rc and l_ret doesn't get set in that function
+ // then we want to fall back on NO_RECOVERY which we will handle
+ // accordingly in bestEffortCheck
P9_EXTRACT_SBE_RC::RETURN_ACTION l_ret =
- P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM;
+ P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
- //Note that we are calling this while we are already inside
- //of a FAPI_INVOKE_HWP call. This might cause issue w/ current_err
- //but unsure how to get around it.
+ // TODO RTC: 190528 Force FAPI_INVOKE_HWP to call FAPI_EXEC_HWP when FAPI_INVOKE
+ // is blocked by mutex
+ // Note that it's possible we are calling this while we are already inside
+ // of a FAPI_INVOKE_HWP call. This might cause issue w/ current_err
+ // but unsure how to get around it.
FAPI_EXEC_HWP(l_rc, p9_extract_sbe_rc, l_fapi2ProcTarget,
l_ret, iv_useSDB, iv_secureModeDisabled);
+ // Convert the returnCode into an UNRECOVERABLE error log which we will
+    // associate w/ the caller's errlog via plid
l_errl = rcToErrl(l_rc, ERRORLOG::ERRL_SEV_UNRECOVERABLE);
this->iv_currentAction = l_ret;
- if(this->iv_currentAction != P9_EXTRACT_SBE_RC::ERROR_RECOVERED)
- {
+ // Set the instance variable ffdcSetAction to let us
+ // know that the current action was not set by what
+ // we found in asyncFFDC
+ this->iv_ffdcSetAction = false;
- if(l_errl)
- {
- SBE_TRACF("p9_extract_sbe_rc HWP returned action %d and errorlog "
- "PLID=0x%x, rc=0x%.4X", this->iv_currentAction,
- l_errl->plid(), l_errl->reasonCode() );
- errlCommit(l_errl, SBEIO_COMP_ID);
- }
+    // This call will look at what p9_extract_sbe_rc had set the return action to
+ // checks on how many times we have attempted to boot this side,
+ // and if we have already tried switching sides
+ //
+ // Note this call is important, if this is not called we could end up in a
+ // endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS
+ this->bestEffortCheck();
- SBE_TRACF("sbe_boot_fail_handler: We have hit an error in the SBE "
- "and hostboot will now attempt to reboot the SBE");
- /*@
- * @errortype
- * @severity ERRORLOG::ERRL_SEV_PREDICTIVE
- * @moduleid SBEIO_EXTRACT_RC_HANDLER
- * @reasoncode SBEIO_ATTEMPTING_REBOOT
- * @userdata1 HUID of proc which had the SBE timeout
- * @userdata2 Current action to be taken on the SBE
- * @devdesc HWP has returned a reboot action to be taken
- * Hostboot will now attempt to reboot the SBE
- * @custdesc A processor in the system has failed to initialize.
- * Hostboot is attempting a recovery.
- */
- l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_PREDICTIVE,
- SBEIO_EXTRACT_RC_HANDLER,
- SBEIO_ATTEMPTING_REBOOT,
- TARGETING::get_huid(i_target),
- this->iv_currentAction);
- l_errl->collectTrace("SBEIO_TRACE",KILOBYTE/4);
-
- // Set the PLID of the error log to caller's PLID if provided
- if(iv_callerErrorLogPLID)
- {
- l_errl->plid(iv_callerErrorLogPLID);
- }
- errlCommit(l_errl,SBEIO_COMP_ID);
-
- if(INITSERVICE::spBaseServicesEnabled())
- {
#ifndef __HOSTBOOT_RUNTIME
- // When we are on an FSP machine, we want to fail out of
- // hostboot and give control back to the FSP. They have
- // better diagnostics for this type of error.
- INITSERVICE::doShutdownWithError(SBEIO_HWSV_COLLECT_SBE_RC,
- TARGETING::get_huid(i_target));
+ // This could potentially take awhile, reset watchdog
+ INITSERVICE::sendProgressCode();
#endif
- }
-#ifndef __HOSTBOOT_RUNTIME
- // This could potentially take awhile, reset watchdog
- INITSERVICE::sendProgressCode();
-#endif
- SBE_TRACF("sbe_boot_fail_handler. iv_switchSides count is %llx",
- iv_switchSidesCount);
- if((this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) &&
- (iv_switchSidesCount < MAX_SWITCH_SIDE_COUNT))
- {
- this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM;
- o_needRetry = true;
- }
- else if(iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT)
- {
- o_needRetry = false;
- }
- else
- {
- o_needRetry = true;
- }
-
- }
if(l_errl)
{
SBE_TRACF("Error: sbe_boot_fail_handler : p9_extract_sbe_rc HWP "
@@ -964,84 +1000,219 @@ bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target,
errlCommit( l_errl, HWPF_COMP_ID );
}
- SBE_TRACF(EXIT_MRK "sbe_boot_fail_handler() current action is %llx",
+ SBE_TRACF(EXIT_MRK "sbe_run_extract_rc() current action is %llx",
this->iv_currentAction);
- return o_needRetry;
+}
+
+void SbeRetryHandler::bestEffortCheck()
+{
+    // Adjusts iv_currentAction so that the suggested recovery action is only
+    // accepted when it is still achievable, and so that "no recovery" is only
+    // accepted once every retry avenue has been used up.
+    //
+    // Two budgets decide what is still possible:
+    //   - another boot attempt on the current side
+    //     (iv_currentSideBootAttempts vs MAX_SIDE_BOOT_ATTEMPTS)
+    //   - a switch to the other side
+    //     (iv_switchSidesCount vs MAX_SWITCH_SIDE_COUNT)
+    const bool l_canRetrySide =
+        (this->iv_currentSideBootAttempts < MAX_SIDE_BOOT_ATTEMPTS);
+    const bool l_canSwitchSides =
+        (this->iv_switchSidesCount < MAX_SWITCH_SIDE_COUNT);
+
+    switch(this->iv_currentAction)
+    {
+        // Do not take "no recovery" at face value: first use up the boot
+        // attempts on this side, then try the other side. Only when both
+        // budgets are exhausted is NO_RECOVERY_ACTION left in place.
+        case P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION:
+            if (l_canRetrySide)
+            {
+                SBE_TRACF("bestEffortCheck(): suggested action was NO_RECOVERY_ACTION but we are trying RESTART_SBE");
+                this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE;
+            }
+            else if (l_canSwitchSides)
+            {
+                SBE_TRACF("bestEffortCheck(): suggested action was NO_RECOVERY_ACTION but we are trying REIPL_BKP_SEEPROM");
+                this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM;
+            }
+            break;
+
+        // A side switch was requested; if the switch budget is already
+        // spent there is nothing left to do, so downgrade to no recovery.
+        case P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM:
+        case P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM:
+            if (!l_canSwitchSides)
+            {
+                SBE_TRACF("bestEffortCheck(): suggested action was REIPL_BKP_SEEPROM/REIPL_UPD_SEEPROM but that is not possible so changing to NO_RECOVERY_ACTION");
+                this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
+            }
+            break;
+
+        // A restart was requested; once this side has no boot attempts
+        // left, fall over to the other side if that is still allowed,
+        // otherwise give up with no recovery.
+        case P9_EXTRACT_SBE_RC::RESTART_SBE:
+        case P9_EXTRACT_SBE_RC::RESTART_CBS:
+            if (!l_canRetrySide)
+            {
+                if (l_canSwitchSides)
+                {
+                    SBE_TRACF("bestEffortCheck(): suggested action was RESTART_SBE/RESTART_CBS but max attempts tried already so changing to REIPL_BKP_SEEPROM");
+                    this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM;
+                }
+                else
+                {
+                    SBE_TRACF("bestEffortCheck(): suggested action was RESTART_SBE/RESTART_CBS but no actions possible so changing to NO_RECOVERY_ACTION");
+                    this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
+                }
+            }
+            break;
+
+        // Any other action is passed through untouched
+        default:
+            break;
+    }
 }
errlHndl_t SbeRetryHandler::switch_sbe_sides(TARGETING::Target * i_target)
{
SBE_TRACF(ENTER_MRK "switch_sbe_sides()");
- errlHndl_t l_errl = NULL;
- const uint32_t l_sbeBootSelectMask = SBE::SBE_BOOT_SELECT_MASK >> 32;
+ errlHndl_t l_errl = nullptr;
+ TARGETING::ATTR_PROC_SBE_MASTER_CHIP_type l_isMaster =
+ i_target->getAttr<TARGETING::ATTR_PROC_SBE_MASTER_CHIP>();
+
+#ifdef __HOSTBOOT_RUNTIME
+ const bool l_isRuntime = true;
+#else
+ const bool l_isRuntime = false;
+#endif
do{
- // Read PERV_SB_CS_FSI_BYTE 0x2820 for target proc
- uint32_t l_read_reg = 0;
- size_t l_opSize = sizeof(uint32_t);
- l_errl = DeviceFW::deviceOp(
- DeviceFW::READ,
- i_target,
- &l_read_reg,
- l_opSize,
- DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) );
- if( l_errl )
+ if(!l_isRuntime && !l_isMaster)
{
- SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device read "
- "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, "
- "RC=0x%X, PLID=0x%lX",
- PERV_SB_CS_FSI_BYTE, // 0x2820
- TARGETING::get_huid(i_target),
- ERRL_GETRC_SAFE(l_errl),
- ERRL_GETPLID_SAFE(l_errl));
- break;
- }
+ const uint32_t l_sbeBootSelectMask = SBE::SBE_BOOT_SELECT_MASK >> 32;
+ // Read PERV_SB_CS_FSI_BYTE 0x2820 for target proc
+ uint32_t l_read_reg = 0;
+ size_t l_opSize = sizeof(uint32_t);
+ l_errl = DeviceFW::deviceOp(
+ DeviceFW::READ,
+ i_target,
+ &l_read_reg,
+ l_opSize,
+ DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) );
+
+ if( l_errl )
+ {
+ SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device read "
+ "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, "
+ "RC=0x%X, PLID=0x%lX",
+ PERV_SB_CS_FSI_BYTE, // 0x2820
+ TARGETING::get_huid(i_target),
+ ERRL_GETRC_SAFE(l_errl),
+ ERRL_GETPLID_SAFE(l_errl));
+ break;
+ }
- // Determine how boot side is currently set
- if(l_read_reg & l_sbeBootSelectMask) // Currently set for Boot Side 1
- {
- // Set Boot Side 0 by clearing bit for side 1
- SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X",
- iv_switchSidesCount,
- TARGETING::get_huid(i_target));
- l_read_reg &= ~l_sbeBootSelectMask;
- this->iv_sbeSide = 1;
+ // Determine how boot side is currently set
+ if(l_read_reg & l_sbeBootSelectMask) // Currently set for Boot Side 1
+ {
+ // Set Boot Side 0 by clearing bit for side 1
+ SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X",
+ iv_switchSidesCount,
+ TARGETING::get_huid(i_target));
+ l_read_reg &= ~l_sbeBootSelectMask;
+ }
+ else // Currently set for Boot Side 0
+ {
+ // Set Boot Side 1 by setting bit for side 1
+ SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X",
+ iv_switchSidesCount,
+ TARGETING::get_huid(i_target));
+ l_read_reg |= l_sbeBootSelectMask;
+ }
+
+ // Write updated PERV_SB_CS_FSI 0x2820 back into target proc
+ l_errl = DeviceFW::deviceOp(
+ DeviceFW::WRITE,
+ i_target,
+ &l_read_reg,
+ l_opSize,
+ DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) );
+ if( l_errl )
+ {
+ SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write "
+ "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, "
+ "RC=0x%X, PLID=0x%lX",
+ PERV_SB_CS_FSI_BYTE, // 0x2820
+ TARGETING::get_huid(i_target),
+ ERRL_GETRC_SAFE(l_errl),
+ ERRL_GETPLID_SAFE(l_errl));
+ break;
+ }
}
- else // Currently set for Boot Side 0
+ else
{
- // Set Boot Side 1 by setting bit for side 1
- SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X",
- iv_switchSidesCount,
- TARGETING::get_huid(i_target));
- l_read_reg |= l_sbeBootSelectMask;
- this->iv_sbeSide = 0;
+ // Read PERV_SB_CS_SCOM 0x50008 for target proc
+ uint64_t l_read_reg = 0;
+ size_t l_opSize = sizeof(uint64_t);
+ l_errl = DeviceFW::deviceOp(
+ DeviceFW::READ,
+ i_target,
+ &l_read_reg,
+ l_opSize,
+ DEVICE_SCOM_ADDRESS(PERV_SB_CS_SCOM) );
+
+ if( l_errl )
+ {
+ SBE_TRACF( ERR_MRK"switch_sbe_sides: SCOM device read "
+ "PERV_SB_CS_SCOM (0x%.4X), proc target = %.8X, "
+ "RC=0x%X, PLID=0x%lX",
+ PERV_SB_CS_SCOM, // 0x50008
+ TARGETING::get_huid(i_target),
+ ERRL_GETRC_SAFE(l_errl),
+ ERRL_GETPLID_SAFE(l_errl));
+ break;
+ }
+
+ // Determine how boot side is currently set
+ if(l_read_reg & SBE::SBE_BOOT_SELECT_MASK) // Currently set for Boot Side 1
+ {
+ // Set Boot Side 0 by clearing bit for side 1
+ SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X",
+ iv_switchSidesCount,
+ TARGETING::get_huid(i_target));
+ l_read_reg &= ~SBE::SBE_BOOT_SELECT_MASK;
+ }
+ else // Currently set for Boot Side 0
+ {
+ // Set Boot Side 1 by setting bit for side 1
+ SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X",
+ iv_switchSidesCount,
+ TARGETING::get_huid(i_target));
+ l_read_reg |= SBE::SBE_BOOT_SELECT_MASK;
+ }
+
+ // Write updated PERV_SB_CS_SCOM 0x50008 back into target proc
+ l_errl = DeviceFW::deviceOp(
+ DeviceFW::WRITE,
+ i_target,
+ &l_read_reg,
+ l_opSize,
+ DEVICE_SCOM_ADDRESS(PERV_SB_CS_SCOM) );
+ if( l_errl )
+ {
+ SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write "
+ "PERV_SB_CS_SCOM (0x%.4X), proc target = %.8X, "
+ "RC=0x%X, PLID=0x%lX",
+ PERV_SB_CS_SCOM, // 0x50008
+ TARGETING::get_huid(i_target),
+ ERRL_GETRC_SAFE(l_errl),
+ ERRL_GETPLID_SAFE(l_errl));
+ break;
+ }
}
- SBE_TRACF("switch_sbe_sides(): iv_switchSidesCount is %llx",
- iv_switchSidesCount);
// Increment switch sides count
- ++iv_switchSidesCount;
-
- // Write updated PERV_SB_CS_FSI 0x2820 back into target proc
- l_errl = DeviceFW::deviceOp(
- DeviceFW::WRITE,
- i_target,
- &l_read_reg,
- l_opSize,
- DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) );
- if( l_errl )
- {
- SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write "
- "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, "
- "RC=0x%X, PLID=0x%lX",
- PERV_SB_CS_FSI_BYTE, // 0x2820
- TARGETING::get_huid(i_target),
- ERRL_GETRC_SAFE(l_errl),
- ERRL_GETPLID_SAFE(l_errl));
- break;
- }
+ ++(this->iv_switchSidesCount);
+
+ SBE_TRACF("switch_sbe_sides(): iv_switchSidesCount has been incremented to %llx",
+ iv_switchSidesCount);
+
+        // Since we just switched sides, and we haven't attempted a boot yet,
+ // set the current attempts for this side to be 0
+ this->iv_currentSideBootAttempts = 0;
}while(0);
// Set the PLID of the error log to caller's PLID,
OpenPOWER on IntegriCloud