diff options
Diffstat (limited to 'src/usr/sbeio/common/sbe_retry_handler.C')
-rw-r--r-- | src/usr/sbeio/common/sbe_retry_handler.C | 1295 |
1 files changed, 733 insertions, 562 deletions
diff --git a/src/usr/sbeio/common/sbe_retry_handler.C b/src/usr/sbeio/common/sbe_retry_handler.C index e2889bf16..0af3eedb2 100644 --- a/src/usr/sbeio/common/sbe_retry_handler.C +++ b/src/usr/sbeio/common/sbe_retry_handler.C @@ -45,7 +45,6 @@ #include <initservice/initserviceif.H> #include <initservice/istepdispatcherif.H> #include <errl/errludtarget.H> -#include <sys/time.h> #include <util/misc.H> #include <ipmi/ipmiwatchdog.H> @@ -92,16 +91,16 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode, : iv_useSDB(false) , iv_secureModeDisabled(false) //Per HW team this should always be 0 -, iv_sbeRestarted(false) -, iv_sbeSide(0) -, iv_errorLogPLID(0) , iv_callerErrorLogPLID(i_plid) , iv_switchSidesCount(0) , iv_currentAction(P9_EXTRACT_SBE_RC::ERROR_RECOVERED) -, iv_currentSBEState(SBE_REG_RETURN::SBE_FAILED_TO_BOOT) -, iv_retriggeredMain(false) +, iv_currentSBEState(SBE_REG_RETURN::SBE_NOT_AT_RUNTIME) +, iv_shutdownReturnCode(0) +, iv_currentSideBootAttempts(1) // It is safe to assume that the current side has attempted to boot +, iv_ffdcSetAction(false) , iv_sbeMode(i_sbeMode) -, iv_sbeRestartMethod(SBE_RESTART_METHOD::START_CBS) +, iv_sbeRestartMethod(SBE_RESTART_METHOD::HRESET) +, iv_initialPowerOn(false) { SBE_TRACF(ENTER_MRK "SbeRetryHandler::SbeRetryHandler()"); @@ -111,209 +110,380 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode, SBE_TRACF(EXIT_MRK "SbeRetryHandler::SbeRetryHandler()"); } -SbeRetryHandler::~SbeRetryHandler() -{ - -} +SbeRetryHandler::~SbeRetryHandler() {} void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target ) { SBE_TRACF(ENTER_MRK "main_sbe_handler()"); - do { - errlHndl_t l_errl = NULL; + errlHndl_t l_errl = nullptr; + // Only set the secure debug bit (SDB) if we are not using xscom yet if(!i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>().useXscom) { this->iv_useSDB = true; } - const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget( - const_cast<TARGETING::Target*> (i_target)); + // Get the SBE status register, this will tell us what state + // the SBE is in , if the asynFFDC bit is set on the sbe_reg + // then FFDC will be collected at this point in time. + // sbe_run_extract_msg_reg will return true if there was an error reading the status + if(!this->sbe_run_extract_msg_reg(i_target)) + { + SBE_TRACF("main_sbe_handler(): Failed to get sbe register something is seriously wrong, we should always be able to read that!!"); + //Error log should have already committed in sbe_run_extract_msg_reg for this issue + break; + } + + // We will only trust the currState value if we know the SBE has just been booted. + // In this case we have been told by the caller that the sbe just powered on + // so it is safe to assume that the currState value is legit and we can trust that + // the sbe has booted successfully to runtime. + if( this->iv_initialPowerOn && (this->iv_sbeRegister.currState == SBE_STATE_RUNTIME)) + { + //We have successfully powered on the SBE + SBE_TRACF("main_sbe_handler(): Initial power on of the SBE was a success!!"); + break; + } - bool l_retry = false; + //////****************************************************************** + // If we have made it this far we can assume that something is wrong w/ the SBE + //////****************************************************************** - if(this->iv_sbeMode != INFORMATIONAL_ONLY) + // If something is wrong w/ the SBE during IPL time on a FSP based system then + // we will always TI and let hwsv deal with the problem. This is a unique path + // so we will have it handled in a separate procedure +#ifndef __HOSTBOOT_RUNTIME + if(INITSERVICE::spBaseServicesEnabled()) { - this->get_sbe_reg(i_target); + // This function will TI Hostboot so don't expect to return + handleFspIplTimeFail(i_target); + SBE_TRACF("main_sbe_handler(): We failed to TI the system when we should have, forcing an assert(0) call"); + // We should never return from handleFspIplTimeFail + assert(0, "We have determined that there was an error with the SBE and should have TI'ed but for some reason we did not."); + } +#endif - if( (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) && - !(this->iv_sbeMode == SBE_ACTION_SET)) - { - // return, false if no boot is needed, true if boot is needed. - l_retry = this->sbe_boot_fail_handler(i_target); - } - else if(this->iv_sbeMode == SBE_ACTION_SET) - { - l_retry = true; - } + // If iv_ffdcSetAction is true, that means that we found ffdc to parse + // this indicates that the SBE already determined what went wrong and + // reported the error via asyncFFDC so there is no need to + // run p9_extract_sbe_rc + // Also if the sbe is not booted at all, extract_rc will fail so we don't want to run it + if(!this->iv_ffdcSetAction && this->iv_sbeRegister.sbeBooted) + { + SBE_TRACF("main_sbe_handler(): No async ffdc found and sbe says it has been booted, running run p9_sbe_extract_rc."); + // Call the function that runs extract_rc, this needs to run to determine + // what broke and what our retry action should be + this->sbe_run_extract_rc(i_target); + } + // If we have determined that the sbe never booted + // then set the current action to be "restart sbe" + // that way we will attempt to start the sbe again + else if(!this->iv_sbeRegister.sbeBooted) + { + SBE_TRACF("main_sbe_handler(): SBE reports it was never booted, calling p9_sbe_extract_rc will fail. Setting action to be RESTART_SBE"); + //Maybe commit log here saying initial start_cbs didnt run + this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE; + } - while((this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) && - l_retry) - { + // If the mode was marked as informational that means the caller did not want + // any actions to take place, the caller only wanted information collected + if(this->iv_sbeMode == INFORMATIONAL_ONLY) + { + SBE_TRACF("main_sbe_handler(): Retry handler is being called in INFORMATIONAL mode so we are exiting without attempting any retry actions"); + break; + } - SBE_TRACF("main_sbe_handler(): current SBE state is %d, retry " - "is %d current SBE action is %d", - this->iv_sbeRegister.currState, - l_retry, this->iv_currentAction); + // This do-while loop will continuously look at iv_currentAction, act + // accordingly, then read status register and determine next action. + // The ideal way to exit the loop is if the SBE makes it up to runtime after + // attempting a retry which indicates we have recovered. If the currentAction + // says NO_RECOVERY_ACTION then we break out of this loop. Also if we fail + // to read the sbe's status register or if we get write fails when trying to switch + // seeprom sides. Both the fails mentioned last indicate there is a larger problem + do + { + // We need to handle the following values that currentAction could be, + // it is possible that iv_currentAction can be any of these values except there + // is currently no path that will set it to be ERROR_RECOVERED + // ERROR_RECOVERED = 0, + // - We should never hit this, if we have recovered then + // curreState should be RUNTIME + // RESTART_SBE = 1, + // RESTART_CBS = 2, + // - We will not listen to p9_extract_rc on HOW to restart the + // sbe. We will assume iv_sbeRestartMethod is correct and + // perform the restart method that iv_sbeRestartMethod says + // regardless if currentAction = RESTART_SBE or RESTART_CBS + // REIPL_BKP_SEEPROM = 3, + // REIPL_UPD_SEEPROM = 4, + // - We will switch the seeprom side (if we have not already) + // - then attempt to restart the sbe w/ iv_sbeRestartMethod + // NO_RECOVERY_ACTION = 5, + // - we deconfigure the processor we are retrying and fail out + // + // Important things to remember, we only want to attempt a single side + // a maxiumum of 2 times, and also we only want to switch sides once + + SBE_TRACF("main_sbe_handler(): iv_sbeRegister.currState: %d , " + "iv_currentSideBootAttempts: %d , " + "iv_currentAction: %d , ", + this->iv_sbeRegister.currState, + this->iv_currentSideBootAttempts, + this->iv_currentAction); + if(this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) + { + // There is no action possible. Gard and Callout the proc /*@ - * @errortype - * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @reasoncode SBEIO_EXTRACT_RC_ERROR - * @userdata1 HUID of proc that had the SBE timeout - * @userdata2 SBE failing code - * - * @devdesc SBE did not start, this function is looking at - * the error to determine next course of action - * - * @custdesc The SBE did not start, we will attempt a reboot - * if possible - */ + * @errortype ERRL_SEV_UNRECOVERABLE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_NO_RECOVERY_ACTION + * @userdata1 SBE current error + * @userdata2 HUID of proc + * @devdesc There is no recovery action on the SBE. + * We're deconfiguring this proc + * @custdesc Processor Error + */ l_errl = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_INFORMATIONAL, - SBEIO_EXTRACT_RC_HANDLER, - SBEIO_EXTRACT_RC_ERROR, - TARGETING::get_huid(i_target), - this->iv_currentAction); - - l_errl->collectTrace("ISTEPS_TRACE",256); + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + SBEIO_EXTRACT_RC_HANDLER, + SBEIO_NO_RECOVERY_ACTION, + P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION, + TARGETING::get_huid(i_target)); + l_errl->collectTrace( "ISTEPS_TRACE", 256); + l_errl->collectTrace( SBEIO_COMP_NAME, 256); + l_errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_NULL ); // Set the PLID of the error log to caller's PLID, // if provided if (iv_callerErrorLogPLID) { - l_errl->plid(iv_callerErrorLogPLID); + l_errl->plid(iv_callerErrorLogPLID); } - // Commit error and continue errlCommit(l_errl, ISTEP_COMP_ID); + this->iv_currentSBEState = SBE_REG_RETURN::PROC_DECONFIG; + SBE_TRACF("main_sbe_handler(): We have concluded there are no further recovery actions to take, deconfiguring proc and exiting handler"); + break; + } - // if no recovery action, fail out. - if(this->iv_currentAction == - P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) + // if the bkp_seeprom or upd_seeprom, attempt to switch sides. + // This is also dependent on the iv_switchSideCount. + // Note: we do this for upd_seeprom because we don't support + // updating the seeprom during IPL time + if((this->iv_currentAction == + P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || + this->iv_currentAction == + P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM)) + { + if(this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) { - // There is no action possible. Gard and Callout the proc /*@ - * @errortype ERRL_SEV_UNRECOVERABLE - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @reasoncode SBEIO_NO_RECOVERY_ACTION - * @userdata1 SBE current error - * @userdata2 HUID of proc - * @devdesc There is no recovery action on the SBE. - * We're garding this proc - */ + * @errortype ERRL_SEV_PREDICTIVE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_EXCEED_MAX_SIDE_SWITCHES + * @userdata1 Switch Sides Count + * @userdata2 HUID of proc + * @devdesc We have already flipped seeprom sides once + * and we should not have attempted to flip again + * @custdesc Processor Error + */ l_errl = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_UNRECOVERABLE, + ERRORLOG::ERRL_SEV_PREDICTIVE, SBEIO_EXTRACT_RC_HANDLER, - SBEIO_NO_RECOVERY_ACTION, - P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION, + SBEIO_EXCEED_MAX_SIDE_SWITCHES, + this->iv_switchSidesCount, TARGETING::get_huid(i_target)); - l_errl->collectTrace( "ISTEPS_TRACE", 256); - l_errl->addHwCallout( i_target, - HWAS::SRCI_PRIORITY_HIGH, - HWAS::DECONFIG, - HWAS::GARD_NULL ); - - // Cache PLID of error log - iv_errorLogPLID = l_errl->plid(); + l_errl->collectTrace( SBEIO_COMP_NAME, 256); // Set the PLID of the error log to caller's PLID, // if provided if (iv_callerErrorLogPLID) { - l_errl->plid(iv_callerErrorLogPLID); + l_errl->plid(iv_callerErrorLogPLID); } - + errlCommit(l_errl, SBEIO_COMP_ID); + // Break out of loop, something bad happened and we dont want end + // up in a endless loop + break; + } + l_errl = this->switch_sbe_sides(i_target); + if(l_errl) + { errlCommit(l_errl, ISTEP_COMP_ID); - - SBE_TRACF("main_sbe_handler(): updating return value " - "to indicate that we have deconfigured the proc"); - this->iv_currentSBEState = SBE_REG_RETURN::PROC_DECONFIG; - + // If any error occurs while we are trying to switch sides + // this indicates big problems so we want to break out of the + // retry loop break; } + // Note that we do not want to continue here because we want to + // attempt to restart using whatever sbeRestartMethod is set to after + // switching seeprom sides + } - // if the bkp_seeprom or upd_seeprom, attempt to switch sides. - // This is also dependent on the iv_switchSideCount. - if(this->iv_currentAction == - P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || - this->iv_currentAction == - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) + if(this->iv_currentSideBootAttempts >= MAX_SIDE_BOOT_ATTEMPTS) + { + /*@ + * @errortype ERRL_SEV_PREDICTIVE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_EXCEED_MAX_SIDE_BOOTS + * @userdata1 # of boots attempts on this side + * @userdata2 HUID of proc + * @devdesc We have already done the max attempts for + * the current seeprom side. For some reason + * we are attempting to do another boot. + * @custdesc Processor Error + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, + SBEIO_EXTRACT_RC_HANDLER, + SBEIO_EXCEED_MAX_SIDE_BOOTS, + this->iv_currentSideBootAttempts, + TARGETING::get_huid(i_target)); + + l_errl->collectTrace( SBEIO_COMP_NAME, 256); + + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - l_errl = this->switch_sbe_sides(i_target); - if(l_errl) - { - errlCommit(l_errl, ISTEP_COMP_ID); - break; - } + l_errl->plid(iv_callerErrorLogPLID); } + errlCommit(l_errl, SBEIO_COMP_ID); + // Break out of loop, something bad happened and we dont want end + // up in a endless loop + break; + } + // Look at the sbeRestartMethd instance variable to determine which method + // we will use to attempt the restart. In general during IPL time we will + // attempt CBS, during runtime we will want to use HRESET. + else if(this->iv_sbeRestartMethod == SBE_RESTART_METHOD::START_CBS) + { + SBE_TRACF("Invoking p9_start_cbs HWP on processor %.8X", get_huid(i_target)); + const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> + l_fapi2_proc_target (i_target); + + FAPI_INVOKE_HWP(l_errl, p9_start_cbs, + l_fapi2_proc_target, true); - // Attempt SBE restart - if(this->iv_sbeRestartMethod == SBE_RESTART_METHOD::START_CBS) + //Increment attempt count for this side + this->iv_currentSideBootAttempts++; + + if(l_errl) { - SBE_TRACF("Invoking p9_start_cbs HWP"); - const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> - l_fapi2_proc_target (i_target); + SBE_TRACF("ERROR: call p9_start_cbs, PLID=0x%x", + l_errl->plid() ); + l_errl->collectTrace( "ISTEPS_TRACE", 256 ); + l_errl->collectTrace( SBEIO_COMP_NAME, 256 ); + + // Gard the target, when SBE Retry fails + l_errl->addHwCallout(i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_Predictive); - FAPI_INVOKE_HWP(l_errl, p9_start_cbs, - l_fapi2_proc_target, true); - if(l_errl) + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - SBE_TRACF("ERROR: call p9_start_cbs, PLID=0x%x", - l_errl->plid() ); - l_errl->collectTrace( "ISTEPS_TRACE", 256 ); - - // Gard the target, when SBE Retry fails - l_errl->addHwCallout(i_target, - HWAS::SRCI_PRIORITY_HIGH, - HWAS::NO_DECONFIG, - HWAS::GARD_Predictive); - - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - - errlCommit( l_errl, ISTEP_COMP_ID); + l_errl->plid(iv_callerErrorLogPLID); } - }else - { - //@todo - RTC:180242 - Restart SBE + + errlCommit( l_errl, ISTEP_COMP_ID); + // If we got an errlog while attempting start_cbs + // we will assume that no future retry actions + // will work so we will break out of the retry loop + break; } + }else + { + //@todo RTC:180242 Right now we don't have the support + // to perform an hreset, when we do remove this error + // log and perform the hreset. + + //Increment attempt count for this side + this->iv_currentSideBootAttempts++; + /*@ + * @errortype + * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_UNSUPPORTED_REQUEST + * @userdata1 HUID of proc that had the SBE timeout + * @userdata2 SBE failing code + * + * @devdesc SBE did not start, this function is looking at + * the error to determine next course of action + * + * @custdesc The SBE did not start, we will attempt a reboot + * if possible + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + SBEIO_EXTRACT_RC_HANDLER, + SBEIO_UNSUPPORTED_REQUEST, + TARGETING::get_huid(i_target), + this->iv_currentAction); + + l_errl->collectTrace( SBEIO_COMP_NAME, 256 ); - // Get the sbe register - this->get_sbe_reg(i_target); + // Gard the proc, when SBE Retry fails + l_errl->addHwCallout(i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_Predictive); - if( (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME)) + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - // return, false if no boot is needed. - l_retry = this->sbe_boot_fail_handler(i_target); + l_errl->plid(iv_callerErrorLogPLID); } + + errlCommit(l_errl, ISTEP_COMP_ID); + + // If we got an errlog while attempting hreset + // we will assume that no future retry actions + // will work so we will exit + break; } - } - else - { - // In the informational only mode, we just need enough information - // to get the SBE RC returned from the HWP. We are running with - // the knowledge that the SBE has failed already. - // pass true to have log show up - this->sbe_boot_fail_handler(i_target, true); - this->iv_currentSBEState = SBE_FAILED_TO_BOOT; - } + // We have performed the action, so make sure that ffdcSetAction is set back to 0 + this->iv_ffdcSetAction = 0; - this->handle_sbe_reg_value(i_target); + // Get the sbe register (note that if asyncFFDC bit is set in status register then + // we will read it in this call) + if(!this->sbe_run_extract_msg_reg(i_target)) + { + // Error log should have already committed in sbe_run_extract_msg_reg for this issue + // we need to stop our recovery efforts and bail out of the retry handler + break; + } - // if we have started the sbe, and the current action is upd_seeprom - // or bkp_seeprom, note that we started on an unexpected side - if(i_target->getAttr<TARGETING::ATTR_SBE_IS_STARTED>() && - (this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || - this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) ) + // If our retry attempt fail, and we didnt see any asyncFFDC after + if (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) + { + // Again, if ffdcSetAction is set, that means we have found FFDC + // already that the SBE saved away prior to failing so we don't need + // to run extract_rc if ffdcSetAction is true + if(!this->iv_ffdcSetAction) + { + SBE_TRACF("main_sbe_handler(): Failed to reach runtime after sbe restart and no asyncFFDC found. Calling p9_sbe_extract_rc."); + // Run extract rc to figure out why the sbe did not make it to + // runtime state + this->sbe_run_extract_rc(i_target); + } + } + + } while((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME); + + // If we ended up switching sides we want to mark it down as + // as informational log + if(this->iv_switchSidesCount) { /*@ * @errortype ERRL_SEV_INFORMATIONAL @@ -329,6 +499,7 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target ) SBEIO_BOOTED_UNEXPECTED_SIDE, 0,TARGETING::get_huid(i_target)); l_errl->collectTrace("ISTEPS_TRACE",256); + l_errl->collectTrace(SBEIO_COMP_NAME,256); // Set the PLID of the error log to caller's PLID, // if provided @@ -345,212 +516,106 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target ) SBE_TRACF(EXIT_MRK "main_sbe_handler()"); } -void SbeRetryHandler::get_sbe_reg(TARGETING::Target * i_target) +bool SbeRetryHandler::sbe_run_extract_msg_reg(TARGETING::Target * i_target) { - SBE_TRACF(ENTER_MRK "get_sbe_reg()"); + SBE_TRACF(ENTER_MRK "sbe_run_extract_msg_reg()"); errlHndl_t l_errl = nullptr; - do + //Assume that reading the status succeeded + bool l_statusReadSuccess = true; + + // This function will poll the status register for 60 seconds + // waiting for the SBE to reach runtime + // we will exit the polling before 60 seconds if we either reach + // runtime, or get an error reading the status reg, or if the asyncFFDC + // bit is set + l_errl = this->sbe_poll_status_reg(i_target); + + // If there is no error getting the status register, and the SBE + // did not make it to runtime AND the asyncFFDC bit is set, we will + // use the FFDC to decide our actions rather than using p9_extract_sbe_rc + if((!l_errl) && + (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) && + this->iv_sbeRegister.asyncFFDC) { - l_errl = this->sbe_timeout_handler(i_target); - - if((!l_errl) && (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME)) - { - // See if async FFDC bit is set in SBE register - if(this->iv_sbeRegister.asyncFFDC) - { - bool l_flowCtrl = this->sbe_get_ffdc_handler(i_target); - - if(l_flowCtrl) - { - break; - } - } - } - else if (l_errl) - { - SBE_TRACF("ERROR: call get_sbe_reg, PLID=0x%x", l_errl->plid() ); - - // capture the target data in the elog - ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl ); + SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X . " + "There was asyncFFDC found though so we will run the FFDC parser", + TARGETING::get_huid(i_target)); + // The SBE has responded to an asyncronus request that hostboot + // made with FFDC indicating an error has occurred. + // This should be the path we hit when we are waiting to see + // if the sbe boots + this->sbe_get_ffdc_handler(i_target); + } + // If there was an error log that means that we failed to read the + // cfam register to get the SBE status, something is seriously wrong + // if we hit this + else if (l_errl) + { + l_statusReadSuccess = false; + SBE_TRACF("ERROR: call sbe_run_extract_msg_reg, PLID=0x%x", l_errl->plid() ); - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); - } - // No error and still functional - else if(i_target->getAttr<TARGETING::ATTR_HWAS_STATE>().functional) + l_errl->collectTrace(SBEIO_COMP_NAME,256); + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - // Set attribute indicating that SBE is started - i_target->setAttr<TARGETING::ATTR_SBE_IS_STARTED>(1); - this->iv_sbeRestarted = true; - - SBE_TRACF("SUCCESS: get_sbe_reg completed okay for proc 0x%.8X", - TARGETING::get_huid(i_target)); + l_errl->plid(iv_callerErrorLogPLID); } - //@TODO-RTC:100963 - this should match the logic in - //call_proc_check_slave_sbe_seeprom.C - } while(0); - SBE_TRACF(EXIT_MRK "get_sbe_reg()"); - -} - -void SbeRetryHandler::handle_sbe_reg_value(TARGETING::Target * i_target) -{ - errlHndl_t l_errl = NULL; - - SBE_TRACF(ENTER_MRK "handle_sbe_reg_value()"); - - const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> - l_fapi2_proc_target(i_target); + // capture the target data in the elog + ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl ); - switch(this->iv_currentSBEState) + // Commit error log + errlCommit( l_errl, HWPF_COMP_ID ); + } + // No error, able to read the sbe status register okay + // No guarantees that the SBE made it to runtime + else { - case SbeRetryHandler::SBE_REG_RETURN::HWP_ERROR: - { - SBE_TRACF("handle_sbe_reg_value(): case FUNCTION_ERROR"); - // There has been a failure getting the SBE register - // We cannot continue any further, return failure. - this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - break; - } - case SbeRetryHandler::SBE_REG_RETURN::SBE_AT_RUNTIME: - { - SBE_TRACF("handle_sbe_reg_value(): case SBE_AT_RUNTIME"); - // The SBE has successfully booted at runtime - this->iv_currentAction = P9_EXTRACT_SBE_RC::ERROR_RECOVERED; - break; - } - case SbeRetryHandler::SBE_REG_RETURN::SBE_FAILED_TO_BOOT: - { - SBE_TRACF("handle_sbe_reg_value(): case SBE_FAILED_TO_BOOT"); - if((this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) - && (!iv_retriggeredMain)) - - { - iv_retriggeredMain = true; - -#ifndef __HOSTBOOT_RUNTIME - // This could potentially take awhile, reset watchdog - INITSERVICE::sendProgressCode(); -#endif - SBE_TRACF("handle_sbe_reg_value(): Attempting " - "REIPL_UPD_SEEPROM failed. Recalling with BKP_SEEPROM"); - // If we were trying to reipl and hit the error, we need - // to start with a new seeprom before hitting the threshold - this->iv_currentAction = - P9_EXTRACT_SBE_RC::RETURN_ACTION::REIPL_BKP_SEEPROM; - this->iv_sbeMode = SBE_MODE_OF_OPERATION::SBE_ACTION_SET; - main_sbe_handler(i_target); - break; - } - - // Failed to boot, setting the final action for debugging. - SBE_TRACF("Inside handle_sbe_reg_value, calling " - "p9_extract_sbe_rc HWP"); - // Get SBE extract rc - P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction = - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; - FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, - l_fapi2_proc_target, l_rcAction); - this->iv_currentAction = l_rcAction; - - SBE_TRACF("handle_sbe_reg_value(): SBE failed to boot. Final " - "action is %llx", l_rcAction); - - if(l_errl) - { - SBE_TRACF("ERROR : p9_extract_sbe_rc HWP returning errorlog " - "PLID-0x%x", l_errl->plid()); - - // capture the target data in the elog - ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); + SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X", + TARGETING::get_huid(i_target)); + } - // Cache PLID of error log - iv_errorLogPLID = l_errl->plid(); + SBE_TRACF(EXIT_MRK "sbe_run_extract_msg_reg()"); - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } + return l_statusReadSuccess; - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); - } - - break; - } - default: - { - // This should never happened - // error out, unexpected enum value returned. - //return P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - /*@ - * @errortype ERRL_SEV_PREDICTIVE - * @moduleid SBEIO_HANDLE_SBE_REG_VALUE - * @reasoncode SBEIO_INCORRECT_FCN_CALL - * @userdata1 HUID of target - * @userdata2 SBE current state - * @devdesc This function was called incorrectly or - * there is a new enum that is not handled yet. - */ - l_errl = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_PREDICTIVE, - SBEIO_HANDLE_SBE_REG_VALUE, - SBEIO_INCORRECT_FCN_CALL, - get_huid(i_target),this->iv_currentSBEState); - l_errl->collectTrace("ISTEPS_TRACE",256); - - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - - errlCommit(l_errl, ISTEP_COMP_ID); - this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - break; - } - } - SBE_TRACF(EXIT_MRK "handle_sbe_reg_value()"); } -errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target) +errlHndl_t SbeRetryHandler::sbe_poll_status_reg(TARGETING::Target * i_target) { - SBE_TRACF(ENTER_MRK "sbe_timeout_handler()"); + SBE_TRACF(ENTER_MRK "sbe_poll_status_reg()"); - errlHndl_t l_errl = NULL; + errlHndl_t l_errl = nullptr; this->iv_currentSBEState = - SbeRetryHandler::SBE_REG_RETURN::SBE_FAILED_TO_BOOT; + SbeRetryHandler::SBE_REG_RETURN::SBE_NOT_AT_RUNTIME; const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2_proc_target(i_target); - // Each slave sbe gets 60s to respond with the fact that it's + // Each sbe gets 60s to respond with the fact that it's // booted and at runtime (stable state) - uint64_t SBE_TIMEOUT_NSEC = 60*NS_PER_SEC; //60 sec + uint64_t l_sbeTimeout = SBE_RETRY_TIMEOUT_HW; // 60 seconds // Bump this up really high for simics, things are slow there if( Util::isSimicsRunning() ) { - SBE_TIMEOUT_NSEC *= 10; + l_sbeTimeout = SBE_RETRY_TIMEOUT_SIMICS; // 600 seconds } - const uint64_t SBE_NUM_LOOPS = 100; - const uint64_t SBE_WAIT_SLEEP = (SBE_TIMEOUT_NSEC/SBE_NUM_LOOPS); + + const uint64_t SBE_WAIT_SLEEP = (l_sbeTimeout/SBE_RETRY_NUM_LOOPS); SBE_TRACF("Running p9_get_sbe_msg_register HWP on proc target %.8X", TARGETING::get_huid(i_target)); - for( uint64_t l_loops = 0; l_loops < SBE_NUM_LOOPS; l_loops++ ) + for( uint64_t l_loops = 0; l_loops < SBE_RETRY_NUM_LOOPS; l_loops++ ) { sbeMsgReg_t l_reg; FAPI_INVOKE_HWP(l_errl, p9_get_sbe_msg_register, l_fapi2_proc_target, l_reg); - this->iv_sbeRegister = l_reg; + this->iv_sbeRegister.reg = l_reg.reg; if (l_errl) { SBE_TRACF("ERROR : call p9_get_sbe_msg_register, PLID=0x%x, " @@ -558,7 +623,7 @@ errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target) l_errl->plid(), l_loops ); this->iv_currentSBEState = - SbeRetryHandler::SBE_REG_RETURN::HWP_ERROR; + SbeRetryHandler::SBE_REG_RETURN::FAILED_COLLECTING_REG; break; } else if ((this->iv_sbeRegister).currState == SBE_STATE_RUNTIME) @@ -591,46 +656,74 @@ errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target) (this->iv_sbeRegister).reg); } l_loops++; +#ifndef __HOSTBOOT_RUNTIME + // reset watchdog before performing the nanosleep + INITSERVICE::sendProgressCode(); +#endif nanosleep(0,SBE_WAIT_SLEEP); } } if ((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME) { - // Switch to using FSI SCOM + // Switch to using FSI SCOM if we are not using xscom TARGETING::ScomSwitches l_switches = i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>(); TARGETING::ScomSwitches l_switches_before = l_switches; - // Turn off SBE SCOM and turn on FSI SCOM. - l_switches.useFsiScom = 1; - l_switches.useSbeScom = 0; - - SBE_TRACF("sbe_timeout_handler: changing SCOM switches from 0x%.2X " - "to 0x%.2X for proc 0x%.8X", - l_switches_before, - l_switches, - TARGETING::get_huid(i_target)); - i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches); + if(!l_switches.useXscom) + { + // Turn off SBE SCOM and turn on FSI SCOM. + l_switches.useFsiScom = 1; + l_switches.useSbeScom = 0; + + SBE_TRACF("sbe_poll_status_reg: changing SCOM switches from 0x%.2X " + "to 0x%.2X for proc 0x%.8X", + l_switches_before, + l_switches, + TARGETING::get_huid(i_target)); + i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches); + } } - // Set the PLID of the error log to caller's PLID, - // if provided - if (l_errl && iv_callerErrorLogPLID) + SBE_TRACF(EXIT_MRK "sbe_poll_status_reg()"); + return l_errl; +} + +#ifndef __HOSTBOOT_RUNTIME +void SbeRetryHandler::handleFspIplTimeFail(TARGETING::Target * i_target) +{ + // If we found that there was async FFDC available we need to notify hwsv of this + // even if we did not find anything useful in the ffdc for us, its possible hwsv + // will be able to use it. + if ((this->iv_sbeRegister).asyncFFDC) { - l_errl->plid(iv_callerErrorLogPLID); + iv_shutdownReturnCode = SBEIO_HWSV_COLLECT_SBE_RC; } - - SBE_TRACF(EXIT_MRK "sbe_timeout_handler()"); - return l_errl; + // If the asyncFFDC bit is not set on the sbeRegister + // then we need to pass the DEAD_SBE RC to hwsv when we + // TI + else + { + this->iv_shutdownReturnCode = SBEIO_DEAD_SBE; + } + SBE_TRACF("handleFspIplTimeFail(): During IPL time on FSP system hostboot will TI so that HWSV can handle the error. " + "Shutting down w/ the error code %s" , + this->iv_sbeRegister.asyncFFDC ? "SBEIO_HWSV_COLLECT_SBE_RC" : "SBEIO_DEAD_SBE" ); + + // On FSP systems if we failed to recover the SBE then we should shutdown w/ the + // correct error so that HWSV will know what FFDC to collect + INITSERVICE::doShutdownWithError(this->iv_shutdownReturnCode, + TARGETING::get_huid(i_target)); } +#endif -P9_EXTRACT_SBE_RC::RETURN_ACTION SbeRetryHandler::action_for_ffdc_rc( +uint32_t SbeRetryHandler::action_for_ffdc_rc( uint32_t i_rc) { SBE_TRACF(ENTER_MRK "action_for_ffdc_rc()"); - P9_EXTRACT_SBE_RC::RETURN_ACTION l_action; + uint32_t l_action; switch(i_rc) { @@ -675,22 +768,22 @@ P9_EXTRACT_SBE_RC::RETURN_ACTION SbeRetryHandler::action_for_ffdc_rc( case fapi2::RC_EXTRACT_SBE_RC_BRANCH_TO_SEEPROM_FAIL: case fapi2::RC_EXTRACT_SBE_RC_UNEXPECTED_OTPROM_HALT: case fapi2::RC_EXTRACT_SBE_RC_OTP_ECC_ERR: - default: l_action = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; break; + default: + + l_action = NO_ACTION_FOUND_FOR_THIS_RC; } SBE_TRACF(EXIT_MRK "action_for_ffdc_rc()"); return l_action; } -bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) +void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) { SBE_TRACF(ENTER_MRK "sbe_get_ffdc_handler()"); - - bool l_flowCtrl = false; uint32_t l_responseSize = SbeFifoRespBuffer::MSG_BUFFER_SIZE; uint32_t *l_pFifoResponse = reinterpret_cast<uint32_t *>(malloc(l_responseSize)); @@ -715,12 +808,43 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) else { // Parse the FFDC package(s) in the response - SbeFFDCParser * l_ffdc_parser = - new SbeFFDCParser(); + auto l_ffdc_parser = std::make_shared<SbeFFDCParser>(); l_ffdc_parser->parseFFDCData(reinterpret_cast<void *>(l_pFifoResponse)); uint8_t l_pkgs = l_ffdc_parser->getTotalPackages(); - P9_EXTRACT_SBE_RC::RETURN_ACTION l_action; + + // Currently we expect a maxiumum of 2 FFDC packets. These packets would be + // a HWP FFDC packet which we will look at to determine what our retry action + // should be. The other type of packet we might see would be details on the + // internal SBE fail. For internal SBE fail packets we will just add the FFDC + // to the error log and move on. + // + // Note: If we exceed MAX_EXPECTED_FFDC_PACKAGES, commit an informational log. + // It shouldn't break anything but this could help us understand if something odd + // is happening + if(l_pkgs > MAX_EXPECTED_FFDC_PACKAGES) + { + /*@ + * @errortype + * @moduleid SBEIO_GET_FFDC_HANDLER + * @reasoncode SBEIO_MORE_FFDC_THAN_EXPECTED + * @userdata1 Maximum expected packages + * @userdata2 Number of FFDC packages + * @devdesc Unexpected number of FFDC packages in buffer + * @custdesc Extra FFDC gathered, marked information event + */ + l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL, + SBEIO_GET_FFDC_HANDLER, + SBEIO_MORE_FFDC_THAN_EXPECTED, + MAX_EXPECTED_FFDC_PACKAGES, + l_pkgs); + + l_errl->collectTrace( SBEIO_COMP_NAME, 256); + + // Also log the failing proc as FFDC + ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); + errlCommit(l_errl, SBEIO_COMP_ID); + } // If there are FFDC packages, make a log for FFDC from SBE if(l_pkgs > 0) @@ -742,35 +866,47 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) // Also log the failing proc as FFDC ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); - } - // Process each FFDC package - for(auto i=0; i<l_pkgs; i++) - { - // Add each package to the log - l_errl->addFFDC( SBEIO_COMP_ID, - l_ffdc_parser->getFFDCPackage(i), - l_ffdc_parser->getPackageLength(i), - 0, - SBEIO_UDT_PARAMETERS, - false ); - - // Get the RC from the FFDC package - uint32_t l_rc = l_ffdc_parser->getPackageRC(i); - - // Determine an action for the RC - l_action = action_for_ffdc_rc(l_rc); - - // Handle that action - this->iv_currentAction = l_action; - this->iv_retriggeredMain = true; - this->iv_sbeMode = SBE_MODE_OF_OPERATION::SBE_ACTION_SET; - main_sbe_handler(i_target); - } - // If there are FFDC packages, commit the log - if(l_pkgs > 0) - { + // Process each FFDC package + for(auto i=0; i<l_pkgs; i++) + { + // Add each package to the log + l_errl->addFFDC( SBEIO_COMP_ID, + l_ffdc_parser->getFFDCPackage(i), + l_ffdc_parser->getPackageLength(i), + 0, + SBEIO_UDT_PARAMETERS, + false ); + + // Get the RC from the FFDC package + uint32_t l_rc = l_ffdc_parser->getPackageRC(i); + + // Determine an action for the RC + P9_EXTRACT_SBE_RC::RETURN_ACTION l_action = + static_cast<P9_EXTRACT_SBE_RC::RETURN_ACTION>(action_for_ffdc_rc(l_rc)); + + if(l_action != NO_ACTION_FOUND_FOR_THIS_RC) + { + // Set the action associated with the RC that we found + this->iv_currentAction = l_action; + + // This call will look at what action_for_ffdc_rc had set the return action to + // checks on how many times we have attempted to boot this side, + // and if we have already tried switching sides + // + // + // Note this call is important, if this is not called we could end up in a + // endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS + this->bestEffortCheck(); + + // Set the instance variable ffdcSetAction to let us + // know that the current action was set from what we + // found in the asyncFFDC + this->iv_ffdcSetAction = true; + } + } + l_errl->collectTrace( SBEIO_COMP_NAME, KILOBYTE/4); l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4); @@ -783,11 +919,6 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) errlCommit(l_errl, ISTEP_COMP_ID); } - - delete l_ffdc_parser; - l_ffdc_parser = nullptr; - - l_flowCtrl = true; } #endif @@ -795,155 +926,60 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) l_pFifoResponse = nullptr; SBE_TRACF(EXIT_MRK "sbe_get_ffdc_handler()"); - return l_flowCtrl; } -//By default we want to call the 2 param version of the func w/ "true" -//passed in to tell the function we want to hide the mandatory errlog -bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target) -{ - return SbeRetryHandler::sbe_boot_fail_handler(i_target, false); -} -bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target, - bool i_exposeLog) +void SbeRetryHandler::sbe_run_extract_rc(TARGETING::Target * i_target) { - SBE_TRACF(ENTER_MRK "sbe_boot_fail_handler()"); + SBE_TRACF(ENTER_MRK "sbe_run_extract_rc()"); errlHndl_t l_errl = nullptr; fapi2::ReturnCode l_rc; - bool o_needRetry = false; - - SBE_TRACF("SBE 0x%.8X never started, sbeReg=0x%.8X", - TARGETING::get_huid(i_target),(this->iv_sbeRegister).reg ); - /*@ - * @errortype - * @reasoncode SBEIO_SLAVE_TIMEOUT - * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @userdata1 HUID of proc which had SBE timeout - * @userdata2 SBE MSG Register - * - * @devdesc Slave SBE did not get to ready state within - * allotted time - * - * @custdesc A processor in the system has failed to initialize - */ - l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL, - SBEIO_EXTRACT_RC_HANDLER, - SBEIO_SLAVE_TIMEOUT, - TARGETING::get_huid(i_target), - (this->iv_sbeRegister).reg); - - l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4); - - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - - if(i_exposeLog) - { - l_errl->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - - } - // Commit error and continue, this is not terminating since - // we can still at least boot with master proc - errlCommit(l_errl,ISTEP_COMP_ID); - - SBE_TRACF("Inside sbe_boot_fail_handler, calling p9_extract_sbe_rc HWP"); + SBE_TRACF("Inside sbe_run_extract_rc, calling p9_extract_sbe_rc HWP"); // Setup for the HWP const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget( const_cast<TARGETING::Target*> (i_target)); + // Default the return action to be NO_RECOVERY , if something goes + // wrong in p9_extract_sbe_rc and l_ret doesn't get set in that function + // then we want to fall back on NO_RECOVERY which we will handle + // accordingly in bestEffortCheck P9_EXTRACT_SBE_RC::RETURN_ACTION l_ret = - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; + P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - //Note that we are calling this while we are already inside - //of a FAPI_INVOKE_HWP call. This might cause issue w/ current_err - //but unsure how to get around it. + // TODO RTC: 190528 Force FAPI_INVOKE_HWP to call FAPI_EXEC_HWP when FAPI_INVOKE + // is blocked by mutex + // Note that it's possible we are calling this while we are already inside + // of a FAPI_INVOKE_HWP call. This might cause issue w/ current_err + // but unsure how to get around it. FAPI_EXEC_HWP(l_rc, p9_extract_sbe_rc, l_fapi2ProcTarget, l_ret, iv_useSDB, iv_secureModeDisabled); + // Convert the returnCode into an UNRECOVERABLE error log which we will + // associated w/ the caller's errlog via plid l_errl = rcToErrl(l_rc, ERRORLOG::ERRL_SEV_UNRECOVERABLE); this->iv_currentAction = l_ret; - if(this->iv_currentAction != P9_EXTRACT_SBE_RC::ERROR_RECOVERED) - { + // Set the instance variable ffdcSetAction to let us + // know that the current action was not set by what + // we found in asyncFFDC + this->iv_ffdcSetAction = false; - if(l_errl) - { - SBE_TRACF("p9_extract_sbe_rc HWP returned action %d and errorlog " - "PLID=0x%x, rc=0x%.4X", this->iv_currentAction, - l_errl->plid(), l_errl->reasonCode() ); - errlCommit(l_errl, SBEIO_COMP_ID); - } + // This call will look at what p9_extact_sbe_rc had set the return action to + // checks on how many times we have attempted to boot this side, + // and if we have already tried switching sides + // + // Note this call is important, if this is not called we could end up in a + // endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS + this->bestEffortCheck(); - SBE_TRACF("sbe_boot_fail_handler: We have hit an error in the SBE " - "and hostboot will now attempt to reboot the SBE"); - /*@ - * @errortype - * @severity ERRORLOG::ERRL_SEV_PREDICTIVE - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @reasoncode SBEIO_ATTEMPTING_REBOOT - * @userdata1 HUID of proc which had the SBE timeout - * @userdata2 Current action to be taken on the SBE - * @devdesc HWP has returned a reboot action to be taken - * Hostboot will now attempt to reboot the SBE - * @custdesc A processor in the system has failed to initialize. - * Hostboot is attempting a recovery. - */ - l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_PREDICTIVE, - SBEIO_EXTRACT_RC_HANDLER, - SBEIO_ATTEMPTING_REBOOT, - TARGETING::get_huid(i_target), - this->iv_currentAction); - l_errl->collectTrace("SBEIO_TRACE",KILOBYTE/4); - - // Set the PLID of the error log to caller's PLID if provided - if(iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - errlCommit(l_errl,SBEIO_COMP_ID); - - if(INITSERVICE::spBaseServicesEnabled()) - { #ifndef __HOSTBOOT_RUNTIME - // When we are on an FSP machine, we want to fail out of - // hostboot and give control back to the FSP. They have - // better diagnostics for this type of error. - INITSERVICE::doShutdownWithError(SBEIO_HWSV_COLLECT_SBE_RC, - TARGETING::get_huid(i_target)); + // This could potentially take awhile, reset watchdog + INITSERVICE::sendProgressCode(); #endif - } -#ifndef __HOSTBOOT_RUNTIME - // This could potentially take awhile, reset watchdog - INITSERVICE::sendProgressCode(); -#endif - SBE_TRACF("sbe_boot_fail_handler. iv_switchSides count is %llx", - iv_switchSidesCount); - if((this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) && - (iv_switchSidesCount < MAX_SWITCH_SIDE_COUNT)) - { - this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM; - o_needRetry = true; - } - else if(iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) - { - o_needRetry = false; - } - else - { - o_needRetry = true; - } - - } if(l_errl) { SBE_TRACF("Error: sbe_boot_fail_handler : p9_extract_sbe_rc HWP " @@ -964,84 +1000,219 @@ bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target, errlCommit( l_errl, HWPF_COMP_ID ); } - SBE_TRACF(EXIT_MRK "sbe_boot_fail_handler() current action is %llx", + SBE_TRACF(EXIT_MRK "sbe_run_extract_rc() current action is %llx", this->iv_currentAction); - return o_needRetry; +} + +void SbeRetryHandler::bestEffortCheck() +{ + // We don't want to accept that there is no recovery action just + // because that is what extract_rc is telling us. We want to make + // sure we have tried booting on this seeprom twice, and that we + // have tried the other seeprom twice as well. If we have tried all of + // those cases then we will fail out + if(this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) + { + if (this->iv_currentSideBootAttempts < MAX_SIDE_BOOT_ATTEMPTS) + { + SBE_TRACF("bestEffortCheck(): suggested action was NO_RECOVERY_ACTION but we are trying RESTART_SBE"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE; + } + else if (this->iv_switchSidesCount < MAX_SWITCH_SIDE_COUNT) + { + SBE_TRACF("bestEffortCheck(): suggested action was NO_RECOVERY_ACTION but we are trying REIPL_BKP_SEEPROM"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM; + } + else + { + // If we have attempted the max boot attempts on current side + // and have already switched sides once, then we will accept + // that we don't know how to recover and pass this status out + } + } + // If we have already switched sides, and extract rc is telling us to + // switch sides again, there is nothing we can do, so change currentAction + // to be NO_RECOVERY_ACTION + else if(this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || + this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM ) + { + if (this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) + { + SBE_TRACF("bestEffortCheck(): suggested action was REIPL_BKP_SEEPROM/REIPL_UPD_SEEPROM but that is not possible so changing to NO_RECOVERY_ACTION"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; + } + } + // If the extract sbe rc hwp tells us to restart, and we have already + // done 2 retries on this side, then attempt to switch sides, if we can't + // switch sides, set currentAction to NO_RECOVERY_ACTION + else if(this->iv_currentAction == P9_EXTRACT_SBE_RC::RESTART_SBE || + this->iv_currentAction == P9_EXTRACT_SBE_RC::RESTART_CBS) + { + if (this->iv_currentSideBootAttempts >= MAX_SIDE_BOOT_ATTEMPTS) + { + if (this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) + { + SBE_TRACF("bestEffortCheck(): suggested action was RESTART_SBE/RESTART_CBS but no actions possible so changing to NO_RECOVERY_ACTION"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; + } + else + { + SBE_TRACF("bestEffortCheck(): suggested action was RESTART_SBE/RESTART_CBS but max attempts tried already so changing to REIPL_BKP_SEEPROM"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM; + } + } + } } errlHndl_t SbeRetryHandler::switch_sbe_sides(TARGETING::Target * i_target) { SBE_TRACF(ENTER_MRK "switch_sbe_sides()"); - errlHndl_t l_errl = NULL; - const uint32_t l_sbeBootSelectMask = SBE::SBE_BOOT_SELECT_MASK >> 32; + errlHndl_t l_errl = nullptr; + TARGETING::ATTR_PROC_SBE_MASTER_CHIP_type l_isMaster = + i_target->getAttr<TARGETING::ATTR_PROC_SBE_MASTER_CHIP>(); + +#ifdef __HOSTBOOT_RUNTIME + const bool l_isRuntime = true; +#else + const bool l_isRuntime = false; +#endif do{ - // Read PERV_SB_CS_FSI_BYTE 0x2820 for target proc - uint32_t l_read_reg = 0; - size_t l_opSize = sizeof(uint32_t); - l_errl = DeviceFW::deviceOp( - DeviceFW::READ, - i_target, - &l_read_reg, - l_opSize, - DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); - if( l_errl ) + if(!l_isRuntime && !l_isMaster) { - SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device read " - "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " - "RC=0x%X, PLID=0x%lX", - PERV_SB_CS_FSI_BYTE, // 0x2820 - TARGETING::get_huid(i_target), - ERRL_GETRC_SAFE(l_errl), - ERRL_GETPLID_SAFE(l_errl)); - break; - } + const uint32_t l_sbeBootSelectMask = SBE::SBE_BOOT_SELECT_MASK >> 32; + // Read PERV_SB_CS_FSI_BYTE 0x2820 for target proc + uint32_t l_read_reg = 0; + size_t l_opSize = sizeof(uint32_t); + l_errl = DeviceFW::deviceOp( + DeviceFW::READ, + i_target, + &l_read_reg, + l_opSize, + DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); + + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device read " + "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_FSI_BYTE, // 0x2820 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } - // Determine how boot side is currently set - if(l_read_reg & l_sbeBootSelectMask) // Currently set for Boot Side 1 - { - // Set Boot Side 0 by clearing bit for side 1 - SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X", - iv_switchSidesCount, - TARGETING::get_huid(i_target)); - l_read_reg &= ~l_sbeBootSelectMask; - this->iv_sbeSide = 1; + // Determine how boot side is currently set + if(l_read_reg & l_sbeBootSelectMask) // Currently set for Boot Side 1 + { + // Set Boot Side 0 by clearing bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg &= ~l_sbeBootSelectMask; + } + else // Currently set for Boot Side 0 + { + // Set Boot Side 1 by setting bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg |= l_sbeBootSelectMask; + } + + // Write updated PERV_SB_CS_FSI 0x2820 back into target proc + l_errl = DeviceFW::deviceOp( + DeviceFW::WRITE, + i_target, + &l_read_reg, + l_opSize, + DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write " + "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_FSI_BYTE, // 0x2820 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } } - else // Currently set for Boot Side 0 + else { - // Set Boot Side 1 by setting bit for side 1 - SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X", - iv_switchSidesCount, - TARGETING::get_huid(i_target)); - l_read_reg |= l_sbeBootSelectMask; - this->iv_sbeSide = 0; + // Read PERV_SB_CS_SCOM 0x50008 for target proc + uint64_t l_read_reg = 0; + size_t l_opSize = sizeof(uint64_t); + l_errl = DeviceFW::deviceOp( + DeviceFW::READ, + i_target, + &l_read_reg, + l_opSize, + DEVICE_SCOM_ADDRESS(PERV_SB_CS_SCOM) ); + + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: SCOM device read " + "PERV_SB_CS_SCOM (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_SCOM, // 0x50008 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } + + // Determine how boot side is currently set + if(l_read_reg & SBE::SBE_BOOT_SELECT_MASK) // Currently set for Boot Side 1 + { + // Set Boot Side 0 by clearing bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg &= ~SBE::SBE_BOOT_SELECT_MASK; + } + else // Currently set for Boot Side 0 + { + // Set Boot Side 1 by setting bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg |= SBE::SBE_BOOT_SELECT_MASK; + } + + // Write updated PERV_SB_CS_SCOM 0x50008 back into target proc + l_errl = DeviceFW::deviceOp( + DeviceFW::WRITE, + i_target, + &l_read_reg, + l_opSize, + DEVICE_SCOM_ADDRESS(PERV_SB_CS_SCOM) ); + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write " + "PERV_SB_CS_SCOM (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_SCOM, // 0x50008 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } } - SBE_TRACF("switch_sbe_sides(): iv_switchSidesCount is %llx", - iv_switchSidesCount); // Increment switch sides count - ++iv_switchSidesCount; - - // Write updated PERV_SB_CS_FSI 0x2820 back into target proc - l_errl = DeviceFW::deviceOp( - DeviceFW::WRITE, - i_target, - &l_read_reg, - l_opSize, - DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); - if( l_errl ) - { - SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write " - "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " - "RC=0x%X, PLID=0x%lX", - PERV_SB_CS_FSI_BYTE, // 0x2820 - TARGETING::get_huid(i_target), - ERRL_GETRC_SAFE(l_errl), - ERRL_GETPLID_SAFE(l_errl)); - break; - } + ++(this->iv_switchSidesCount); + + SBE_TRACF("switch_sbe_sides(): iv_switchSidesCount has been incremented to %llx", + iv_switchSidesCount); + + // Since we just switched sides, and we havent attempted a boot yet, + // set the current attempts for this side to be 0 + this->iv_currentSideBootAttempts = 0; }while(0); // Set the PLID of the error log to caller's PLID, |