diff options
| author | Christian Geddes <crgeddes@us.ibm.com> | 2018-03-14 18:26:16 -0500 |
|---|---|---|
| committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2018-04-06 10:15:43 -0400 |
| commit | ba8c8bfc02ca3d42a3caf3f8f797df07487c1dab (patch) | |
| tree | b0d8d1a4797b787a3b1194f5198746d4bd92c874 | |
| parent | 02f8995967cc97988cf3cdb40b1805915517bbaf (diff) | |
| download | talos-hostboot-ba8c8bfc02ca3d42a3caf3f8f797df07487c1dab.tar.gz talos-hostboot-ba8c8bfc02ca3d42a3caf3f8f797df07487c1dab.zip | |
sbe_retry_handler refactor
Previously the sbe_retry_handler had logic and wording that
assumed that it was being used to tell if the slave sbe booted or not.
However this code has many more use cases than that. Also there was some
indirect recursion that made the code hard to follow. With this refactor
the code should be easier to follow and the vocabulary used should be more
generic.
Change-Id: If6520197b3dd561857e336ed89d9356c1f2601d6
CQ: SW416106
RTC: 167191
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/55896
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Tested-by: Daniel M. Crowell <dcrowell@us.ibm.com>
| -rw-r--r-- | src/include/usr/sbeio/sbe_retry_handler.H | 235 | ||||
| -rw-r--r-- | src/include/usr/sbeio/sbeioreasoncodes.H | 4 | ||||
| -rw-r--r-- | src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C | 40 | ||||
| -rw-r--r-- | src/usr/isteps/istep08/makefile | 3 | ||||
| -rw-r--r-- | src/usr/sbeio/common/common.mk | 30 | ||||
| -rw-r--r-- | src/usr/sbeio/common/sbe_attn.C | 42 | ||||
| -rw-r--r-- | src/usr/sbeio/common/sbe_retry_handler.C | 1295 | ||||
| -rw-r--r-- | src/usr/sbeio/makefile | 30 | ||||
| -rw-r--r-- | src/usr/sbeio/runtime/makefile | 30 | ||||
| -rw-r--r-- | src/usr/sbeio/sbe_fifodd.C | 43 | ||||
| -rw-r--r-- | src/usr/sbeio/sbe_psudd.C | 56 | ||||
| -rw-r--r-- | src/usr/sbeio/test/sbe_retry_handler_test.H | 6 |
12 files changed, 1006 insertions, 808 deletions
diff --git a/src/include/usr/sbeio/sbe_retry_handler.H b/src/include/usr/sbeio/sbe_retry_handler.H index 47e68e7da..ae014d6c5 100644 --- a/src/include/usr/sbeio/sbe_retry_handler.H +++ b/src/include/usr/sbeio/sbe_retry_handler.H @@ -28,6 +28,7 @@ #include <isteps/hwpisteperror.H> #include <p9_extract_sbe_rc.H> #include <p9_get_sbe_msg_register.H> +#include <sys/time.h> namespace SBEIO { @@ -36,29 +37,43 @@ class SbeRetryHandler { public: - static const uint8_t MAX_SWITCH_SIDE_COUNT = 2; + //There are only 2 sides to the seeproms, so we only want to flip sides once + static constexpr uint8_t MAX_SWITCH_SIDE_COUNT = 1; + //We only want to attempt to boot with the same side seeprom twice + static constexpr uint8_t MAX_SIDE_BOOT_ATTEMPTS = 2; + // Currently we expect a maxiumum of 2 FFDC packets, the one + // that is useful to HB is the HWP FFDC. It is possible there is + // a packet that details an internal sbe fail that hostboot will + // add to an errorlog but otherwise ignores + static constexpr uint8_t MAX_EXPECTED_FFDC_PACKAGES = 2; + // action_for_ffdc_rc will figure out what action we should do + // for each p9_extract_sbe_rc return code. 
If the RC does not match + // any return code from p9_extract_sbe_rc then we want to have a + // known "no action found" value which is defined here + static constexpr uint32_t NO_ACTION_FOUND_FOR_THIS_RC = 0xFFFF; + + static constexpr uint64_t SBE_RETRY_TIMEOUT_HW = 60*NS_PER_SEC; // 60 seconds + static constexpr uint64_t SBE_RETRY_TIMEOUT_SIMICS = 600*NS_PER_SEC; // 600 seconds + static constexpr uint32_t SBE_RETRY_NUM_LOOPS = 100; enum SBE_REG_RETURN { - HWP_ERROR = 0, // Error returned from HWP - SBE_AT_RUNTIME = 1, // SBE is at runtime and booted - SBE_FAILED_TO_BOOT = 2, // SBE has failed to boot - PROC_DECONFIG = 3, // Deconfig done on Proc with SBE + FAILED_COLLECTING_REG = 0, // Error returned from HWP + SBE_AT_RUNTIME = 1, // SBE is at runtime and booted + SBE_NOT_AT_RUNTIME = 2, // SBE has failed to boot + PROC_DECONFIG = 3, // Deconfig done on Proc with SBE }; + //Possible values of iv_sbeMode enum SBE_MODE_OF_OPERATION { INFORMATIONAL_ONLY = 0, // Get error logs from the SBE HWP's - // This will not attempt a SBE restart, and it will only - // run the steps to get the p9_extract_sbe_rc return value. + // This will not attempt an SBE restart + // On FSP systems if informational mode is set we will TI + // On BMC systems we will run extract_rc then bail out ATTEMPT_REBOOT = 1, // Full SBE run, attempt to restart - // This will run all the steps and HWP's to attempt - // an SBE restart on both sides. - SBE_ACTION_SET = 2, // Full SBE run, but with a set action - // This will run all the steps and HWP's to attempt - // an SBE restart, however in this case we are specifying - // which SBE RETURN_ACTION we are attempting instead of - // the action the SBE thinks we should attempt. + // This will run all the steps and HWP's to attempt + // an SBE restart on both sides. 
}; enum SBE_RESTART_METHOD @@ -95,82 +110,78 @@ class SbeRetryHandler ~SbeRetryHandler(); /**************** Functions to return Class Elements ****************/ - inline bool getSbeRestart() + inline bool isSbeAtRuntime() { - return this->iv_sbeRestarted; - } - - inline uint8_t getSbeSide() - { - return this->iv_sbeSide; - } - - inline uint32_t getPLID() - { - return this->iv_errorLogPLID; + return (iv_currentSBEState == + SbeRetryHandler::SBE_REG_RETURN::SBE_AT_RUNTIME); } inline uint32_t getCallerPLID() { - return this->iv_callerErrorLogPLID; + return iv_callerErrorLogPLID; } inline uint8_t getSwitchCount() { - return this->iv_switchSidesCount; + return iv_switchSidesCount; } inline sbeMsgReg_t getSbeRegister() { - return this->iv_sbeRegister; + return iv_sbeRegister; } inline P9_EXTRACT_SBE_RC::RETURN_ACTION getCurrentAction() { - return this->iv_currentAction; + return iv_currentAction; } inline SBE_REG_RETURN getCurrentSBEState() { - return this->iv_currentSBEState; + return iv_currentSBEState; } inline SBE_RESTART_METHOD getSbeRestartMethod() { - return this->iv_sbeRestartMethod; + return iv_sbeRestartMethod; } inline void setSbeRestartMethod(SBE_RESTART_METHOD i_method) { - this->iv_sbeRestartMethod = i_method; + iv_sbeRestartMethod = i_method; } inline SBE_MODE_OF_OPERATION getSBEMode() { - return this->iv_sbeMode; + return iv_sbeMode; } inline void setSBEMode(SBE_MODE_OF_OPERATION i_sbeMode) { - this->iv_sbeMode = i_sbeMode; + iv_sbeMode = i_sbeMode; } inline bool getUseSDB() { - return this->iv_useSDB; + return iv_useSDB; } inline void setUseSDB(bool i_useSDB) { - this->iv_useSDB = i_useSDB; + iv_useSDB = i_useSDB; } inline bool getSecureModeDisabled() { - return this->iv_secureModeDisabled; + return iv_secureModeDisabled; } inline void setSecureModeDisabled(bool i_secureModeDisabled) { - this->iv_secureModeDisabled = i_secureModeDisabled; + iv_secureModeDisabled = i_secureModeDisabled; + } + + inline void setInitialPowerOn(bool i_isInitialPowerOn) + { + 
iv_initialPowerOn = i_isInitialPowerOn; } /** @@ -183,6 +194,34 @@ class SbeRetryHandler void main_sbe_handler( TARGETING::Target * i_target); private: +#ifndef __HOSTBOOT_RUNTIME + /** + * @brief This function will look at the SBE status register and decide + * whether to send the SBEIO_DEAD_SBE or SBEIO_HWSV_COLLECT_SBE_RC + * along with the TI depending on if the asyncFFDC bit is set in + * the status register + * + * @param[in] i_target - current proc target we are handling fail for + * + * @return - void + */ + void handleFspIplTimeFail(TARGETING::Target * i_target); +#endif + + /** + * @brief This function will look at what iv_currentAction is set to + * and take into account how many times we have tried to boot + * and how many times we have switched sides. + * Note: no_recovery is only an acceptable answer if we have tried + * all possibilities. That means that we must have attempted + * two boots on both sides. If we have not hit our max attempts + * for both sides then this procedure should change iv_currentAction + * to either RESTART_SBE or REIPL_BKP_SEEPROM + * + * + * @return - void + */ + void bestEffortCheck(); /** * @brief This function handles the SBE timeout and loops @@ -193,7 +232,7 @@ class SbeRetryHandler * @return - error, NULL if no error */ - errlHndl_t sbe_timeout_handler(TARGETING::Target * i_target); + errlHndl_t sbe_poll_status_reg(TARGETING::Target * i_target); /** * @brief This function handles getting the SBE FFDC. @@ -206,7 +245,7 @@ class SbeRetryHandler * of loop or current iteration */ - bool sbe_get_ffdc_handler(TARGETING::Target * i_target); + void sbe_get_ffdc_handler(TARGETING::Target * i_target); /** * @brief This function handles the SBE failed to boot error. @@ -219,22 +258,7 @@ class SbeRetryHandler * set to TRUE */ - bool sbe_boot_fail_handler(TARGETING::Target * i_target); - - /** - * @brief This function handles the SBE failed to boot error. 
- * - * @param[in] i_target - current proc target - * @param[in] i_exposeLog - an error log is created at the top - * of this function, if we are doing retries - * we might not want to have this log show up - * i_exposeLog = true will make the elog PREDICTIVE - * - * @return - bool: true if we need to retry - */ - - bool sbe_boot_fail_handler(TARGETING::Target * i_target, - bool i_exposeLog); + void sbe_run_extract_rc(TARGETING::Target * i_target); /** * @brief This function deals with the mask needed to switch @@ -246,13 +270,6 @@ class SbeRetryHandler */ errlHndl_t switch_sbe_sides(TARGETING::Target * i_target); - /** - * @brief This function handles the SBE register value and the actions - * that go along with it. - * - * @param[in] i_target - current proc target - */ - void handle_sbe_reg_value( TARGETING::Target * i_target); /** * @brief This is the switch case that identifies the action needed @@ -262,16 +279,20 @@ class SbeRetryHandler * * @return - pass(0) or specific returned SBE action */ - P9_EXTRACT_SBE_RC::RETURN_ACTION action_for_ffdc_rc( uint32_t i_rc); + uint32_t action_for_ffdc_rc( uint32_t i_rc); /** * @brief This function handles the call to the p9_get_sbe_msg_handler. - * It determines what state the SBE is in. 
+ * It will read the sbe msg register (Cfam 2809 or Scom 50009) + * and update iv_currentSBEState to reflect the state that + * the sbe's msg register is telling us * * @param[in] i_target - current proc target * + * @return - return true if reading the message register was a success + * return false if there was an error getting the sbe msg register */ - void get_sbe_reg(TARGETING::Target * i_target); + bool sbe_run_extract_msg_reg(TARGETING::Target * i_target); /************************** Class Elements **************************/ @@ -293,22 +314,6 @@ class SbeRetryHandler bool iv_secureModeDisabled; /* - * @brief True if we successfully restarted the SBE - */ - bool iv_sbeRestarted; - - /* - * @brief True if we switched to the other side of the SBE - */ - uint8_t iv_sbeSide; - - /* - * @brief PLID of the error logged. 0 if no error - * was logged. - */ - uint32_t iv_errorLogPLID; - - /* * @brief PLID of the caller. 0 if caller does not * provide one. Not to be confused with the * PLID when error log is created in the usage @@ -317,7 +322,8 @@ class SbeRetryHandler uint32_t iv_callerErrorLogPLID; /* - * @brief Number of times we switch SBE sides. Max is 2 + * @brief Number of times we switch SBE sides. Max is defined by + * MAX_SWITCH_SIDE_COUNT */ uint8_t iv_switchSidesCount; @@ -337,11 +343,37 @@ class SbeRetryHandler SBE_REG_RETURN iv_currentSBEState; /* - * @brief There are a few situations in which we have to retrigger - * the main function. This variable ensures we do not fall into - * an infinite loop situation + * @brief Currently there are 3 options for what the shutdownReturnCode + * will be. It can be 0 if there is no return code we wish to + * send with shutdown. Then it can also be SBEIO_HWSV_COLLECT_SBE_RC + * to notify that HWSV should collect FFDC or it can be SBEIO_DEAD_SBE + * to tell HWSV that the SBE is dead. 
*/ - bool iv_retriggeredMain; + uint32_t iv_shutdownReturnCode; + + /* + * @brief This value will keep track of how many times we have attempted + * to boot the current side of the SBE's seeprom. In the ctor this + * value should be 1, because if the retry handler has been called + * that means that we have attempted to boot the current side at + * least 1 time. When we switch seeprom sides this value should + * drop back to 0. It will be incremeted each time we attempt + * to call start_cbs or hreset depending on iv_sbeRestartMethod + */ + uint8_t iv_currentSideBootAttempts; + + /* + * @brief If the asyncFFDC bit is found to be set on the status register + * this indicates to hostboot that the SBE was able to collect + * FFDC about what went wrong in its attempt to boot itself + * in this case Hostboot will send a FIFO chip op to the SBE + * so the SBE will write the FFDC data out to memory where + * Hostboot can parse it. Note that after the SBE writes + * the data to memory the asyncFFDC bit on the status register + * will be off. + */ + bool iv_ffdcSetAction; + /* * @brief The mode of operation that needs to be run through the @@ -351,13 +383,32 @@ class SbeRetryHandler SBE_MODE_OF_OPERATION iv_sbeMode; /* - * @brief If true, use the HWP p9_start_cbs to restart the SBE. This - * HWP kills the proc we're on, so there are some situations - * where we want to use hreset instead. Each choice is noted - * in the SBE_RESTART_METHOD enum + * @brief This instance variable will instruct the main_sbe_handler + * loop on what method to use when attempting to restart the + * sbe that we have detected an error on. Currently there are + * two options to recover an sbe in a bad state. The first option + * is to run "start_cbs", this essentially powers down the proc + * and starts the boot sequence from the beginning. 
This is okay + * to use when initially trying to poweron slave processor's sbe + * but it is not as useful after that as it will blow away any fabric + * initialization we have done on the slave proc chip. The other + * option is to use HRESET. HRESET will attempt to restart the + * sbe on the fly and does not require us to completely restart + * the processor. HRESET can be used during runtime to attempt + * to recover an sbe while not disrupting the rest of the proc + * chips. Both choices are noted in the SBE_RESTART_METHOD enum */ SBE_RESTART_METHOD iv_sbeRestartMethod; + /* + * @brief If true, this tells the retry_hanlder that the caller has recently + * attempted to boot the sbe on processor passed to the ctor. This + * tells us that the sbe_status register is not stale and that we + * can use the curState value on the status register to determine + * if the SBE made it to runtime or not + */ + bool iv_initialPowerOn; + }; // End of class SbeRetryHandler } // End of namespace SBEIO diff --git a/src/include/usr/sbeio/sbeioreasoncodes.H b/src/include/usr/sbeio/sbeioreasoncodes.H index 2bd07d553..1b3bab689 100644 --- a/src/include/usr/sbeio/sbeioreasoncodes.H +++ b/src/include/usr/sbeio/sbeioreasoncodes.H @@ -123,6 +123,10 @@ enum sbeioReasonCode SBEIO_RETURNED_FFDC = SBEIO_COMP_ID | 0x57, SBEIO_SLAVE_TIMEOUT = SBEIO_COMP_ID | 0x58, SBEIO_ATTEMPTING_REBOOT = SBEIO_COMP_ID | 0x59, + SBEIO_UNSUPPORTED_REQUEST = SBEIO_COMP_ID | 0x5A, + SBEIO_MORE_FFDC_THAN_EXPECTED = SBEIO_COMP_ID | 0x5B, + SBEIO_EXCEED_MAX_SIDE_SWITCHES = SBEIO_COMP_ID | 0x5C, + SBEIO_EXCEED_MAX_SIDE_BOOTS = SBEIO_COMP_ID | 0x5D, // SBE Vital Attention error codes SBEIO_SBE_RC_VALUE_INFO = SBEIO_COMP_ID | 0x60, diff --git a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C index 215d0e35f..4b45a8de2 100644 --- a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C +++ 
b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2015,2017 */ +/* Contributors Listed Below - COPYRIGHT 2015,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -129,13 +129,24 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs ) " on processor target %.8X", TARGETING::get_huid(l_cpu_target)); + //Note no PLID passed in SBEIO::SbeRetryHandler l_SBEobj = SBEIO::SbeRetryHandler( SBEIO::SbeRetryHandler::SBE_MODE_OF_OPERATION::ATTEMPT_REBOOT); + l_SBEobj.setSbeRestartMethod(SBEIO::SbeRetryHandler:: + SBE_RESTART_METHOD::START_CBS); + + // We want to tell the retry handler that we have just powered + // on the sbe, to distinguish this case from other cases where + // we have determine there is something wrong w/ the sbe and + // want to diagnose the problem + l_SBEobj.setInitialPowerOn(true); + l_SBEobj.main_sbe_handler(l_cpu_target); - // No error and still functional - if(l_cpu_target->getAttr<ATTR_HWAS_STATE>().functional) + // We will judge whether or not the SBE had a succesful + // boot or not depending on if it made it to runtime or not + if(l_SBEobj.isSbeAtRuntime()) { // Set attribute indicating that SBE is started l_cpu_target->setAttr<ATTR_SBE_IS_STARTED>(1); @@ -173,29 +184,6 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs ) "Running p9_extract_sbe_rc HWP" " on processor target %.8X", TARGETING::get_huid(l_cpu_target) ); - - //@TODO-RTC:100963-Do something with the RETURN_ACTION - P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction - = P9_EXTRACT_SBE_RC::RE_IPL; - FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, - l_fapi2ProcTarget, - l_rcAction); - if (l_errl) - { - TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "ERROR : proc_check_slave_sbe_seeprom_complete " - "failed, p9_extract_sbe_rc HWP returning errorlog PLID=0x%x", - l_errl->plid()); - - // capture the target data in the elog - 
ErrlUserDetailsTarget(l_cpu_target).addToLog( l_errl ); - - // Create IStep error log and cross reference to error that occurred - l_stepError.addErrorDetails( l_errl ); - - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); - } **/ } // end of going through all processors diff --git a/src/usr/isteps/istep08/makefile b/src/usr/isteps/istep08/makefile index 6aed23e56..347d8e2b9 100644 --- a/src/usr/isteps/istep08/makefile +++ b/src/usr/isteps/istep08/makefile @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2015,2017 +# Contributors Listed Below - COPYRIGHT 2015,2018 # [+] International Business Machines Corp. # # @@ -74,7 +74,6 @@ include ${PROCEDURES_PATH}/hwp/perv/p9_start_cbs.mk # proc_check_slave_sbe_seeprom_complete : Check Slave SBE Complete include ${PROCEDURES_PATH}/hwp/perv/p9_check_slave_sbe_seeprom_complete.mk -include ${PROCEDURES_PATH}/hwp/sbe/p9_get_sbe_msg_register.mk include ${PROCEDURES_PATH}/hwp/perv/p9_getecid.mk # host_p9_fbc_eff_config diff --git a/src/usr/sbeio/common/common.mk b/src/usr/sbeio/common/common.mk index 397af666f..2163f81bb 100644 --- a/src/usr/sbeio/common/common.mk +++ b/src/usr/sbeio/common/common.mk @@ -22,5 +22,31 @@ # permissions and limitations under the License. 
# # IBM_PROLOG_END_TAG -SBEIO_COMMON_OBJS += sbe_attn.o -SBEIO_COMMON_OBJS += sbe_retry_handler.o + +#Common .mk files to include +include ${ROOTPATH}/procedure.rules.mk +include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/sbe/p9_get_sbe_msg_register.mk +include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/p9_start_cbs.mk + +#Common Include Paths +EXTRAINCDIR += ${PROCEDURES_PATH}/hwp/ffdc +EXTRAINCDIR += ${PROCEDURES_PATH}/hwp/perv +EXTRAINCDIR += ${PROCEDURES_PATH}/hwp/lib +EXTRAINCDIR += ${PROCEDURES_PATH}/hwp/sbe +EXTRAINCDIR += ${ROOTPATH}/src/import/hwpf/fapi2/include +EXTRAINCDIR += ${ROOTPATH}/src/include/usr/fapi2 +EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils +EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils/imageProcs +EXTRAINCDIR += ${ROOTPATH}/src/import/chips/common/utils/imageProcs + +#Common Objects +OBJS += p9_extract_sbe_rc.o +OBJS += p9_ppe_common.o +OBJS += sbe_attn.o +OBJS += sbe_retry_handler.o + +#Common VPATHs +VPATH += ${ROOTPATH}/src/usr/sbeio/common +VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/sbe/ +VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/lib/ +VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/
\ No newline at end of file diff --git a/src/usr/sbeio/common/sbe_attn.C b/src/usr/sbeio/common/sbe_attn.C index 5ad8152e1..5c151b4eb 100644 --- a/src/usr/sbeio/common/sbe_attn.C +++ b/src/usr/sbeio/common/sbe_attn.C @@ -55,11 +55,7 @@ namespace SBEIO TARGETING::get_huid(i_procTarg) ); errlHndl_t l_errhdl = nullptr; - uint32_t l_sbePlid = getSbeRC(i_procTarg); - - TRACFCOMP( g_trac_sbeio, "handleVitalAttn> Returned SBE PLID=0x%x", - l_sbePlid); - + // TODO 167191 Full SBE Belly-Up Handling for OP #ifdef __HOSTBOOT_RUNTIME // Inform OPAL, SBE is currently disabled if (TARGETING::is_sapphire_load()) @@ -74,16 +70,13 @@ namespace SBEIO SbeRetryHandler l_sbeObj = SbeRetryHandler( SbeRetryHandler::SBE_MODE_OF_OPERATION::ATTEMPT_REBOOT); - // @todo - RTC:180242. Once the hreset method is finalized, - // we can call the sbe handler with that method - //l_sbeObj.setSbeRestartMethod(SbeRetryHandler:: - // SBE_RESTART_METHOD::HRESET); - l_sbeObj.main_sbe_handler(i_procTarg); + //l_sbeObj.main_sbe_handler(i_procTarg); + #ifdef __HOSTBOOT_RUNTIME // Inform OPAL the state of the SBE after a retry - if (l_sbeObj.getSbeRestart()) + if (l_sbeObj.isSbeAtRuntime()) { if (TARGETING::is_sapphire_load()) { @@ -100,31 +93,4 @@ namespace SBEIO return l_errhdl; } - uint32_t getSbeRC(TARGETING::Target* i_target) - { - TRACFCOMP( g_trac_sbeio, ENTER_MRK "getSbeRC()"); - - errlHndl_t l_errl = nullptr; - - uint32_t l_errlPlid = NULL; - const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget( - const_cast<TARGETING::Target*> (i_target)); - - P9_EXTRACT_SBE_RC::RETURN_ACTION l_ret = - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; - FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, - l_fapi2ProcTarget, l_ret); - - if(l_errl) - { - TRACFCOMP(g_trac_sbeio, "ERROR: p9_extract_sbe_rc HWP returning " - "errorlog PLID: 0x%x", l_errl->plid()); - - ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); - l_errlPlid = l_errl->plid(); - } - - return l_errlPlid; - } - }; diff --git 
a/src/usr/sbeio/common/sbe_retry_handler.C b/src/usr/sbeio/common/sbe_retry_handler.C index e2889bf16..0af3eedb2 100644 --- a/src/usr/sbeio/common/sbe_retry_handler.C +++ b/src/usr/sbeio/common/sbe_retry_handler.C @@ -45,7 +45,6 @@ #include <initservice/initserviceif.H> #include <initservice/istepdispatcherif.H> #include <errl/errludtarget.H> -#include <sys/time.h> #include <util/misc.H> #include <ipmi/ipmiwatchdog.H> @@ -92,16 +91,16 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode, : iv_useSDB(false) , iv_secureModeDisabled(false) //Per HW team this should always be 0 -, iv_sbeRestarted(false) -, iv_sbeSide(0) -, iv_errorLogPLID(0) , iv_callerErrorLogPLID(i_plid) , iv_switchSidesCount(0) , iv_currentAction(P9_EXTRACT_SBE_RC::ERROR_RECOVERED) -, iv_currentSBEState(SBE_REG_RETURN::SBE_FAILED_TO_BOOT) -, iv_retriggeredMain(false) +, iv_currentSBEState(SBE_REG_RETURN::SBE_NOT_AT_RUNTIME) +, iv_shutdownReturnCode(0) +, iv_currentSideBootAttempts(1) // It is safe to assume that the current side has attempted to boot +, iv_ffdcSetAction(false) , iv_sbeMode(i_sbeMode) -, iv_sbeRestartMethod(SBE_RESTART_METHOD::START_CBS) +, iv_sbeRestartMethod(SBE_RESTART_METHOD::HRESET) +, iv_initialPowerOn(false) { SBE_TRACF(ENTER_MRK "SbeRetryHandler::SbeRetryHandler()"); @@ -111,209 +110,380 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode, SBE_TRACF(EXIT_MRK "SbeRetryHandler::SbeRetryHandler()"); } -SbeRetryHandler::~SbeRetryHandler() -{ - -} +SbeRetryHandler::~SbeRetryHandler() {} void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target ) { SBE_TRACF(ENTER_MRK "main_sbe_handler()"); - do { - errlHndl_t l_errl = NULL; + errlHndl_t l_errl = nullptr; + // Only set the secure debug bit (SDB) if we are not using xscom yet if(!i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>().useXscom) { this->iv_useSDB = true; } - const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget( - const_cast<TARGETING::Target*> (i_target)); + // Get 
the SBE status register, this will tell us what state + // the SBE is in , if the asynFFDC bit is set on the sbe_reg + // then FFDC will be collected at this point in time. + // sbe_run_extract_msg_reg will return true if there was an error reading the status + if(!this->sbe_run_extract_msg_reg(i_target)) + { + SBE_TRACF("main_sbe_handler(): Failed to get sbe register something is seriously wrong, we should always be able to read that!!"); + //Error log should have already committed in sbe_run_extract_msg_reg for this issue + break; + } + + // We will only trust the currState value if we know the SBE has just been booted. + // In this case we have been told by the caller that the sbe just powered on + // so it is safe to assume that the currState value is legit and we can trust that + // the sbe has booted successfully to runtime. + if( this->iv_initialPowerOn && (this->iv_sbeRegister.currState == SBE_STATE_RUNTIME)) + { + //We have successfully powered on the SBE + SBE_TRACF("main_sbe_handler(): Initial power on of the SBE was a success!!"); + break; + } - bool l_retry = false; + //////****************************************************************** + // If we have made it this far we can assume that something is wrong w/ the SBE + //////****************************************************************** - if(this->iv_sbeMode != INFORMATIONAL_ONLY) + // If something is wrong w/ the SBE during IPL time on a FSP based system then + // we will always TI and let hwsv deal with the problem. 
This is a unique path + // so we will have it handled in a separate procedure +#ifndef __HOSTBOOT_RUNTIME + if(INITSERVICE::spBaseServicesEnabled()) { - this->get_sbe_reg(i_target); + // This function will TI Hostboot so don't expect to return + handleFspIplTimeFail(i_target); + SBE_TRACF("main_sbe_handler(): We failed to TI the system when we should have, forcing an assert(0) call"); + // We should never return from handleFspIplTimeFail + assert(0, "We have determined that there was an error with the SBE and should have TI'ed but for some reason we did not."); + } +#endif - if( (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) && - !(this->iv_sbeMode == SBE_ACTION_SET)) - { - // return, false if no boot is needed, true if boot is needed. - l_retry = this->sbe_boot_fail_handler(i_target); - } - else if(this->iv_sbeMode == SBE_ACTION_SET) - { - l_retry = true; - } + // If iv_ffdcSetAction is true, that means that we found ffdc to parse + // this indicates that the SBE already determined what went wrong and + // reported the error via asyncFFDC so there is no need to + // run p9_extract_sbe_rc + // Also if the sbe is not booted at all, extract_rc will fail so we don't want to run it + if(!this->iv_ffdcSetAction && this->iv_sbeRegister.sbeBooted) + { + SBE_TRACF("main_sbe_handler(): No async ffdc found and sbe says it has been booted, running run p9_sbe_extract_rc."); + // Call the function that runs extract_rc, this needs to run to determine + // what broke and what our retry action should be + this->sbe_run_extract_rc(i_target); + } + // If we have determined that the sbe never booted + // then set the current action to be "restart sbe" + // that way we will attempt to start the sbe again + else if(!this->iv_sbeRegister.sbeBooted) + { + SBE_TRACF("main_sbe_handler(): SBE reports it was never booted, calling p9_sbe_extract_rc will fail. 
Setting action to be RESTART_SBE"); + //Maybe commit log here saying initial start_cbs didnt run + this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE; + } - while((this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) && - l_retry) - { + // If the mode was marked as informational that means the caller did not want + // any actions to take place, the caller only wanted information collected + if(this->iv_sbeMode == INFORMATIONAL_ONLY) + { + SBE_TRACF("main_sbe_handler(): Retry handler is being called in INFORMATIONAL mode so we are exiting without attempting any retry actions"); + break; + } - SBE_TRACF("main_sbe_handler(): current SBE state is %d, retry " - "is %d current SBE action is %d", - this->iv_sbeRegister.currState, - l_retry, this->iv_currentAction); + // This do-while loop will continuously look at iv_currentAction, act + // accordingly, then read status register and determine next action. + // The ideal way to exit the loop is if the SBE makes it up to runtime after + // attempting a retry which indicates we have recovered. If the currentAction + // says NO_RECOVERY_ACTION then we break out of this loop. Also if we fail + // to read the sbe's status register or if we get write fails when trying to switch + // seeprom sides. Both the fails mentioned last indicate there is a larger problem + do + { + // We need to handle the following values that currentAction could be, + // it is possible that iv_currentAction can be any of these values except there + // is currently no path that will set it to be ERROR_RECOVERED + // ERROR_RECOVERED = 0, + // - We should never hit this, if we have recovered then + // curreState should be RUNTIME + // RESTART_SBE = 1, + // RESTART_CBS = 2, + // - We will not listen to p9_extract_rc on HOW to restart the + // sbe. 
We will assume iv_sbeRestartMethod is correct and + // perform the restart method that iv_sbeRestartMethod says + // regardless if currentAction = RESTART_SBE or RESTART_CBS + // REIPL_BKP_SEEPROM = 3, + // REIPL_UPD_SEEPROM = 4, + // - We will switch the seeprom side (if we have not already) + // - then attempt to restart the sbe w/ iv_sbeRestartMethod + // NO_RECOVERY_ACTION = 5, + // - we deconfigure the processor we are retrying and fail out + // + // Important things to remember, we only want to attempt a single side + // a maxiumum of 2 times, and also we only want to switch sides once + + SBE_TRACF("main_sbe_handler(): iv_sbeRegister.currState: %d , " + "iv_currentSideBootAttempts: %d , " + "iv_currentAction: %d , ", + this->iv_sbeRegister.currState, + this->iv_currentSideBootAttempts, + this->iv_currentAction); + if(this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) + { + // There is no action possible. Gard and Callout the proc /*@ - * @errortype - * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @reasoncode SBEIO_EXTRACT_RC_ERROR - * @userdata1 HUID of proc that had the SBE timeout - * @userdata2 SBE failing code - * - * @devdesc SBE did not start, this function is looking at - * the error to determine next course of action - * - * @custdesc The SBE did not start, we will attempt a reboot - * if possible - */ + * @errortype ERRL_SEV_UNRECOVERABLE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_NO_RECOVERY_ACTION + * @userdata1 SBE current error + * @userdata2 HUID of proc + * @devdesc There is no recovery action on the SBE. 
+ * We're deconfiguring this proc + * @custdesc Processor Error + */ l_errl = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_INFORMATIONAL, - SBEIO_EXTRACT_RC_HANDLER, - SBEIO_EXTRACT_RC_ERROR, - TARGETING::get_huid(i_target), - this->iv_currentAction); - - l_errl->collectTrace("ISTEPS_TRACE",256); + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + SBEIO_EXTRACT_RC_HANDLER, + SBEIO_NO_RECOVERY_ACTION, + P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION, + TARGETING::get_huid(i_target)); + l_errl->collectTrace( "ISTEPS_TRACE", 256); + l_errl->collectTrace( SBEIO_COMP_NAME, 256); + l_errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_NULL ); // Set the PLID of the error log to caller's PLID, // if provided if (iv_callerErrorLogPLID) { - l_errl->plid(iv_callerErrorLogPLID); + l_errl->plid(iv_callerErrorLogPLID); } - // Commit error and continue errlCommit(l_errl, ISTEP_COMP_ID); + this->iv_currentSBEState = SBE_REG_RETURN::PROC_DECONFIG; + SBE_TRACF("main_sbe_handler(): We have concluded there are no further recovery actions to take, deconfiguring proc and exiting handler"); + break; + } - // if no recovery action, fail out. - if(this->iv_currentAction == - P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) + // if the bkp_seeprom or upd_seeprom, attempt to switch sides. + // This is also dependent on the iv_switchSideCount. + // Note: we do this for upd_seeprom because we don't support + // updating the seeprom during IPL time + if((this->iv_currentAction == + P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || + this->iv_currentAction == + P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM)) + { + if(this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) { - // There is no action possible. Gard and Callout the proc /*@ - * @errortype ERRL_SEV_UNRECOVERABLE - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @reasoncode SBEIO_NO_RECOVERY_ACTION - * @userdata1 SBE current error - * @userdata2 HUID of proc - * @devdesc There is no recovery action on the SBE. 
- * We're garding this proc - */ + * @errortype ERRL_SEV_PREDICTIVE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_EXCEED_MAX_SIDE_SWITCHES + * @userdata1 Switch Sides Count + * @userdata2 HUID of proc + * @devdesc We have already flipped seeprom sides once + * and we should not have attempted to flip again + * @custdesc Processor Error + */ l_errl = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_UNRECOVERABLE, + ERRORLOG::ERRL_SEV_PREDICTIVE, SBEIO_EXTRACT_RC_HANDLER, - SBEIO_NO_RECOVERY_ACTION, - P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION, + SBEIO_EXCEED_MAX_SIDE_SWITCHES, + this->iv_switchSidesCount, TARGETING::get_huid(i_target)); - l_errl->collectTrace( "ISTEPS_TRACE", 256); - l_errl->addHwCallout( i_target, - HWAS::SRCI_PRIORITY_HIGH, - HWAS::DECONFIG, - HWAS::GARD_NULL ); - - // Cache PLID of error log - iv_errorLogPLID = l_errl->plid(); + l_errl->collectTrace( SBEIO_COMP_NAME, 256); // Set the PLID of the error log to caller's PLID, // if provided if (iv_callerErrorLogPLID) { - l_errl->plid(iv_callerErrorLogPLID); + l_errl->plid(iv_callerErrorLogPLID); } - + errlCommit(l_errl, SBEIO_COMP_ID); + // Break out of loop, something bad happened and we dont want end + // up in a endless loop + break; + } + l_errl = this->switch_sbe_sides(i_target); + if(l_errl) + { errlCommit(l_errl, ISTEP_COMP_ID); - - SBE_TRACF("main_sbe_handler(): updating return value " - "to indicate that we have deconfigured the proc"); - this->iv_currentSBEState = SBE_REG_RETURN::PROC_DECONFIG; - + // If any error occurs while we are trying to switch sides + // this indicates big problems so we want to break out of the + // retry loop break; } + // Note that we do not want to continue here because we want to + // attempt to restart using whatever sbeRestartMethod is set to after + // switching seeprom sides + } - // if the bkp_seeprom or upd_seeprom, attempt to switch sides. - // This is also dependent on the iv_switchSideCount. 
- if(this->iv_currentAction == - P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || - this->iv_currentAction == - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) + if(this->iv_currentSideBootAttempts >= MAX_SIDE_BOOT_ATTEMPTS) + { + /*@ + * @errortype ERRL_SEV_PREDICTIVE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_EXCEED_MAX_SIDE_BOOTS + * @userdata1 # of boots attempts on this side + * @userdata2 HUID of proc + * @devdesc We have already done the max attempts for + * the current seeprom side. For some reason + * we are attempting to do another boot. + * @custdesc Processor Error + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, + SBEIO_EXTRACT_RC_HANDLER, + SBEIO_EXCEED_MAX_SIDE_BOOTS, + this->iv_currentSideBootAttempts, + TARGETING::get_huid(i_target)); + + l_errl->collectTrace( SBEIO_COMP_NAME, 256); + + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - l_errl = this->switch_sbe_sides(i_target); - if(l_errl) - { - errlCommit(l_errl, ISTEP_COMP_ID); - break; - } + l_errl->plid(iv_callerErrorLogPLID); } + errlCommit(l_errl, SBEIO_COMP_ID); + // Break out of loop, something bad happened and we dont want end + // up in a endless loop + break; + } + // Look at the sbeRestartMethd instance variable to determine which method + // we will use to attempt the restart. In general during IPL time we will + // attempt CBS, during runtime we will want to use HRESET. 
+ else if(this->iv_sbeRestartMethod == SBE_RESTART_METHOD::START_CBS) + { + SBE_TRACF("Invoking p9_start_cbs HWP on processor %.8X", get_huid(i_target)); + const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> + l_fapi2_proc_target (i_target); + + FAPI_INVOKE_HWP(l_errl, p9_start_cbs, + l_fapi2_proc_target, true); - // Attempt SBE restart - if(this->iv_sbeRestartMethod == SBE_RESTART_METHOD::START_CBS) + //Increment attempt count for this side + this->iv_currentSideBootAttempts++; + + if(l_errl) { - SBE_TRACF("Invoking p9_start_cbs HWP"); - const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> - l_fapi2_proc_target (i_target); + SBE_TRACF("ERROR: call p9_start_cbs, PLID=0x%x", + l_errl->plid() ); + l_errl->collectTrace( "ISTEPS_TRACE", 256 ); + l_errl->collectTrace( SBEIO_COMP_NAME, 256 ); + + // Gard the target, when SBE Retry fails + l_errl->addHwCallout(i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_Predictive); - FAPI_INVOKE_HWP(l_errl, p9_start_cbs, - l_fapi2_proc_target, true); - if(l_errl) + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - SBE_TRACF("ERROR: call p9_start_cbs, PLID=0x%x", - l_errl->plid() ); - l_errl->collectTrace( "ISTEPS_TRACE", 256 ); - - // Gard the target, when SBE Retry fails - l_errl->addHwCallout(i_target, - HWAS::SRCI_PRIORITY_HIGH, - HWAS::NO_DECONFIG, - HWAS::GARD_Predictive); - - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - - errlCommit( l_errl, ISTEP_COMP_ID); + l_errl->plid(iv_callerErrorLogPLID); } - }else - { - //@todo - RTC:180242 - Restart SBE + + errlCommit( l_errl, ISTEP_COMP_ID); + // If we got an errlog while attempting start_cbs + // we will assume that no future retry actions + // will work so we will break out of the retry loop + break; } + }else + { + //@todo RTC:180242 Right now we don't have the support + // to perform an hreset, when we do remove 
this error + // log and perform the hreset. + + //Increment attempt count for this side + this->iv_currentSideBootAttempts++; + /*@ + * @errortype + * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE + * @moduleid SBEIO_EXTRACT_RC_HANDLER + * @reasoncode SBEIO_UNSUPPORTED_REQUEST + * @userdata1 HUID of proc that had the SBE timeout + * @userdata2 SBE failing code + * + * @devdesc SBE did not start, this function is looking at + * the error to determine next course of action + * + * @custdesc The SBE did not start, we will attempt a reboot + * if possible + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + SBEIO_EXTRACT_RC_HANDLER, + SBEIO_UNSUPPORTED_REQUEST, + TARGETING::get_huid(i_target), + this->iv_currentAction); + + l_errl->collectTrace( SBEIO_COMP_NAME, 256 ); - // Get the sbe register - this->get_sbe_reg(i_target); + // Gard the proc, when SBE Retry fails + l_errl->addHwCallout(i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_Predictive); - if( (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME)) + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - // return, false if no boot is needed. - l_retry = this->sbe_boot_fail_handler(i_target); + l_errl->plid(iv_callerErrorLogPLID); } + + errlCommit(l_errl, ISTEP_COMP_ID); + + // If we got an errlog while attempting hreset + // we will assume that no future retry actions + // will work so we will exit + break; } - } - else - { - // In the informational only mode, we just need enough information - // to get the SBE RC returned from the HWP. We are running with - // the knowledge that the SBE has failed already. 
- // pass true to have log show up - this->sbe_boot_fail_handler(i_target, true); - this->iv_currentSBEState = SBE_FAILED_TO_BOOT; - } + // We have performed the action, so make sure that ffdcSetAction is set back to 0 + this->iv_ffdcSetAction = 0; - this->handle_sbe_reg_value(i_target); + // Get the sbe register (note that if asyncFFDC bit is set in status register then + // we will read it in this call) + if(!this->sbe_run_extract_msg_reg(i_target)) + { + // Error log should have already committed in sbe_run_extract_msg_reg for this issue + // we need to stop our recovery efforts and bail out of the retry handler + break; + } - // if we have started the sbe, and the current action is upd_seeprom - // or bkp_seeprom, note that we started on an unexpected side - if(i_target->getAttr<TARGETING::ATTR_SBE_IS_STARTED>() && - (this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || - this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) ) + // If our retry attempt fail, and we didnt see any asyncFFDC after + if (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) + { + // Again, if ffdcSetAction is set, that means we have found FFDC + // already that the SBE saved away prior to failing so we don't need + // to run extract_rc if ffdcSetAction is true + if(!this->iv_ffdcSetAction) + { + SBE_TRACF("main_sbe_handler(): Failed to reach runtime after sbe restart and no asyncFFDC found. 
Calling p9_sbe_extract_rc."); + // Run extract rc to figure out why the sbe did not make it to + // runtime state + this->sbe_run_extract_rc(i_target); + } + } + + } while((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME); + + // If we ended up switching sides we want to mark it down as + // as informational log + if(this->iv_switchSidesCount) { /*@ * @errortype ERRL_SEV_INFORMATIONAL @@ -329,6 +499,7 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target ) SBEIO_BOOTED_UNEXPECTED_SIDE, 0,TARGETING::get_huid(i_target)); l_errl->collectTrace("ISTEPS_TRACE",256); + l_errl->collectTrace(SBEIO_COMP_NAME,256); // Set the PLID of the error log to caller's PLID, // if provided @@ -345,212 +516,106 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target ) SBE_TRACF(EXIT_MRK "main_sbe_handler()"); } -void SbeRetryHandler::get_sbe_reg(TARGETING::Target * i_target) +bool SbeRetryHandler::sbe_run_extract_msg_reg(TARGETING::Target * i_target) { - SBE_TRACF(ENTER_MRK "get_sbe_reg()"); + SBE_TRACF(ENTER_MRK "sbe_run_extract_msg_reg()"); errlHndl_t l_errl = nullptr; - do + //Assume that reading the status succeeded + bool l_statusReadSuccess = true; + + // This function will poll the status register for 60 seconds + // waiting for the SBE to reach runtime + // we will exit the polling before 60 seconds if we either reach + // runtime, or get an error reading the status reg, or if the asyncFFDC + // bit is set + l_errl = this->sbe_poll_status_reg(i_target); + + // If there is no error getting the status register, and the SBE + // did not make it to runtime AND the asyncFFDC bit is set, we will + // use the FFDC to decide our actions rather than using p9_extract_sbe_rc + if((!l_errl) && + (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) && + this->iv_sbeRegister.asyncFFDC) { - l_errl = this->sbe_timeout_handler(i_target); - - if((!l_errl) && (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME)) - { - // See if async FFDC bit is set in SBE 
register - if(this->iv_sbeRegister.asyncFFDC) - { - bool l_flowCtrl = this->sbe_get_ffdc_handler(i_target); - - if(l_flowCtrl) - { - break; - } - } - } - else if (l_errl) - { - SBE_TRACF("ERROR: call get_sbe_reg, PLID=0x%x", l_errl->plid() ); - - // capture the target data in the elog - ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl ); + SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X . " + "There was asyncFFDC found though so we will run the FFDC parser", + TARGETING::get_huid(i_target)); + // The SBE has responded to an asyncronus request that hostboot + // made with FFDC indicating an error has occurred. + // This should be the path we hit when we are waiting to see + // if the sbe boots + this->sbe_get_ffdc_handler(i_target); + } + // If there was an error log that means that we failed to read the + // cfam register to get the SBE status, something is seriously wrong + // if we hit this + else if (l_errl) + { + l_statusReadSuccess = false; + SBE_TRACF("ERROR: call sbe_run_extract_msg_reg, PLID=0x%x", l_errl->plid() ); - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); - } - // No error and still functional - else if(i_target->getAttr<TARGETING::ATTR_HWAS_STATE>().functional) + l_errl->collectTrace(SBEIO_COMP_NAME,256); + // Set the PLID of the error log to caller's PLID, + // if provided + if (iv_callerErrorLogPLID) { - // Set attribute indicating that SBE is started - i_target->setAttr<TARGETING::ATTR_SBE_IS_STARTED>(1); - this->iv_sbeRestarted = true; - - SBE_TRACF("SUCCESS: get_sbe_reg completed okay for proc 0x%.8X", - TARGETING::get_huid(i_target)); + l_errl->plid(iv_callerErrorLogPLID); } - //@TODO-RTC:100963 - this should match the logic in - //call_proc_check_slave_sbe_seeprom.C - } while(0); - SBE_TRACF(EXIT_MRK "get_sbe_reg()"); - -} - -void SbeRetryHandler::handle_sbe_reg_value(TARGETING::Target * i_target) -{ - errlHndl_t l_errl = NULL; - - SBE_TRACF(ENTER_MRK "handle_sbe_reg_value()"); - - const 
fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> - l_fapi2_proc_target(i_target); + // capture the target data in the elog + ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl ); - switch(this->iv_currentSBEState) + // Commit error log + errlCommit( l_errl, HWPF_COMP_ID ); + } + // No error, able to read the sbe status register okay + // No guarantees that the SBE made it to runtime + else { - case SbeRetryHandler::SBE_REG_RETURN::HWP_ERROR: - { - SBE_TRACF("handle_sbe_reg_value(): case FUNCTION_ERROR"); - // There has been a failure getting the SBE register - // We cannot continue any further, return failure. - this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - break; - } - case SbeRetryHandler::SBE_REG_RETURN::SBE_AT_RUNTIME: - { - SBE_TRACF("handle_sbe_reg_value(): case SBE_AT_RUNTIME"); - // The SBE has successfully booted at runtime - this->iv_currentAction = P9_EXTRACT_SBE_RC::ERROR_RECOVERED; - break; - } - case SbeRetryHandler::SBE_REG_RETURN::SBE_FAILED_TO_BOOT: - { - SBE_TRACF("handle_sbe_reg_value(): case SBE_FAILED_TO_BOOT"); - if((this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM) - && (!iv_retriggeredMain)) - - { - iv_retriggeredMain = true; - -#ifndef __HOSTBOOT_RUNTIME - // This could potentially take awhile, reset watchdog - INITSERVICE::sendProgressCode(); -#endif - SBE_TRACF("handle_sbe_reg_value(): Attempting " - "REIPL_UPD_SEEPROM failed. Recalling with BKP_SEEPROM"); - // If we were trying to reipl and hit the error, we need - // to start with a new seeprom before hitting the threshold - this->iv_currentAction = - P9_EXTRACT_SBE_RC::RETURN_ACTION::REIPL_BKP_SEEPROM; - this->iv_sbeMode = SBE_MODE_OF_OPERATION::SBE_ACTION_SET; - main_sbe_handler(i_target); - break; - } - - // Failed to boot, setting the final action for debugging. 
- SBE_TRACF("Inside handle_sbe_reg_value, calling " - "p9_extract_sbe_rc HWP"); - // Get SBE extract rc - P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction = - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; - FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, - l_fapi2_proc_target, l_rcAction); - this->iv_currentAction = l_rcAction; - - SBE_TRACF("handle_sbe_reg_value(): SBE failed to boot. Final " - "action is %llx", l_rcAction); - - if(l_errl) - { - SBE_TRACF("ERROR : p9_extract_sbe_rc HWP returning errorlog " - "PLID-0x%x", l_errl->plid()); - - // capture the target data in the elog - ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); + SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X", + TARGETING::get_huid(i_target)); + } - // Cache PLID of error log - iv_errorLogPLID = l_errl->plid(); + SBE_TRACF(EXIT_MRK "sbe_run_extract_msg_reg()"); - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } + return l_statusReadSuccess; - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); - } - - break; - } - default: - { - // This should never happened - // error out, unexpected enum value returned. - //return P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - /*@ - * @errortype ERRL_SEV_PREDICTIVE - * @moduleid SBEIO_HANDLE_SBE_REG_VALUE - * @reasoncode SBEIO_INCORRECT_FCN_CALL - * @userdata1 HUID of target - * @userdata2 SBE current state - * @devdesc This function was called incorrectly or - * there is a new enum that is not handled yet. 
- */ - l_errl = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_PREDICTIVE, - SBEIO_HANDLE_SBE_REG_VALUE, - SBEIO_INCORRECT_FCN_CALL, - get_huid(i_target),this->iv_currentSBEState); - l_errl->collectTrace("ISTEPS_TRACE",256); - - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - - errlCommit(l_errl, ISTEP_COMP_ID); - this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - break; - } - } - SBE_TRACF(EXIT_MRK "handle_sbe_reg_value()"); } -errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target) +errlHndl_t SbeRetryHandler::sbe_poll_status_reg(TARGETING::Target * i_target) { - SBE_TRACF(ENTER_MRK "sbe_timeout_handler()"); + SBE_TRACF(ENTER_MRK "sbe_poll_status_reg()"); - errlHndl_t l_errl = NULL; + errlHndl_t l_errl = nullptr; this->iv_currentSBEState = - SbeRetryHandler::SBE_REG_RETURN::SBE_FAILED_TO_BOOT; + SbeRetryHandler::SBE_REG_RETURN::SBE_NOT_AT_RUNTIME; const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2_proc_target(i_target); - // Each slave sbe gets 60s to respond with the fact that it's + // Each sbe gets 60s to respond with the fact that it's // booted and at runtime (stable state) - uint64_t SBE_TIMEOUT_NSEC = 60*NS_PER_SEC; //60 sec + uint64_t l_sbeTimeout = SBE_RETRY_TIMEOUT_HW; // 60 seconds // Bump this up really high for simics, things are slow there if( Util::isSimicsRunning() ) { - SBE_TIMEOUT_NSEC *= 10; + l_sbeTimeout = SBE_RETRY_TIMEOUT_SIMICS; // 600 seconds } - const uint64_t SBE_NUM_LOOPS = 100; - const uint64_t SBE_WAIT_SLEEP = (SBE_TIMEOUT_NSEC/SBE_NUM_LOOPS); + + const uint64_t SBE_WAIT_SLEEP = (l_sbeTimeout/SBE_RETRY_NUM_LOOPS); SBE_TRACF("Running p9_get_sbe_msg_register HWP on proc target %.8X", TARGETING::get_huid(i_target)); - for( uint64_t l_loops = 0; l_loops < SBE_NUM_LOOPS; l_loops++ ) + for( uint64_t l_loops = 0; l_loops < SBE_RETRY_NUM_LOOPS; l_loops++ ) { sbeMsgReg_t l_reg; FAPI_INVOKE_HWP(l_errl, 
p9_get_sbe_msg_register, l_fapi2_proc_target, l_reg); - this->iv_sbeRegister = l_reg; + this->iv_sbeRegister.reg = l_reg.reg; if (l_errl) { SBE_TRACF("ERROR : call p9_get_sbe_msg_register, PLID=0x%x, " @@ -558,7 +623,7 @@ errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target) l_errl->plid(), l_loops ); this->iv_currentSBEState = - SbeRetryHandler::SBE_REG_RETURN::HWP_ERROR; + SbeRetryHandler::SBE_REG_RETURN::FAILED_COLLECTING_REG; break; } else if ((this->iv_sbeRegister).currState == SBE_STATE_RUNTIME) @@ -591,46 +656,74 @@ errlHndl_t SbeRetryHandler::sbe_timeout_handler(TARGETING::Target * i_target) (this->iv_sbeRegister).reg); } l_loops++; +#ifndef __HOSTBOOT_RUNTIME + // reset watchdog before performing the nanosleep + INITSERVICE::sendProgressCode(); +#endif nanosleep(0,SBE_WAIT_SLEEP); } } if ((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME) { - // Switch to using FSI SCOM + // Switch to using FSI SCOM if we are not using xscom TARGETING::ScomSwitches l_switches = i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>(); TARGETING::ScomSwitches l_switches_before = l_switches; - // Turn off SBE SCOM and turn on FSI SCOM. - l_switches.useFsiScom = 1; - l_switches.useSbeScom = 0; - - SBE_TRACF("sbe_timeout_handler: changing SCOM switches from 0x%.2X " - "to 0x%.2X for proc 0x%.8X", - l_switches_before, - l_switches, - TARGETING::get_huid(i_target)); - i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches); + if(!l_switches.useXscom) + { + // Turn off SBE SCOM and turn on FSI SCOM. 
+ l_switches.useFsiScom = 1; + l_switches.useSbeScom = 0; + + SBE_TRACF("sbe_poll_status_reg: changing SCOM switches from 0x%.2X " + "to 0x%.2X for proc 0x%.8X", + l_switches_before, + l_switches, + TARGETING::get_huid(i_target)); + i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches); + } } - // Set the PLID of the error log to caller's PLID, - // if provided - if (l_errl && iv_callerErrorLogPLID) + SBE_TRACF(EXIT_MRK "sbe_poll_status_reg()"); + return l_errl; +} + +#ifndef __HOSTBOOT_RUNTIME +void SbeRetryHandler::handleFspIplTimeFail(TARGETING::Target * i_target) +{ + // If we found that there was async FFDC available we need to notify hwsv of this + // even if we did not find anything useful in the ffdc for us, its possible hwsv + // will be able to use it. + if ((this->iv_sbeRegister).asyncFFDC) { - l_errl->plid(iv_callerErrorLogPLID); + iv_shutdownReturnCode = SBEIO_HWSV_COLLECT_SBE_RC; } - - SBE_TRACF(EXIT_MRK "sbe_timeout_handler()"); - return l_errl; + // If the asyncFFDC bit is not set on the sbeRegister + // then we need to pass the DEAD_SBE RC to hwsv when we + // TI + else + { + this->iv_shutdownReturnCode = SBEIO_DEAD_SBE; + } + SBE_TRACF("handleFspIplTimeFail(): During IPL time on FSP system hostboot will TI so that HWSV can handle the error. " + "Shutting down w/ the error code %s" , + this->iv_sbeRegister.asyncFFDC ? 
"SBEIO_HWSV_COLLECT_SBE_RC" : "SBEIO_DEAD_SBE" ); + + // On FSP systems if we failed to recover the SBE then we should shutdown w/ the + // correct error so that HWSV will know what FFDC to collect + INITSERVICE::doShutdownWithError(this->iv_shutdownReturnCode, + TARGETING::get_huid(i_target)); } +#endif -P9_EXTRACT_SBE_RC::RETURN_ACTION SbeRetryHandler::action_for_ffdc_rc( +uint32_t SbeRetryHandler::action_for_ffdc_rc( uint32_t i_rc) { SBE_TRACF(ENTER_MRK "action_for_ffdc_rc()"); - P9_EXTRACT_SBE_RC::RETURN_ACTION l_action; + uint32_t l_action; switch(i_rc) { @@ -675,22 +768,22 @@ P9_EXTRACT_SBE_RC::RETURN_ACTION SbeRetryHandler::action_for_ffdc_rc( case fapi2::RC_EXTRACT_SBE_RC_BRANCH_TO_SEEPROM_FAIL: case fapi2::RC_EXTRACT_SBE_RC_UNEXPECTED_OTPROM_HALT: case fapi2::RC_EXTRACT_SBE_RC_OTP_ECC_ERR: - default: l_action = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; break; + default: + + l_action = NO_ACTION_FOUND_FOR_THIS_RC; } SBE_TRACF(EXIT_MRK "action_for_ffdc_rc()"); return l_action; } -bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) +void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) { SBE_TRACF(ENTER_MRK "sbe_get_ffdc_handler()"); - - bool l_flowCtrl = false; uint32_t l_responseSize = SbeFifoRespBuffer::MSG_BUFFER_SIZE; uint32_t *l_pFifoResponse = reinterpret_cast<uint32_t *>(malloc(l_responseSize)); @@ -715,12 +808,43 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) else { // Parse the FFDC package(s) in the response - SbeFFDCParser * l_ffdc_parser = - new SbeFFDCParser(); + auto l_ffdc_parser = std::make_shared<SbeFFDCParser>(); l_ffdc_parser->parseFFDCData(reinterpret_cast<void *>(l_pFifoResponse)); uint8_t l_pkgs = l_ffdc_parser->getTotalPackages(); - P9_EXTRACT_SBE_RC::RETURN_ACTION l_action; + + // Currently we expect a maxiumum of 2 FFDC packets. These packets would be + // a HWP FFDC packet which we will look at to determine what our retry action + // should be. 
The other type of packet we might see would be details on the + // internal SBE fail. For internal SBE fail packets we will just add the FFDC + // to the error log and move on. + // + // Note: If we exceed MAX_EXPECTED_FFDC_PACKAGES, commit an informational log. + // It shouldn't break anything but this could help us understand if something odd + // is happening + if(l_pkgs > MAX_EXPECTED_FFDC_PACKAGES) + { + /*@ + * @errortype + * @moduleid SBEIO_GET_FFDC_HANDLER + * @reasoncode SBEIO_MORE_FFDC_THAN_EXPECTED + * @userdata1 Maximum expected packages + * @userdata2 Number of FFDC packages + * @devdesc Unexpected number of FFDC packages in buffer + * @custdesc Extra FFDC gathered, marked information event + */ + l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL, + SBEIO_GET_FFDC_HANDLER, + SBEIO_MORE_FFDC_THAN_EXPECTED, + MAX_EXPECTED_FFDC_PACKAGES, + l_pkgs); + + l_errl->collectTrace( SBEIO_COMP_NAME, 256); + + // Also log the failing proc as FFDC + ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); + errlCommit(l_errl, SBEIO_COMP_ID); + } // If there are FFDC packages, make a log for FFDC from SBE if(l_pkgs > 0) @@ -742,35 +866,47 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) // Also log the failing proc as FFDC ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_errl); - } - // Process each FFDC package - for(auto i=0; i<l_pkgs; i++) - { - // Add each package to the log - l_errl->addFFDC( SBEIO_COMP_ID, - l_ffdc_parser->getFFDCPackage(i), - l_ffdc_parser->getPackageLength(i), - 0, - SBEIO_UDT_PARAMETERS, - false ); - - // Get the RC from the FFDC package - uint32_t l_rc = l_ffdc_parser->getPackageRC(i); - - // Determine an action for the RC - l_action = action_for_ffdc_rc(l_rc); - - // Handle that action - this->iv_currentAction = l_action; - this->iv_retriggeredMain = true; - this->iv_sbeMode = SBE_MODE_OF_OPERATION::SBE_ACTION_SET; - main_sbe_handler(i_target); - } - // If there are FFDC packages, commit 
the log - if(l_pkgs > 0) - { + // Process each FFDC package + for(auto i=0; i<l_pkgs; i++) + { + // Add each package to the log + l_errl->addFFDC( SBEIO_COMP_ID, + l_ffdc_parser->getFFDCPackage(i), + l_ffdc_parser->getPackageLength(i), + 0, + SBEIO_UDT_PARAMETERS, + false ); + + // Get the RC from the FFDC package + uint32_t l_rc = l_ffdc_parser->getPackageRC(i); + + // Determine an action for the RC + P9_EXTRACT_SBE_RC::RETURN_ACTION l_action = + static_cast<P9_EXTRACT_SBE_RC::RETURN_ACTION>(action_for_ffdc_rc(l_rc)); + + if(l_action != NO_ACTION_FOUND_FOR_THIS_RC) + { + // Set the action associated with the RC that we found + this->iv_currentAction = l_action; + + // This call will look at what action_for_ffdc_rc had set the return action to + // checks on how many times we have attempted to boot this side, + // and if we have already tried switching sides + // + // + // Note this call is important, if this is not called we could end up in a + // endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS + this->bestEffortCheck(); + + // Set the instance variable ffdcSetAction to let us + // know that the current action was set from what we + // found in the asyncFFDC + this->iv_ffdcSetAction = true; + } + } + l_errl->collectTrace( SBEIO_COMP_NAME, KILOBYTE/4); l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4); @@ -783,11 +919,6 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) errlCommit(l_errl, ISTEP_COMP_ID); } - - delete l_ffdc_parser; - l_ffdc_parser = nullptr; - - l_flowCtrl = true; } #endif @@ -795,155 +926,60 @@ bool SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target) l_pFifoResponse = nullptr; SBE_TRACF(EXIT_MRK "sbe_get_ffdc_handler()"); - return l_flowCtrl; } -//By default we want to call the 2 param version of the func w/ "true" -//passed in to tell the function we want to hide the mandatory errlog -bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target) -{ - return 
SbeRetryHandler::sbe_boot_fail_handler(i_target, false); -} -bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target, - bool i_exposeLog) +void SbeRetryHandler::sbe_run_extract_rc(TARGETING::Target * i_target) { - SBE_TRACF(ENTER_MRK "sbe_boot_fail_handler()"); + SBE_TRACF(ENTER_MRK "sbe_run_extract_rc()"); errlHndl_t l_errl = nullptr; fapi2::ReturnCode l_rc; - bool o_needRetry = false; - - SBE_TRACF("SBE 0x%.8X never started, sbeReg=0x%.8X", - TARGETING::get_huid(i_target),(this->iv_sbeRegister).reg ); - /*@ - * @errortype - * @reasoncode SBEIO_SLAVE_TIMEOUT - * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL - * @moduleid SBEIO_EXTRACT_RC_HANDLER - * @userdata1 HUID of proc which had SBE timeout - * @userdata2 SBE MSG Register - * - * @devdesc Slave SBE did not get to ready state within - * allotted time - * - * @custdesc A processor in the system has failed to initialize - */ - l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL, - SBEIO_EXTRACT_RC_HANDLER, - SBEIO_SLAVE_TIMEOUT, - TARGETING::get_huid(i_target), - (this->iv_sbeRegister).reg); - - l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4); - - // Set the PLID of the error log to caller's PLID, - // if provided - if (iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - - if(i_exposeLog) - { - l_errl->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - - } - // Commit error and continue, this is not terminating since - // we can still at least boot with master proc - errlCommit(l_errl,ISTEP_COMP_ID); - - SBE_TRACF("Inside sbe_boot_fail_handler, calling p9_extract_sbe_rc HWP"); + SBE_TRACF("Inside sbe_run_extract_rc, calling p9_extract_sbe_rc HWP"); // Setup for the HWP const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_fapi2ProcTarget( const_cast<TARGETING::Target*> (i_target)); + // Default the return action to be NO_RECOVERY , if something goes + // wrong in p9_extract_sbe_rc and l_ret doesn't get set in that function + // then we want to fall back on NO_RECOVERY which 
we will handle + // accordingly in bestEffortCheck P9_EXTRACT_SBE_RC::RETURN_ACTION l_ret = - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; + P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; - //Note that we are calling this while we are already inside - //of a FAPI_INVOKE_HWP call. This might cause issue w/ current_err - //but unsure how to get around it. + // TODO RTC: 190528 Force FAPI_INVOKE_HWP to call FAPI_EXEC_HWP when FAPI_INVOKE + // is blocked by mutex + // Note that it's possible we are calling this while we are already inside + // of a FAPI_INVOKE_HWP call. This might cause issue w/ current_err + // but unsure how to get around it. FAPI_EXEC_HWP(l_rc, p9_extract_sbe_rc, l_fapi2ProcTarget, l_ret, iv_useSDB, iv_secureModeDisabled); + // Convert the returnCode into an UNRECOVERABLE error log which we will + // associated w/ the caller's errlog via plid l_errl = rcToErrl(l_rc, ERRORLOG::ERRL_SEV_UNRECOVERABLE); this->iv_currentAction = l_ret; - if(this->iv_currentAction != P9_EXTRACT_SBE_RC::ERROR_RECOVERED) - { + // Set the instance variable ffdcSetAction to let us + // know that the current action was not set by what + // we found in asyncFFDC + this->iv_ffdcSetAction = false; - if(l_errl) - { - SBE_TRACF("p9_extract_sbe_rc HWP returned action %d and errorlog " - "PLID=0x%x, rc=0x%.4X", this->iv_currentAction, - l_errl->plid(), l_errl->reasonCode() ); - errlCommit(l_errl, SBEIO_COMP_ID); - } + // This call will look at what p9_extact_sbe_rc had set the return action to + // checks on how many times we have attempted to boot this side, + // and if we have already tried switching sides + // + // Note this call is important, if this is not called we could end up in a + // endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS + this->bestEffortCheck(); - SBE_TRACF("sbe_boot_fail_handler: We have hit an error in the SBE " - "and hostboot will now attempt to reboot the SBE"); - /*@ - * @errortype - * @severity ERRORLOG::ERRL_SEV_PREDICTIVE - * 
@moduleid SBEIO_EXTRACT_RC_HANDLER - * @reasoncode SBEIO_ATTEMPTING_REBOOT - * @userdata1 HUID of proc which had the SBE timeout - * @userdata2 Current action to be taken on the SBE - * @devdesc HWP has returned a reboot action to be taken - * Hostboot will now attempt to reboot the SBE - * @custdesc A processor in the system has failed to initialize. - * Hostboot is attempting a recovery. - */ - l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_PREDICTIVE, - SBEIO_EXTRACT_RC_HANDLER, - SBEIO_ATTEMPTING_REBOOT, - TARGETING::get_huid(i_target), - this->iv_currentAction); - l_errl->collectTrace("SBEIO_TRACE",KILOBYTE/4); - - // Set the PLID of the error log to caller's PLID if provided - if(iv_callerErrorLogPLID) - { - l_errl->plid(iv_callerErrorLogPLID); - } - errlCommit(l_errl,SBEIO_COMP_ID); - - if(INITSERVICE::spBaseServicesEnabled()) - { #ifndef __HOSTBOOT_RUNTIME - // When we are on an FSP machine, we want to fail out of - // hostboot and give control back to the FSP. They have - // better diagnostics for this type of error. - INITSERVICE::doShutdownWithError(SBEIO_HWSV_COLLECT_SBE_RC, - TARGETING::get_huid(i_target)); + // This could potentially take awhile, reset watchdog + INITSERVICE::sendProgressCode(); #endif - } -#ifndef __HOSTBOOT_RUNTIME - // This could potentially take awhile, reset watchdog - INITSERVICE::sendProgressCode(); -#endif - SBE_TRACF("sbe_boot_fail_handler. 
iv_switchSides count is %llx", - iv_switchSidesCount); - if((this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) && - (iv_switchSidesCount < MAX_SWITCH_SIDE_COUNT)) - { - this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM; - o_needRetry = true; - } - else if(iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) - { - o_needRetry = false; - } - else - { - o_needRetry = true; - } - - } if(l_errl) { SBE_TRACF("Error: sbe_boot_fail_handler : p9_extract_sbe_rc HWP " @@ -964,84 +1000,219 @@ bool SbeRetryHandler::sbe_boot_fail_handler(TARGETING::Target * i_target, errlCommit( l_errl, HWPF_COMP_ID ); } - SBE_TRACF(EXIT_MRK "sbe_boot_fail_handler() current action is %llx", + SBE_TRACF(EXIT_MRK "sbe_run_extract_rc() current action is %llx", this->iv_currentAction); - return o_needRetry; +} + +void SbeRetryHandler::bestEffortCheck() +{ + // We don't want to accept that there is no recovery action just + // because that is what extract_rc is telling us. We want to make + // sure we have tried booting on this seeprom twice, and that we + // have tried the other seeprom twice as well. 
If we have tried all of + // those cases then we will fail out + if(this->iv_currentAction == P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION) + { + if (this->iv_currentSideBootAttempts < MAX_SIDE_BOOT_ATTEMPTS) + { + SBE_TRACF("bestEffortCheck(): suggested action was NO_RECOVERY_ACTION but we are trying RESTART_SBE"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE; + } + else if (this->iv_switchSidesCount < MAX_SWITCH_SIDE_COUNT) + { + SBE_TRACF("bestEffortCheck(): suggested action was NO_RECOVERY_ACTION but we are trying REIPL_BKP_SEEPROM"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM; + } + else + { + // If we have attempted the max boot attempts on current side + // and have already switched sides once, then we will accept + // that we don't know how to recover and pass this status out + } + } + // If we have already switched sides, and extract rc is telling us to + // switch sides again, there is nothing we can do, so change currentAction + // to be NO_RECOVERY_ACTION + else if(this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM || + this->iv_currentAction == P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM ) + { + if (this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) + { + SBE_TRACF("bestEffortCheck(): suggested action was REIPL_BKP_SEEPROM/REIPL_UPD_SEEPROM but that is not possible so changing to NO_RECOVERY_ACTION"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; + } + } + // If the extract sbe rc hwp tells us to restart, and we have already + // done 2 retries on this side, then attempt to switch sides, if we can't + // switch sides, set currentAction to NO_RECOVERY_ACTION + else if(this->iv_currentAction == P9_EXTRACT_SBE_RC::RESTART_SBE || + this->iv_currentAction == P9_EXTRACT_SBE_RC::RESTART_CBS) + { + if (this->iv_currentSideBootAttempts >= MAX_SIDE_BOOT_ATTEMPTS) + { + if (this->iv_switchSidesCount >= MAX_SWITCH_SIDE_COUNT) + { + SBE_TRACF("bestEffortCheck(): suggested action was RESTART_SBE/RESTART_CBS but no 
actions possible so changing to NO_RECOVERY_ACTION"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; + } + else + { + SBE_TRACF("bestEffortCheck(): suggested action was RESTART_SBE/RESTART_CBS but max attempts tried already so changing to REIPL_BKP_SEEPROM"); + this->iv_currentAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM; + } + } + } } errlHndl_t SbeRetryHandler::switch_sbe_sides(TARGETING::Target * i_target) { SBE_TRACF(ENTER_MRK "switch_sbe_sides()"); - errlHndl_t l_errl = NULL; - const uint32_t l_sbeBootSelectMask = SBE::SBE_BOOT_SELECT_MASK >> 32; + errlHndl_t l_errl = nullptr; + TARGETING::ATTR_PROC_SBE_MASTER_CHIP_type l_isMaster = + i_target->getAttr<TARGETING::ATTR_PROC_SBE_MASTER_CHIP>(); + +#ifdef __HOSTBOOT_RUNTIME + const bool l_isRuntime = true; +#else + const bool l_isRuntime = false; +#endif do{ - // Read PERV_SB_CS_FSI_BYTE 0x2820 for target proc - uint32_t l_read_reg = 0; - size_t l_opSize = sizeof(uint32_t); - l_errl = DeviceFW::deviceOp( - DeviceFW::READ, - i_target, - &l_read_reg, - l_opSize, - DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); - if( l_errl ) + if(!l_isRuntime && !l_isMaster) { - SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device read " - "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " - "RC=0x%X, PLID=0x%lX", - PERV_SB_CS_FSI_BYTE, // 0x2820 - TARGETING::get_huid(i_target), - ERRL_GETRC_SAFE(l_errl), - ERRL_GETPLID_SAFE(l_errl)); - break; - } + const uint32_t l_sbeBootSelectMask = SBE::SBE_BOOT_SELECT_MASK >> 32; + // Read PERV_SB_CS_FSI_BYTE 0x2820 for target proc + uint32_t l_read_reg = 0; + size_t l_opSize = sizeof(uint32_t); + l_errl = DeviceFW::deviceOp( + DeviceFW::READ, + i_target, + &l_read_reg, + l_opSize, + DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); + + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device read " + "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_FSI_BYTE, // 0x2820 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + 
ERRL_GETPLID_SAFE(l_errl)); + break; + } - // Determine how boot side is currently set - if(l_read_reg & l_sbeBootSelectMask) // Currently set for Boot Side 1 - { - // Set Boot Side 0 by clearing bit for side 1 - SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X", - iv_switchSidesCount, - TARGETING::get_huid(i_target)); - l_read_reg &= ~l_sbeBootSelectMask; - this->iv_sbeSide = 1; + // Determine how boot side is currently set + if(l_read_reg & l_sbeBootSelectMask) // Currently set for Boot Side 1 + { + // Set Boot Side 0 by clearing bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg &= ~l_sbeBootSelectMask; + } + else // Currently set for Boot Side 0 + { + // Set Boot Side 1 by setting bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg |= l_sbeBootSelectMask; + } + + // Write updated PERV_SB_CS_FSI 0x2820 back into target proc + l_errl = DeviceFW::deviceOp( + DeviceFW::WRITE, + i_target, + &l_read_reg, + l_opSize, + DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write " + "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_FSI_BYTE, // 0x2820 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } } - else // Currently set for Boot Side 0 + else { - // Set Boot Side 1 by setting bit for side 1 - SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X", - iv_switchSidesCount, - TARGETING::get_huid(i_target)); - l_read_reg |= l_sbeBootSelectMask; - this->iv_sbeSide = 0; + // Read PERV_SB_CS_SCOM 0x50008 for target proc + uint64_t l_read_reg = 0; + size_t l_opSize = sizeof(uint64_t); + l_errl = DeviceFW::deviceOp( + DeviceFW::READ, + i_target, + &l_read_reg, + l_opSize, + 
DEVICE_SCOM_ADDRESS(PERV_SB_CS_SCOM) ); + + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: SCOM device read " + "PERV_SB_CS_SCOM (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_SCOM, // 0x50008 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } + + // Determine how boot side is currently set + if(l_read_reg & SBE::SBE_BOOT_SELECT_MASK) // Currently set for Boot Side 1 + { + // Set Boot Side 0 by clearing bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 0 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg &= ~SBE::SBE_BOOT_SELECT_MASK; + } + else // Currently set for Boot Side 0 + { + // Set Boot Side 1 by setting bit for side 1 + SBE_TRACF( "switch_sbe_sides #%d: Set Boot Side 1 for HUID 0x%08X", + iv_switchSidesCount, + TARGETING::get_huid(i_target)); + l_read_reg |= SBE::SBE_BOOT_SELECT_MASK; + } + + // Write updated PERV_SB_CS_SCOM 0x50008 back into target proc + l_errl = DeviceFW::deviceOp( + DeviceFW::WRITE, + i_target, + &l_read_reg, + l_opSize, + DEVICE_SCOM_ADDRESS(PERV_SB_CS_SCOM) ); + if( l_errl ) + { + SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write " + "PERV_SB_CS_SCOM (0x%.4X), proc target = %.8X, " + "RC=0x%X, PLID=0x%lX", + PERV_SB_CS_SCOM, // 0x50008 + TARGETING::get_huid(i_target), + ERRL_GETRC_SAFE(l_errl), + ERRL_GETPLID_SAFE(l_errl)); + break; + } } - SBE_TRACF("switch_sbe_sides(): iv_switchSidesCount is %llx", - iv_switchSidesCount); // Increment switch sides count - ++iv_switchSidesCount; - - // Write updated PERV_SB_CS_FSI 0x2820 back into target proc - l_errl = DeviceFW::deviceOp( - DeviceFW::WRITE, - i_target, - &l_read_reg, - l_opSize, - DEVICE_FSI_ADDRESS(PERV_SB_CS_FSI_BYTE) ); - if( l_errl ) - { - SBE_TRACF( ERR_MRK"switch_sbe_sides: FSI device write " - "PERV_SB_CS_FSI_BYTE (0x%.4X), proc target = %.8X, " - "RC=0x%X, PLID=0x%lX", - PERV_SB_CS_FSI_BYTE, // 0x2820 - TARGETING::get_huid(i_target), - 
ERRL_GETRC_SAFE(l_errl), - ERRL_GETPLID_SAFE(l_errl)); - break; - } + ++(this->iv_switchSidesCount); + + SBE_TRACF("switch_sbe_sides(): iv_switchSidesCount has been incremented to %llx", + iv_switchSidesCount); + + // Since we just switched sides, and we haven't attempted a boot yet, + // set the current attempts for this side to be 0 + this->iv_currentSideBootAttempts = 0; }while(0); // Set the PLID of the error log to caller's PLID, diff --git a/src/usr/sbeio/makefile b/src/usr/sbeio/makefile index da41e8862..fccde6aa4 100644 --- a/src/usr/sbeio/makefile +++ b/src/usr/sbeio/makefile @@ -25,19 +25,11 @@ ROOTPATH = ../../.. PROCEDURES_PATH = ${ROOTPATH}/src/import/chips/p9/procedures MODULE = sbeio -include common/common.mk - -EXTRAINCDIR += ${ROOTPATH}/src/import/hwpf/fapi2/include -EXTRAINCDIR += ${ROOTPATH}/src/include/usr/fapi2 -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils/imageProcs -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/common/utils/imageProcs -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/ffdc -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/lib -EXTRAINCDIR += ${PROCEDURES_PATH}/hwp/sbe/ +# pull in .mk common between sbeio and sbeio_rt +include common/common.mk +# sbeio's unique objects OBJS += sbe_psudd.o OBJS += sbe_utils.o OBJS += sbe_secureHwp.o @@ -56,22 +48,8 @@ OBJS += sbe_getSBEFFDC.o OBJS += sbe_memRegionMgr.o OBJS += sbe_fifo_buffer.o OBJS += sbe_ffdc_package_parser.o -OBJS += ${SBEIO_COMMON_OBJS} - -VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/ -VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/lib/ -VPATH += ${ROOTPATH}/src/usr/sbeio/common - -include ${ROOTPATH}/procedure.rules.mk - -#Not using the ekb mk file because it includes extra files -# that we already include in libfapi2: -# - p9_ppe_utils.o -#include 
${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/p9_extract_sbe_rc.mk -OBJS += p9_extract_sbe_rc.o -OBJS += p9_ppe_common.o - +# sbeio's sub directories SUBDIRS += test.d SUBDIRS += runtime.d diff --git a/src/usr/sbeio/runtime/makefile b/src/usr/sbeio/runtime/makefile index 541ad0b77..37792b554 100644 --- a/src/usr/sbeio/runtime/makefile +++ b/src/usr/sbeio/runtime/makefile @@ -31,41 +31,15 @@ PROCEDURES_PATH = ${ROOTPATH}/src/import/chips/p9/procedures MODULE = sbeio_rt +# pull in .mk common between sbeio and sbeio_rt include ../common/common.mk -EXTRAINCDIR += ${ROOTPATH}/src/import/hwpf/fapi2/include -EXTRAINCDIR += ${ROOTPATH}/src/include/usr/fapi2 -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils/imageProcs -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/common/utils/imageProcs -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/ffdc -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv -EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/lib - -EXTRAINCDIR += ${PROCEDURES_PATH}/hwp/sbe/ - ## Objects unique to HBRT OBJS += rt_sbeio.o OBJS += sbeio_attr_override.o OBJS += sbeio_vital_attn.o -OBJS += ${SBEIO_COMMON_OBJS} - -VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/ -VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/sbe/ -VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/lib/ -VPATH += ../common - -include ${ROOTPATH}/procedure.rules.mk -include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/sbe/p9_get_sbe_msg_register.mk -include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/p9_start_cbs.mk - -#Not using the ekb mk file because it includes extra files -# that we already include in libfapi2: -# - p9_ppe_utils.o -#include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/p9_extract_sbe_rc.mk -OBJS += p9_extract_sbe_rc.o -OBJS += p9_ppe_common.o +## sbeio_rt's sub directories SUBDIRS += test.d include ${ROOTPATH}/config.mk diff --git 
a/src/usr/sbeio/sbe_fifodd.C b/src/usr/sbeio/sbe_fifodd.C index 5906452ec..66b533540 100644 --- a/src/usr/sbeio/sbe_fifodd.C +++ b/src/usr/sbeio/sbe_fifodd.C @@ -47,6 +47,7 @@ #include <sbeio/sbe_sp_intf.H> #include <xscom/piberror.H> #include <sbeio/sbe_retry_handler.H> +#include <initservice/initserviceif.H> extern trace_desc_t* g_trac_sbeio; @@ -657,14 +658,37 @@ errlHndl_t SbeFifo::waitDnFifoReady(TARGETING::Target * i_target, errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_HIGH); - errl->addHwCallout( i_target, - HWAS::SRCI_PRIORITY_HIGH, - HWAS::NO_DECONFIG, - HWAS::GARD_NULL ); + // Keep a copy of the plid so we can pass it to the retry_handler + // so the error logs it creates will be linked + uint32_t l_errPlid = errl->plid(); - //It is likely that the SBE is in a failed state so set up retry handler + // Commit error log now if this is a FSP system because + // we will not return from retry handler + if(INITSERVICE::spBaseServicesEnabled()) + { + errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL ); + ERRORLOG::errlCommit( errl, SBEIO_COMP_ID ); + } + //On open power systems we want to deconfigure the processor + else + { + errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_NULL ); + } + + + // Set the retry handler's mode to be informational, this will run + // p9_extract_rc then TI the system on fsp-systems. + // On open power systems if mode is set to informational we will run + // p9_extract_rc then return back to this function SbeRetryHandler l_SBEobj = SbeRetryHandler( - SbeRetryHandler::SBE_MODE_OF_OPERATION::INFORMATIONAL_ONLY); + SbeRetryHandler::SBE_MODE_OF_OPERATION::INFORMATIONAL_ONLY, + l_errPlid); // Look at the scomSwitch attribute to tell what types // of scoms are going to be used. 
If the SMP is not yet up then we @@ -683,12 +707,7 @@ errlHndl_t SbeFifo::waitDnFifoReady(TARGETING::Target * i_target, l_SBEobj.main_sbe_handler(i_target); - if(l_SBEobj.getPLID()) - { - //tie the error from the sbe retry handler to this error - errl->plid(l_SBEobj.getPLID()); - } - errl->collectTrace(SBEIO_COMP_NAME); + //break out of continuous loop ( should only get here on openPower systems) break; } diff --git a/src/usr/sbeio/sbe_psudd.C b/src/usr/sbeio/sbe_psudd.C index 47be7b7be..d97f34d26 100644 --- a/src/usr/sbeio/sbe_psudd.C +++ b/src/usr/sbeio/sbe_psudd.C @@ -48,6 +48,7 @@ #include <p9_extract_sbe_rc.H> #include <errl/errludlogregister.H> #include <sbeio/sbe_retry_handler.H> +#include <initservice/initserviceif.H> trace_desc_t* g_trac_sbeio; TRAC_INIT(&g_trac_sbeio, SBEIO_COMP_NAME, 6*KILOBYTE, TRACE::BUFFER_SLOW); @@ -528,23 +529,45 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, TARGETING::get_huid(i_target)), i_pPsuRequest->mbxReg0); + // log the failing proc as FFDC + ErrlUserDetailsTarget(i_target).addToLog(l_errl); + l_respRegsFFDC.addToLog(l_errl); + l_errl->collectTrace(SBEIO_COMP_NAME); + + // Keep a copy of the plid so we can pass it to the retry_handler + // so the error logs it creates will be linked + uint32_t l_errPlid = l_errl->plid(); + + // Commit error log now if this is a FSP system because + // we will not return from retry handler + if(INITSERVICE::spBaseServicesEnabled()) + { + l_errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL ); + ERRORLOG::errlCommit( l_errl, SBEIO_COMP_ID ); + } + //On open power systems we want to deconfigure the processor + else + { + l_errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_NULL ); + } + // If the FFDC is empty, this error could be because the SBE // isn't booted correctly. 
We need to check the state of the - // SBE, handle the SBE value, and potentionally try - // to restart the SBE + // SBE. + // If we are on a FSP based system we expect this to result in a TI + // If we are on a BMC based system we expect to return from this fail SbeRetryHandler l_SBEobj = SbeRetryHandler( - SbeRetryHandler::SBE_MODE_OF_OPERATION:: - INFORMATIONAL_ONLY); + SbeRetryHandler::SBE_MODE_OF_OPERATION::INFORMATIONAL_ONLY, + l_errPlid); l_SBEobj.main_sbe_handler(i_target); - if(l_SBEobj.getPLID() != NULL) - { - // If there is not an unrecovered error, we want to tie - // the error from the sbe retry handler to this error. - l_errl->plid(l_SBEobj.getPLID()); - l_errl->setSev(ERRL_SEV_UNRECOVERABLE); - } } else { @@ -591,17 +614,16 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, l_ffdc_parser = nullptr; } - l_errl->addHwCallout( i_target, HWAS::SRCI_PRIORITY_HIGH, HWAS::NO_DECONFIG, HWAS::GARD_NULL ); - } - // log the failing proc as FFDC - ErrlUserDetailsTarget(i_target).addToLog(l_errl); - l_respRegsFFDC.addToLog(l_errl); - l_errl->collectTrace(SBEIO_COMP_NAME); + // log the failing proc as FFDC + ErrlUserDetailsTarget(i_target).addToLog(l_errl); + l_respRegsFFDC.addToLog(l_errl); + l_errl->collectTrace(SBEIO_COMP_NAME); + } MAGIC_INST_GET_SBE_TRACES( i_target->getAttr<TARGETING::ATTR_POSITION>(), diff --git a/src/usr/sbeio/test/sbe_retry_handler_test.H b/src/usr/sbeio/test/sbe_retry_handler_test.H index bfe6808d7..9a3719895 100644 --- a/src/usr/sbeio/test/sbe_retry_handler_test.H +++ b/src/usr/sbeio/test/sbe_retry_handler_test.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2017 */ +/* Contributors Listed Below - COPYRIGHT 2017,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -125,12 +125,12 @@ class SbeRetryHandlerTest : public CxxTest::TestSuite uint32_t l_sbeStarted = l_cpu_target->getAttr< TARGETING::ATTR_SBE_IS_STARTED>(); - if(l_SBEobj.getSbeRestart() && !l_sbeStarted) + if(l_SBEobj.isSbeAtRuntime() && !l_sbeStarted) { TS_FAIL("testSBEStarted: If the class element that " "the SBE started is true, then the SBE attribute also " "needs to be true"); - }else if(!(l_SBEobj.getSbeRestart() && l_sbeStarted)) + }else if(!(l_SBEobj.isSbeAtRuntime() && l_sbeStarted)) { TS_FAIL("testSBEStarted: If the class element " "that the SBE started is false, then the SBE attribute " |

