diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2019-01-28 10:20:43 -0600 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2019-01-29 16:01:32 -0600 |
commit | f6b8ae93833ca00cdf97864d44393c688212be4d (patch) | |
tree | 0f76ff2253dcd07cddfe3d36e38c96684ae84c1f | |
parent | 5b03768f4279a7f5252baf89a767d7431f56efb7 (diff) | |
download | talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.tar.gz talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.zip |
Do not gard cores on the initial core wakeup failure
We have seen rare (but non-zero) errors during slave core wakeup
where the new core never reports in. Currently this results in
a visible log and a core gard. However, there is no indication
that this failure is actually due to bad hardware.
As a workaround, this commit adds an indicator that tracks
whether a core has previously failed wakeup. The first time we
encounter the error there will be a visible log with a FW callout
and no deconfig or gard of the core. That will trigger a boot
failure and a reboot. If we don't fail on the next boot (which
is expected), the indicator will be cleared. If we do fail again,
there will be a visible log (with a new SRC) that calls out the
core as the primary cause and also does a deconfig+gard.
Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70992
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com>
Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
4 files changed, 115 insertions, 5 deletions
diff --git a/src/include/usr/isteps/istep_reasoncodes.H b/src/include/usr/isteps/istep_reasoncodes.H index 610232614..4aa6a78a0 100644 --- a/src/include/usr/isteps/istep_reasoncodes.H +++ b/src/include/usr/isteps/istep_reasoncodes.H @@ -136,6 +136,7 @@ namespace ISTEP RC_RISK_LEVEL_TOO_LOW = ISTEP_COMP_ID | 0x4B, RC_INVALID_HX_KEYWORD_DATA = ISTEP_COMP_ID | 0x4C, RC_PNOR_IPMI_NOT_ENABLED = ISTEP_COMP_ID | 0x4D, + RC_SLAVE_CORE_WAKEUP_ERROR = ISTEP_COMP_ID | 0x4E, }; }; diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C index e9cea28fb..6d670e167 100644 --- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C +++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C @@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs) } } // End of handle time out error - // Create error log - if (0 != rc) + // Check if this core failed last time + ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = + (*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(); + + // Create predictive error log if this is the first failure + // AND the HWP didn't see a problem + if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) ) { TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "call_host_activate_slave_cores: " @@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs) l_checkidle_eid, rc) ); + // Going to assume some kind of SW error unless it fails + // again + l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH); + // Callout core that failed to wake up. 
l_errl->addHwCallout(*l_core, - HWAS::SRCI_PRIORITY_MED, - HWAS::DECONFIG, - HWAS::GARD_Predictive); + HWAS::SRCI_PRIORITY_LOW, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL); + + // Could be an interrupt issue + l_errl->collectTrace(INTR_TRACE_NAME,256); + + // Throw printk in there too in case it is a kernel issue + ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl); + + // Add interesting ISTEP traces + l_errl->collectTrace(ISTEP_COMP_NAME,256); + + l_stepError.addErrorDetails( l_errl ); + errlCommit( l_errl, HWPF_COMP_ID ); + + // Remember that we failed so we can gard the core if it + // happens again on the reboot + l_prevFail = 1; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + + break; + } + // Create unrecoverable error log if this is a repeat + // OR if the HWP hit something + else if( (0 != rc) && + ((l_prevFail > 0) || (l_checkidle_eid != 0)) ) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "call_host_activate_slave_cores: " + "Core errors during wakeup on core %x", + pir); + /*@ + * @errortype + * @reasoncode RC_SLAVE_CORE_WAKEUP_ERROR + * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES + * @userdata1[00:31] PIR of failing core. + * @userdata2[32:63] Number of previous failures. + * @userdata2[00:31] EID from p9_check_idle_stop_done(). + * @userdata2[32:63] rc of cpu_start_core(). + * + * @devdesc Kernel returned error when trying to activate + * core. + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + MOD_HOST_ACTIVATE_SLAVE_CORES, + RC_SLAVE_CORE_WAKEUP_ERROR, + TWO_UINT32_TO_UINT64( + pir, + l_prevFail), + TWO_UINT32_TO_UINT64( + l_checkidle_eid, + rc) ); + + // Callout and gard core that failed to wake up. 
+ l_errl->addHwCallout(*l_core, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Predictive); // Could be an interrupt issue l_errl->collectTrace(INTR_TRACE_NAME,256); @@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs) l_stepError.addErrorDetails( l_errl ); errlCommit( l_errl, HWPF_COMP_ID ); + + // We garded the core so we should zero out the fail + // counter so the replacement doesn't get blamed + l_prevFail = 0; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + break; } + // Zero out the counter if we passed + else if( l_prevFail > 0 ) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "call_host_activate_slave_cores: " + "Resetting failure count for core %.8X", + TARGETING::get_huid(*l_core) ); + l_prevFail = 0; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + } } } // @@@@@ END CUSTOM BLOCK: @@@@@ diff --git a/src/usr/targeting/common/xmltohb/attribute_types.xml b/src/usr/targeting/common/xmltohb/attribute_types.xml index c31202b92..d75aaca3d 100644 --- a/src/usr/targeting/common/xmltohb/attribute_types.xml +++ b/src/usr/targeting/common/xmltohb/attribute_types.xml @@ -5808,6 +5808,25 @@ </attribute> <attribute> + <description> + Tracks if a specific core has previously experienced a timeout during + initial activation. + 0 = No previous errors reported; + 1 = Core failed on the last attempt to be started + </description> + <id>PREVIOUS_WAKEUP_FAIL</id> + <persistency>non-volatile</persistency> + <readable/> + <writeable/> + <simpleType> + <uint8_t> + <default>0</default> + </uint8_t> + </simpleType> + <no_export/> + </attribute> + + <attribute> <complexType> <description>Structure which defines a target's primary capabilities. 
A target can only support at most FSI SCOM and one of the other two SCOM diff --git a/src/usr/targeting/common/xmltohb/target_types.xml b/src/usr/targeting/common/xmltohb/target_types.xml index 197a591ae..f51a84c23 100644 --- a/src/usr/targeting/common/xmltohb/target_types.xml +++ b/src/usr/targeting/common/xmltohb/target_types.xml @@ -1706,6 +1706,9 @@ <id>PARENT_PERVASIVE</id> </attribute> <attribute> + <id>PREVIOUS_WAKEUP_FAIL</id> + </attribute> + <attribute> <default>CORE</default> <id>TYPE</id> </attribute> |