diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2019-01-28 10:20:43 -0600 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2019-01-29 16:01:32 -0600 |
commit | f6b8ae93833ca00cdf97864d44393c688212be4d (patch) | |
tree | 0f76ff2253dcd07cddfe3d36e38c96684ae84c1f /src/usr/isteps/istep16/call_host_activate_slave_cores.C | |
parent | 5b03768f4279a7f5252baf89a767d7431f56efb7 (diff) | |
download | talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.tar.gz talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.zip |
Do not gard cores on the initial core wakeup failure
We have seen rare (but non-zero) errors during slave core wakeup
where we never see the new core reporting in. Currently this
will result in a visible log and a core gard. However, there is
currently no indication this failure is actually due to bad
hardware.
As a workaround, this commit adds an indicator that keeps track
of whether a core has failed wakeup previously. The first time we
encounter the error there will be a visible log with a FW callout
and no deconfig or gard of the core. That will trigger a boot
failure and a reboot. If we don't fail on the next boot (which
is expected), the counter will be cleared. If we do fail again
there will be a visible log (with a new SRC) that calls out the
core as the primary cause, plus does a deconfig+gard.
Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70992
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com>
Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/isteps/istep16/call_host_activate_slave_cores.C')
-rw-r--r-- | src/usr/isteps/istep16/call_host_activate_slave_cores.C | 97 |
1 files changed, 92 insertions, 5 deletions
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C index e9cea28fb..6d670e167 100644 --- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C +++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C @@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs) } } // End of handle time out error - // Create error log - if (0 != rc) + // Check if this core failed last time + ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = + (*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(); + + // Create predictive error log if this is the first failure + // AND the HWP didn't see a problem + if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) ) { TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "call_host_activate_slave_cores: " @@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs) l_checkidle_eid, rc) ); + // Going to assume some kind of SW error unless it fails + // again + l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH); + // Callout core that failed to wake up. 
l_errl->addHwCallout(*l_core, - HWAS::SRCI_PRIORITY_MED, - HWAS::DECONFIG, - HWAS::GARD_Predictive); + HWAS::SRCI_PRIORITY_LOW, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL); + + // Could be an interrupt issue + l_errl->collectTrace(INTR_TRACE_NAME,256); + + // Throw printk in there too in case it is a kernel issue + ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl); + + // Add interesting ISTEP traces + l_errl->collectTrace(ISTEP_COMP_NAME,256); + + l_stepError.addErrorDetails( l_errl ); + errlCommit( l_errl, HWPF_COMP_ID ); + + // Remember that we failed so we can gard the core if it + // happens again on the reboot + l_prevFail = 1; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + + break; + } + // Create unrecoverable error log if this is a repeat + // OR if the HWP hit something + else if( (0 != rc) && + ((l_prevFail > 0) || (l_checkidle_eid != 0)) ) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "call_host_activate_slave_cores: " + "Core errors during wakeup on core %x", + pir); + /*@ + * @errortype + * @reasoncode RC_SLAVE_CORE_WAKEUP_ERROR + * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES + * @userdata1[00:31] PIR of failing core. + * @userdata2[32:63] Number of previous failures. + * @userdata2[00:31] EID from p9_check_idle_stop_done(). + * @userdata2[32:63] rc of cpu_start_core(). + * + * @devdesc Kernel returned error when trying to activate + * core. + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + MOD_HOST_ACTIVATE_SLAVE_CORES, + RC_SLAVE_CORE_WAKEUP_ERROR, + TWO_UINT32_TO_UINT64( + pir, + l_prevFail), + TWO_UINT32_TO_UINT64( + l_checkidle_eid, + rc) ); + + // Callout and gard core that failed to wake up. 
+ l_errl->addHwCallout(*l_core, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Predictive); // Could be an interrupt issue l_errl->collectTrace(INTR_TRACE_NAME,256); @@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs) l_stepError.addErrorDetails( l_errl ); errlCommit( l_errl, HWPF_COMP_ID ); + + // We garded the core so we should zero out the fail + // counter so the replacement doesn't get blamed + l_prevFail = 0; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + break; } + // Zero out the counter if we passed + else if( l_prevFail > 0 ) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "call_host_activate_slave_cores: " + "Resetting failure count for core %.8X", + TARGETING::get_huid(*l_core) ); + l_prevFail = 0; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + } } } // @@@@@ END CUSTOM BLOCK: @@@@@ |