summaryrefslogtreecommitdiffstats
path: root/src/usr/isteps/istep16/call_host_activate_slave_cores.C
diff options
context:
space:
mode:
authorDan Crowell <dcrowell@us.ibm.com>2019-01-28 10:20:43 -0600
committerDaniel M. Crowell <dcrowell@us.ibm.com>2019-01-29 16:01:32 -0600
commitf6b8ae93833ca00cdf97864d44393c688212be4d (patch)
tree0f76ff2253dcd07cddfe3d36e38c96684ae84c1f /src/usr/isteps/istep16/call_host_activate_slave_cores.C
parent5b03768f4279a7f5252baf89a767d7431f56efb7 (diff)
downloadtalos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.tar.gz
talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.zip
Do not gard cores on the initial core wakeup failure
We have seen rare (but non-zero) errors during slave core wakeup where we never see the new core reporting in. Currently this will result in a visible log and a core gard. However, there is currently no indication this failure is actually due to bad hardware. As a workaround, this commit adds an indicator that keeps track of if a core has failed wakeup previously. The first time we encounter the error there will be a visible log with a FW callout and no deconfig or gard of the core. That will trigger a boot failure and a reboot. If we don't fail on the next boot (which is expected), the counter will be cleared. If we do fail again there will be a visible log (with a new SRC) that calls out the core as the primary cause, plus does a deconfig+gard.

Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70992
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com>
Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/isteps/istep16/call_host_activate_slave_cores.C')
-rw-r--r-- src/usr/isteps/istep16/call_host_activate_slave_cores.C | 97
1 files changed, 92 insertions, 5 deletions
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
index e9cea28fb..6d670e167 100644
--- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C
+++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
@@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs)
}
} // End of handle time out error
- // Create error log
- if (0 != rc)
+ // Check if this core failed last time
+ ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail =
+ (*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>();
+
+ // Create predictive error log if this is the first failure
+ // AND the HWP didn't see a problem
+ if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) )
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
@@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs)
l_checkidle_eid,
rc) );
+ // Going to assume some kind of SW error unless it fails
+ // again
+ l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE,
+ HWAS::SRCI_PRIORITY_HIGH);
+
// Callout core that failed to wake up.
l_errl->addHwCallout(*l_core,
- HWAS::SRCI_PRIORITY_MED,
- HWAS::DECONFIG,
- HWAS::GARD_Predictive);
+ HWAS::SRCI_PRIORITY_LOW,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL);
+
+ // Could be an interrupt issue
+ l_errl->collectTrace(INTR_TRACE_NAME,256);
+
+ // Throw printk in there too in case it is a kernel issue
+ ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);
+
+ // Add interesting ISTEP traces
+ l_errl->collectTrace(ISTEP_COMP_NAME,256);
+
+ l_stepError.addErrorDetails( l_errl );
+ errlCommit( l_errl, HWPF_COMP_ID );
+
+ // Remember that we failed so we can gard the core if it
+ // happens again on the reboot
+ l_prevFail = 1;
+ (*l_core)->
+ setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+
+ break;
+ }
+ // Create unrecoverable error log if this is a repeat
+ // OR if the HWP hit something
+ else if( (0 != rc) &&
+ ((l_prevFail > 0) || (l_checkidle_eid != 0)) )
+ {
+ TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+ "call_host_activate_slave_cores: "
+ "Core errors during wakeup on core %x",
+ pir);
+ /*@
+ * @errortype
+ * @reasoncode RC_SLAVE_CORE_WAKEUP_ERROR
+ * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE
+ * @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES
+ * @userdata1[00:31] PIR of failing core.
+ * @userdata1[32:63] Number of previous failures.
+ * @userdata2[00:31] EID from p9_check_idle_stop_done().
+ * @userdata2[32:63] rc of cpu_start_core().
+ *
+ * @devdesc Kernel returned error when trying to activate
+ * core.
+ */
+ l_errl = new ERRORLOG::ErrlEntry(
+ ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+ MOD_HOST_ACTIVATE_SLAVE_CORES,
+ RC_SLAVE_CORE_WAKEUP_ERROR,
+ TWO_UINT32_TO_UINT64(
+ pir,
+ l_prevFail),
+ TWO_UINT32_TO_UINT64(
+ l_checkidle_eid,
+ rc) );
+
+ // Callout and gard core that failed to wake up.
+ l_errl->addHwCallout(*l_core,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::DECONFIG,
+ HWAS::GARD_Predictive);
// Could be an interrupt issue
l_errl->collectTrace(INTR_TRACE_NAME,256);
@@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs)
l_stepError.addErrorDetails( l_errl );
errlCommit( l_errl, HWPF_COMP_ID );
+
+ // We garded the core so we should zero out the fail
+ // counter so the replacement doesn't get blamed
+ l_prevFail = 0;
+ (*l_core)->
+ setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+
break;
}
+ // Zero out the counter if we passed
+ else if( l_prevFail > 0 )
+ {
+ TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+ "call_host_activate_slave_cores: "
+ "Resetting failure count for core %.8X",
+ TARGETING::get_huid(*l_core) );
+ l_prevFail = 0;
+ (*l_core)->
+ setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+ }
}
}
// @@@@@ END CUSTOM BLOCK: @@@@@
OpenPOWER on IntegriCloud