From f6b8ae93833ca00cdf97864d44393c688212be4d Mon Sep 17 00:00:00 2001 From: Dan Crowell Date: Mon, 28 Jan 2019 10:20:43 -0600 Subject: Do not gard cores on the initial core wakeup failure We have seen rare (but non-zero) errors during slave core wakeup where we never see the new core reporting in. Currently this will result in a visible log and a core gard. However, there is no indication that this failure is actually due to bad hardware. As a workaround, this commit adds an indicator that tracks whether a core has previously failed wakeup. The first time we encounter the error, there will be a visible log with a FW callout and no deconfig or gard of the core. That will trigger a boot failure and a reboot. If we don't fail on the next boot (which is expected), the counter will be cleared. If we do fail again, there will be a visible log (with a new SRC) that calls out the core as the primary cause and also performs a deconfig+gard. Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70992 Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Tested-by: FSP CI Jenkins Tested-by: Jenkins OP HW Reviewed-by: Christian R. Geddes Reviewed-by: William G. Hoffa Reviewed-by: Daniel M. Crowell --- src/include/usr/isteps/istep_reasoncodes.H | 1 + 1 file changed, 1 insertion(+) (limited to 'src/include') diff --git a/src/include/usr/isteps/istep_reasoncodes.H b/src/include/usr/isteps/istep_reasoncodes.H index 610232614..4aa6a78a0 100644 --- a/src/include/usr/isteps/istep_reasoncodes.H +++ b/src/include/usr/isteps/istep_reasoncodes.H @@ -136,6 +136,7 @@ namespace ISTEP RC_RISK_LEVEL_TOO_LOW = ISTEP_COMP_ID | 0x4B, RC_INVALID_HX_KEYWORD_DATA = ISTEP_COMP_ID | 0x4C, RC_PNOR_IPMI_NOT_ENABLED = ISTEP_COMP_ID | 0x4D, + RC_SLAVE_CORE_WAKEUP_ERROR = ISTEP_COMP_ID | 0x4E, }; }; -- cgit v1.2.3