summaryrefslogtreecommitdiffstats
path: root/src/usr/isteps/istep16/call_host_activate_slave_cores.C
diff options
context:
space:
mode:
author: Dan Crowell <dcrowell@us.ibm.com> 2019-02-04 13:42:30 -0600
committer: Daniel M. Crowell <dcrowell@us.ibm.com> 2019-03-06 20:45:24 -0600
commit: e73d0c117548b97422485a8435b84de00d4c7ca9 (patch)
tree: 42f96981e16d514ac35a42ce519ecffc049b799a /src/usr/isteps/istep16/call_host_activate_slave_cores.C
parent: 73fc80f05f076a1270b01a4e73d4d510d03ff2a3 (diff)
download: talos-hostboot-e73d0c117548b97422485a8435b84de00d4c7ca9.tar.gz
download: talos-hostboot-e73d0c117548b97422485a8435b84de00d4c7ca9.zip
Force reboot without visible errors for core wakeup failure
The intermittent core wakeup failure continues to plague us with no solution in sight. Since the error is extremely rare (less than 1% of boots), we have decided to force a manual reboot and not log any visible errors to the customer.

Change-Id: Ic30f6330431bd2c8ce75075befc2c36d278d8152
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/71319
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/72921
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/isteps/istep16/call_host_activate_slave_cores.C')
-rw-r--r-- src/usr/isteps/istep16/call_host_activate_slave_cores.C | 43
1 files changed, 34 insertions, 9 deletions
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
index 6d670e167..c5f941e8f 100644
--- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C
+++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
@@ -51,6 +51,7 @@
#include <scom/scomif.H>
#include <errl/errludprintk.H>
#include <intr/intr_reasoncodes.H>
+#include <initservice/istepdispatcherif.H>
using namespace ERRORLOG;
using namespace TARGETING;
@@ -84,6 +85,9 @@ void* call_host_activate_slave_cores (void *io_pArgs)
assert( sys != NULL );
uint32_t l_numCores = 0;
+ // keep track of which cores started
+ TargetHandleList l_startedCores;
+
for(TargetHandleList::const_iterator
l_core = l_cores.begin();
l_core != l_cores.end();
@@ -233,7 +237,8 @@ void* call_host_activate_slave_cores (void *io_pArgs)
// Add interesting ISTEP traces
l_errl->collectTrace(ISTEP_COMP_NAME,256);
- l_stepError.addErrorDetails( l_errl );
+ // Choosing to ignore this intermittent error
+ l_errl->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL);
errlCommit( l_errl, HWPF_COMP_ID );
// Remember that we failed so we can gard the core if it
@@ -242,6 +247,14 @@ void* call_host_activate_slave_cores (void *io_pArgs)
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+#ifdef CONFIG_BMC_IPMI
+ // Initiate a graceful power cycle
+ CONSOLE::displayf(ISTEP_COMP_NAME, "System Rebooting To Retry Recoverable Error");
+ CONSOLE::flush();
+ TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,"call_host_activate_slave_cores: requesting power cycle");
+ INITSERVICE::requestReboot();
+#endif
+
break;
}
// Create unrecoverable error log if this is a repeat
@@ -306,17 +319,29 @@ void* call_host_activate_slave_cores (void *io_pArgs)
// Zero out the counter if we passed
else if( l_prevFail > 0 )
{
- TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
- "call_host_activate_slave_cores: "
- "Resetting failure count for core %.8X",
- TARGETING::get_huid(*l_core) );
- l_prevFail = 0;
- (*l_core)->
- setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+ // Add to the list of passing cores so we can
+ // clear ATTR_PREVIOUS_WAKEUP_FAIL later
+ l_startedCores.push_back(*l_core);
}
}
}
- // @@@@@ END CUSTOM BLOCK: @@@@@
+
+ // Clear out the wakeup_fail indicators only after every core has passed.
+ // Doing this outside the loop helps mitigate the (unlikely) case where
+ // a failure bounces between different cores on several consecutive boots.
+ for(TargetHandleList::const_iterator
+ l_core = l_startedCores.begin();
+ l_core != l_startedCores.end();
+ ++l_core)
+ {
+ TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+ "call_host_activate_slave_cores: "
+ "Resetting failure count for core %.8X",
+ TARGETING::get_huid(*l_core) );
+ ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = 0;
+ (*l_core)->
+ setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+ }
#if defined(CONFIG_IPLTIME_CHECKSTOP_ANALYSIS) && !defined(__HOSTBOOT_RUNTIME)
if( l_stepError.isNull() )
OpenPOWER on IntegriCloud