summaryrefslogtreecommitdiffstats
path: root/src/usr/errl
diff options
context:
space:
mode:
authorDean Sanner <dsanner@us.ibm.com>2017-08-14 10:02:43 -0500
committerDaniel M. Crowell <dcrowell@us.ibm.com>2017-09-14 22:29:41 -0400
commit9acfce99596f12dcc60952f8506a77e542609cbf (patch)
treec0053f3d4c74e412f598c7d704da02c4c83bc0de /src/usr/errl
parent16887e07aa54b19b64f8c754d41b6076fe72464f (diff)
downloadtalos-hostboot-9acfce99596f12dcc60952f8506a77e542609cbf.tar.gz
talos-hostboot-9acfce99596f12dcc60952f8506a77e542609cbf.zip
Clear ECC sections marked "clearOnEccErr" on error
- Add the capability for Hostboot to recover (with reboot) when it consumes an ECC error - PNOR layout needs to be updated to flag the recoverable sections (generally cached or throwaway data like *VPD, HBEL, and GUARD partitions) - Upon bad ECC detection, Hostboot will check the partition flag and, if it is set, clear the section and write good ECC to PNOR. It will then throw the normal error and terminate, waiting for the BMC to issue a reboot Change-Id: Ie4f4c0637d3962e9d4871e84a0bda8c256a74440 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44608 Reviewed-by: Stephen M. Cprek <smcprek@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Nicholas E. Bofferding <bofferdn@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/errl')
-rw-r--r--src/usr/errl/errlmanager.C34
1 files changed, 33 insertions, 1 deletions
diff --git a/src/usr/errl/errlmanager.C b/src/usr/errl/errlmanager.C
index 45de9d1fa..17cc11dd6 100644
--- a/src/usr/errl/errlmanager.C
+++ b/src/usr/errl/errlmanager.C
@@ -231,6 +231,20 @@ void * ErrlManager::startup ( void* i_self )
return NULL;
}
+///////////////////////////////////////////////////////////////////////////////
+// ErrlManager::pnorSetupThread() - task entry point; i_self is the ErrlManager
+///////////////////////////////////////////////////////////////////////////////
+void * ErrlManager::pnorSetupThread ( void* i_self )
+{
+ TRACFCOMP( g_trac_errl, ENTER_MRK "ErrlManager::pnorSetupThread..." );
+
+ //Task body: run PNOR setup on the ErrlManager instance passed in as i_self
+ reinterpret_cast<ErrlManager *>(i_self)->setupPnorInfo();
+
+ TRACFCOMP( g_trac_errl, EXIT_MRK "ErrlManager::pnorSetupThread" );
+ return nullptr;
+}
+
///////////////////////////////////////////////////////////////////////////////
// ErrlManager::errlogMsgHndlr()
@@ -250,7 +264,25 @@ void ErrlManager::errlogMsgHndlr ()
case ERRLOG_ACCESS_PNOR_TYPE:
{
// PNOR is up and running now.
- setupPnorInfo();
+ // This can fail if there is bad ECC in HBEL (which is
+ // somewhat common on power faults). Because of this,
+ // trigger this as separate task so message that kills
+ //task on bad ECC doesn't bring down the whole daemon
+ auto l_tid = task_create(ErrlManager::pnorSetupThread,
+ this);
+
+ // status of the task ( OK or Crashed )
+ int l_childsts = 0;
+ auto l_tidretrc = task_wait_tid( l_tid, &l_childsts, 0);
+
+ if ((static_cast<int16_t>(l_tidretrc) < 0 ) ||
+ (l_childsts != TASK_STATUS_EXITED_CLEAN ))
+ {
+ TRACFCOMP(g_trac_errl, ERR_MRK "Failed to setup PNOR; l_tidretrc=0x%x,"
+ " l_childsts=0x%x", l_tidretrc, l_childsts);
+ //Set iv_pnorAddr to nullptr to prevent writes
+ iv_pnorAddr = nullptr;
+ }
//We are done with the msg
msg_free(theMsg);
OpenPOWER on IntegriCloud