diff options
author | Dean Sanner <dsanner@us.ibm.com> | 2017-08-14 10:02:43 -0500 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2017-09-14 22:29:41 -0400 |
commit | 9acfce99596f12dcc60952f8506a77e542609cbf (patch) | |
tree | c0053f3d4c74e412f598c7d704da02c4c83bc0de /src/usr/errl | |
parent | 16887e07aa54b19b64f8c754d41b6076fe72464f (diff) | |
download | talos-hostboot-9acfce99596f12dcc60952f8506a77e542609cbf.tar.gz talos-hostboot-9acfce99596f12dcc60952f8506a77e542609cbf.zip |
Clear ECC sections marked "clearOnEccErr" on error
- Add the capability for Hostboot to recover (with reboot)
when it consumes an ECC error
- PNOR layout needs to be updated to flag the recoverable
sections (generally cached or throwaway data like *VPD,
HBEL, and GUARD partitions)
- Upon bad ECC detection, Hostboot will check the partition
flag and, if it is set, clear the section and write good ECC to PNOR.
It will then throw the normal error and terminate, waiting
for the BMC to issue a reboot
Change-Id: Ie4f4c0637d3962e9d4871e84a0bda8c256a74440
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44608
Reviewed-by: Stephen M. Cprek <smcprek@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Nicholas E. Bofferding <bofferdn@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/errl')
-rw-r--r-- | src/usr/errl/errlmanager.C | 34 |
1 files changed, 33 insertions, 1 deletions
diff --git a/src/usr/errl/errlmanager.C b/src/usr/errl/errlmanager.C index 45de9d1fa..17cc11dd6 100644 --- a/src/usr/errl/errlmanager.C +++ b/src/usr/errl/errlmanager.C @@ -231,6 +231,20 @@ void * ErrlManager::startup ( void* i_self ) return NULL; } +/////////////////////////////////////////////////////////////////////////////// +// ErrlManager::pnorSetupThread() +/////////////////////////////////////////////////////////////////////////////// +void * ErrlManager::pnorSetupThread ( void* i_self ) +{ + TRACFCOMP( g_trac_errl, ENTER_MRK "ErrlManager::pnorSetupThread..." ); + + //Start a thread to deal with PNOR setup + reinterpret_cast<ErrlManager *>(i_self)->setupPnorInfo(); + + TRACFCOMP( g_trac_errl, EXIT_MRK "ErrlManager::pnorSetupThread" ); + return nullptr; +} + /////////////////////////////////////////////////////////////////////////////// // ErrlManager::errlogMsgHndlr() @@ -250,7 +264,25 @@ void ErrlManager::errlogMsgHndlr () case ERRLOG_ACCESS_PNOR_TYPE: { // PNOR is up and running now. - setupPnorInfo(); + // This can fail if there is bad ECC in HBEL (which is + // somewhat common on power faults). Because of this, + // trigger this as separate task so message that kills + //task on bad ECC doesn't bring down the whole daemon + auto l_tid = task_create(ErrlManager::pnorSetupThread, + this); + + // status of the task ( OK or Crashed ) + int l_childsts = 0; + auto l_tidretrc = task_wait_tid( l_tid, &l_childsts, 0); + + if ((static_cast<int16_t>(l_tidretrc) < 0 ) || + (l_childsts != TASK_STATUS_EXITED_CLEAN )) + { + TRACFCOMP(g_trac_errl, ERR_MRK "Failed to setup PNOR; l_tidretrc=0x%x," + " l_childsts=0x%x", l_tidretrc, l_childsts); + //Set iv_pnorAddr to nullptr to prevent writes + iv_pnorAddr = nullptr; + } //We are done with the msg msg_free(theMsg); |