diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2019-01-28 10:20:43 -0600 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2019-01-29 16:01:32 -0600 |
commit | f6b8ae93833ca00cdf97864d44393c688212be4d (patch) | |
tree | 0f76ff2253dcd07cddfe3d36e38c96684ae84c1f /src/usr/isteps/istep16/call_host_activate_slave_cores.C | |
parent | 5b03768f4279a7f5252baf89a767d7431f56efb7 (diff) | |
download | talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.tar.gz talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.zip |
Do not gard cores on the initial core wakeup failure
We have seen rare (but non-zero) errors during slave core wakeup
where we never see the new core reporting in. Currently this
will result in a visible log and a core gard. However, there is
currently no indication this failure is actually due to bad
hardware.
As a workaround, this commit adds an indicator that keeps track
of whether a core has failed wakeup previously. The first time we
encounter the error there will be a visible log with a FW callout
and no deconfig or gard of the core. That will trigger a boot
failure and a reboot. If we don't fail on the next boot (which
is expected), the counter will be cleared. If we do fail again
there will be a visible log (with a new SRC) that calls out the
core as the primary cause, plus does a deconfig+gard.
Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70992
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com>
Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/isteps/istep16/call_host_activate_slave_cores.C')
-rw-r--r-- | src/usr/isteps/istep16/call_host_activate_slave_cores.C | 97 |
1 files changed, 92 insertions, 5 deletions
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C index e9cea28fb..6d670e167 100644 --- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C +++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C @@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs) } } // End of handle time out error - // Create error log - if (0 != rc) + // Check if this core failed last time + ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = + (*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(); + + // Create predictive error log if this is the first failure + // AND the HWP didn't see a problem + if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) ) { TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "call_host_activate_slave_cores: " @@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs) l_checkidle_eid, rc) ); + // Going to assume some kind of SW error unless it fails + // again + l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH); + // Callout core that failed to wake up. 
l_errl->addHwCallout(*l_core, - HWAS::SRCI_PRIORITY_MED, - HWAS::DECONFIG, - HWAS::GARD_Predictive); + HWAS::SRCI_PRIORITY_LOW, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL); + + // Could be an interrupt issue + l_errl->collectTrace(INTR_TRACE_NAME,256); + + // Throw printk in there too in case it is a kernel issue + ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl); + + // Add interesting ISTEP traces + l_errl->collectTrace(ISTEP_COMP_NAME,256); + + l_stepError.addErrorDetails( l_errl ); + errlCommit( l_errl, HWPF_COMP_ID ); + + // Remember that we failed so we can gard the core if it + // happens again on the reboot + l_prevFail = 1; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + + break; + } + // Create unrecoverable error log if this is a repeat + // OR if the HWP hit something + else if( (0 != rc) && + ((l_prevFail > 0) || (l_checkidle_eid != 0)) ) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "call_host_activate_slave_cores: " + "Core errors during wakeup on core %x", + pir); + /*@ + * @errortype + * @reasoncode RC_SLAVE_CORE_WAKEUP_ERROR + * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES + * @userdata1[00:31] PIR of failing core. + * @userdata2[32:63] Number of previous failures. + * @userdata2[00:31] EID from p9_check_idle_stop_done(). + * @userdata2[32:63] rc of cpu_start_core(). + * + * @devdesc Kernel returned error when trying to activate + * core. + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + MOD_HOST_ACTIVATE_SLAVE_CORES, + RC_SLAVE_CORE_WAKEUP_ERROR, + TWO_UINT32_TO_UINT64( + pir, + l_prevFail), + TWO_UINT32_TO_UINT64( + l_checkidle_eid, + rc) ); + + // Callout and gard core that failed to wake up. 
+ l_errl->addHwCallout(*l_core, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Predictive); // Could be an interrupt issue l_errl->collectTrace(INTR_TRACE_NAME,256); @@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs) l_stepError.addErrorDetails( l_errl ); errlCommit( l_errl, HWPF_COMP_ID ); + + // We garded the core so we should zero out the fail + // counter so the replacement doesn't get blamed + l_prevFail = 0; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + break; } + // Zero out the counter if we passed + else if( l_prevFail > 0 ) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "call_host_activate_slave_cores: " + "Resetting failure count for core %.8X", + TARGETING::get_huid(*l_core) ); + l_prevFail = 0; + (*l_core)-> + setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail); + } } } // @@@@@ END CUSTOM BLOCK: @@@@@ |