Do not gard cores on the initial core wakeup failure

We have seen rare (but non-zero) errors during slave core wakeup where we never see the new core reporting in. Currently this will result in a visible log and a core gard. However, there is currently no indication this failure is actually due to bad hardware. As a workaround, this commit adds an indicator that keeps track of whether a core has previously failed wakeup. The first time we encounter the error there will be a visible log with a FW callout and no deconfig or gard of the core. That will trigger a boot failure and a reboot. If we don't fail on the next boot (which is expected), the counter will be cleared. If we do fail again there will be a visible log (with a new SRC) that calls out the core as the primary cause, plus does a deconfig+gard. Change-Id: I3a25537cf9c9c8e0b679519b67e9ae4e3492736d Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70992 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com> Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
author: Dan Crowell <dcrowell@us.ibm.com> 2019-01-28 10:20:43 -0600
committer: Daniel M. Crowell <dcrowell@us.ibm.com> 2019-01-29 16:01:32 -0600
commit: f6b8ae93833ca00cdf97864d44393c688212be4d (patch)
tree: 0f76ff2253dcd07cddfe3d36e38c96684ae84c1f
parent: 5b03768f4279a7f5252baf89a767d7431f56efb7 (diff)
download: talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.tar.gz
talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.zip
4 files changed, 115 insertions, 5 deletions
diff --git a/src/include/usr/isteps/istep_reasoncodes.H b/src/include/usr/isteps/istep_reasoncodes.H
index 610232614..4aa6a78a0 100644
--- a/src/include/usr/isteps/istep_reasoncodes.H
+++ b/src/include/usr/isteps/istep_reasoncodes.H
@@ -136,6 +136,7 @@ namespace ISTEP
         RC_RISK_LEVEL_TOO_LOW                    = ISTEP_COMP_ID | 0x4B,
         RC_INVALID_HX_KEYWORD_DATA               = ISTEP_COMP_ID | 0x4C,
         RC_PNOR_IPMI_NOT_ENABLED                 = ISTEP_COMP_ID | 0x4D,
+        RC_SLAVE_CORE_WAKEUP_ERROR               = ISTEP_COMP_ID | 0x4E,
     };
 
 };
diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
index e9cea28fb..6d670e167 100644
--- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C
+++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C
@@ -179,8 +179,13 @@ void* call_host_activate_slave_cores (void *io_pArgs)
                 }
             } // End of handle time out error
 
-            // Create error log
-            if (0 != rc)
+            // Check if this core failed last time
+            ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail =
+              (*l_core)->getAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>();
+
+            // Create predictive error log if this is the first failure
+            //   AND the HWP didn't see a problem
+            if( (0 != rc) && (l_prevFail == 0) && (l_checkidle_eid == 0) )
             {
                 TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
                         "call_host_activate_slave_cores: "
@@ -208,11 +213,75 @@ void* call_host_activate_slave_cores (void *io_pArgs)
                                  l_checkidle_eid,
                                  rc) );
 
+                // Going to assume some kind of SW error unless it fails
+                //  again
+                l_errl->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE,
+                                             HWAS::SRCI_PRIORITY_HIGH);
+
                 // Callout core that failed to wake up.
                 l_errl->addHwCallout(*l_core,
-                        HWAS::SRCI_PRIORITY_MED,
-                        HWAS::DECONFIG,
-                        HWAS::GARD_Predictive);
+                        HWAS::SRCI_PRIORITY_LOW,
+                        HWAS::NO_DECONFIG,
+                        HWAS::GARD_NULL);
+
+                // Could be an interrupt issue
+                l_errl->collectTrace(INTR_TRACE_NAME,256);
+
+                // Throw printk in there too in case it is a kernel issue
+                ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);
+
+                // Add interesting ISTEP traces
+                l_errl->collectTrace(ISTEP_COMP_NAME,256);
+
+                l_stepError.addErrorDetails( l_errl );
+                errlCommit( l_errl, HWPF_COMP_ID );
+
+                // Remember that we failed so we can gard the core if it
+                //  happens again on the reboot
+                l_prevFail = 1;
+                (*l_core)->
+                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+
+                break;
+            }
+            // Create unrecoverable error log if this is a repeat
+            //  OR if the HWP hit something
+            else if( (0 != rc) &&
+                     ((l_prevFail > 0) || (l_checkidle_eid != 0)) )
+            {
+                TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+                           "call_host_activate_slave_cores: "
+                           "Core errors during wakeup on core %x",
+                           pir);
+                /*@
+                 * @errortype
+                 * @reasoncode  RC_SLAVE_CORE_WAKEUP_ERROR
+                 * @severity    ERRORLOG::ERRL_SEV_UNRECOVERABLE
+                 * @moduleid    MOD_HOST_ACTIVATE_SLAVE_CORES
+                 * @userdata1[00:31]   PIR of failing core.
+                 * @userdata1[32:63]   Number of previous failures.
+                 * @userdata2[00:31]   EID from p9_check_idle_stop_done().
+                 * @userdata2[32:63]   rc of cpu_start_core().
+                 *
+                 * @devdesc Kernel returned error when trying to activate
+                 *          core.
+                 */
+                l_errl = new ERRORLOG::ErrlEntry(
+                               ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+                               MOD_HOST_ACTIVATE_SLAVE_CORES,
+                               RC_SLAVE_CORE_WAKEUP_ERROR,
+                               TWO_UINT32_TO_UINT64(
+                                   pir,
+                                   l_prevFail),
+                               TWO_UINT32_TO_UINT64(
+                                   l_checkidle_eid,
+                                   rc) );
+
+                // Callout and gard core that failed to wake up.
+                l_errl->addHwCallout(*l_core,
+                                     HWAS::SRCI_PRIORITY_HIGH,
+                                     HWAS::DECONFIG,
+                                     HWAS::GARD_Predictive);
 
                 // Could be an interrupt issue
                 l_errl->collectTrace(INTR_TRACE_NAME,256);
@@ -225,8 +294,26 @@ void* call_host_activate_slave_cores (void *io_pArgs)
 
                 l_stepError.addErrorDetails( l_errl );
                 errlCommit( l_errl, HWPF_COMP_ID );
+
+                // We garded the core so we should zero out the fail
+                //  counter so the replacement doesn't get blamed
+                l_prevFail = 0;
+                (*l_core)->
+                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+
                 break;
             }
+            // Zero out the counter if we passed 
+            else if( l_prevFail > 0 )
+            {
+                TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
+                           "call_host_activate_slave_cores: "
+                           "Resetting failure count for core %.8X",
+                           TARGETING::get_huid(*l_core) );
+                l_prevFail = 0;
+                (*l_core)->
+                  setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
+            }
         }
     }
     // @@@@@    END CUSTOM BLOCK:   @@@@@
diff --git a/src/usr/targeting/common/xmltohb/attribute_types.xml b/src/usr/targeting/common/xmltohb/attribute_types.xml
index c31202b92..d75aaca3d 100644
--- a/src/usr/targeting/common/xmltohb/attribute_types.xml
+++ b/src/usr/targeting/common/xmltohb/attribute_types.xml
@@ -5808,6 +5808,25 @@
   </attribute>
 
   <attribute>
+    <description>
+       Tracks if a specific core has previously experienced a timeout during
+       initial activation.
+          0 = No previous errors reported; 
+          1 = Core failed on the last attempt to be started
+    </description>
+    <id>PREVIOUS_WAKEUP_FAIL</id>
+    <persistency>non-volatile</persistency>
+    <readable/>
+    <writeable/>
+    <simpleType>
+      <uint8_t>
+        <default>0</default>
+      </uint8_t>
+    </simpleType>
+    <no_export/>
+  </attribute>
+
+  <attribute>
     <complexType>
       <description>Structure which defines a target's primary capabilities.
         A target can only support at most FSI SCOM and one of the other two SCOM
diff --git a/src/usr/targeting/common/xmltohb/target_types.xml b/src/usr/targeting/common/xmltohb/target_types.xml
index 197a591ae..f51a84c23 100644
--- a/src/usr/targeting/common/xmltohb/target_types.xml
+++ b/src/usr/targeting/common/xmltohb/target_types.xml
@@ -1706,6 +1706,9 @@
       <id>PARENT_PERVASIVE</id>
     </attribute>
     <attribute>
+      <id>PREVIOUS_WAKEUP_FAIL</id>
+    </attribute>
+    <attribute>
       <default>CORE</default>
       <id>TYPE</id>
     </attribute>
author	Dan Crowell <dcrowell@us.ibm.com>	2019-01-28 10:20:43 -0600
committer	Daniel M. Crowell <dcrowell@us.ibm.com>	2019-01-29 16:01:32 -0600
commit	f6b8ae93833ca00cdf97864d44393c688212be4d (patch)
tree	0f76ff2253dcd07cddfe3d36e38c96684ae84c1f
parent	5b03768f4279a7f5252baf89a767d7431f56efb7 (diff)
download	talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.tar.gz talos-hostboot-f6b8ae93833ca00cdf97864d44393c688212be4d.zip