diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2018-04-26 13:01:01 -0500 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2018-05-19 17:56:42 -0400 |
commit | a4e02fc0828910582a08cb1277a30531540d7523 (patch) | |
tree | 130522c4e3683706afef978456945909b4ed8752 /src | |
parent | 798ff0e50b1dd3a9b0ea640faae67bbd871b909c (diff) | |
download | talos-hostboot-a4e02fc0828910582a08cb1277a30531540d7523.tar.gz talos-hostboot-a4e02fc0828910582a08cb1277a30531540d7523.zip |
FFDC enhancements for core activate fails
Adding some more traces to the error log we grab for core
activation failures.
Change-Id: I30c6985060fcffcb3382b775a52e59c08d2b51b7
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57907
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/include/usr/intr/intr_reasoncodes.H | 3 | ||||
-rw-r--r-- | src/usr/intr/intrrp.C | 4 | ||||
-rw-r--r-- | src/usr/isteps/istep16/call_host_activate_slave_cores.C | 20 |
3 files changed, 21 insertions, 6 deletions
diff --git a/src/include/usr/intr/intr_reasoncodes.H b/src/include/usr/intr/intr_reasoncodes.H index 62d8ad87a..4dc3cc496 100644 --- a/src/include/usr/intr/intr_reasoncodes.H +++ b/src/include/usr/intr/intr_reasoncodes.H @@ -27,6 +27,9 @@ #include <hbotcompid.H> +#define INTR_TRACE_NAME INTR_COMP_NAME + + namespace INTR { enum IntrModuleID diff --git a/src/usr/intr/intrrp.C b/src/usr/intr/intrrp.C index c9809b802..fda137259 100644 --- a/src/usr/intr/intrrp.C +++ b/src/usr/intr/intrrp.C @@ -57,8 +57,6 @@ #include <p9n2_misc_scom_addresses_fld.H> #include <util/utilmbox_scratch.H> -#define INTR_TRACE_NAME INTR_COMP_NAME - using namespace INTR; using namespace TARGETING; @@ -3224,7 +3222,7 @@ void* INTR::IntrRp::handleCpuTimeout(void* _pir) msg->data[0] = pir; msg_q_t intr_msgQ = msg_q_resolve(VFS_ROOT_MSG_INTR); - TRACFCOMP( g_trac_intr,"handleCpuTimeout for pir: %lx", pir); + TRACFCOMP( g_trac_intr,"handleCpuTimeout for pir: 0x%lx", pir); do { diff --git a/src/usr/isteps/istep16/call_host_activate_slave_cores.C b/src/usr/isteps/istep16/call_host_activate_slave_cores.C index e18639077..67e6b816b 100644 --- a/src/usr/isteps/istep16/call_host_activate_slave_cores.C +++ b/src/usr/isteps/istep16/call_host_activate_slave_cores.C @@ -49,6 +49,8 @@ #endif #include <scom/scomif.H> +#include <errl/errludprintk.H> +#include <intr/intr_reasoncodes.H> using namespace ERRORLOG; using namespace TARGETING; @@ -121,11 +123,12 @@ void* call_host_activate_slave_cores (void *io_pArgs) int rc = cpu_start_core(pir, en_threads); // Handle time out error + uint32_t l_checkidle_eid = 0; if (-ETIME == rc) { TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "call_host_activate_slave_cores: " - "Time out rc from kernel %d on core %x", + "Time out rc from kernel %d on core 0x%x", rc, pir); @@ -157,6 +160,8 @@ void* call_host_activate_slave_cores (void *io_pArgs) // Create IStep error log l_stepError.addErrorDetails(l_timeout_errl); + l_checkidle_eid = l_timeout_errl->eid(); + // Commit error errlCommit( l_timeout_errl, HWPF_COMP_ID ); } @@ -176,7 +181,8 @@ void* call_host_activate_slave_cores (void *io_pArgs) * @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE * @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES * @userdata1 PIR of failing core. - * @userdata2 rc of cpu_start_core(). + * @userdata2[00:31] EID from p9_check_idle_stop_done(). + * @userdata2[32:63] rc of cpu_start_core(). * * @devdesc Kernel returned error when trying to activate * core. @@ -186,7 +192,9 @@ void* call_host_activate_slave_cores (void *io_pArgs) MOD_HOST_ACTIVATE_SLAVE_CORES, RC_BAD_RC, pir, - rc ); + TWO_UINT32_TO_UINT64( + l_checkidle_eid, + rc) ); // Callout core that failed to wake up. l_errl->addHwCallout(*l_core, @@ -194,6 +202,12 @@ void* call_host_activate_slave_cores (void *io_pArgs) HWAS::DECONFIG, HWAS::GARD_Predictive); + // Could be an interrupt issue + l_errl->collectTrace(INTR_TRACE_NAME,256); + + // Throw printk in there too in case it is a kernel issue + ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl); + l_stepError.addErrorDetails( l_errl ); errlCommit( l_errl, HWPF_COMP_ID ); break; |