summary | refs | log | tree | commit | diff | stats
path: root/src/occ_405/amec
diff options
context:
space:
mode:
author: mbroyles <mbroyles@us.ibm.com> 2018-06-28 12:23:30 -0500
committer: Martha Broyles <mbroyles@us.ibm.com> 2018-06-29 14:07:25 -0400
commit: bc0c2332263be3b8615c7ff04a3373cfd57f7a57 (patch)
tree: d16174f34f14bcb5d3b34e9d7ca7f47d6d7f21e9 /src/occ_405/amec
parent: 18583159ffc2ca1b75a5c1491485c7befad93b72 (diff)
download: talos-occ-bc0c2332263be3b8615c7ff04a3373cfd57f7a57.tar.gz
download: talos-occ-bc0c2332263be3b8615c7ff04a3373cfd57f7a57.zip
Prevent calling out DIMMs and Centaurs due to GPE issues
Change-Id: If977941c59a60c4e4fcd0d9759a83b3b0d0c5dd9
CQ: SW433604
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/61579
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Diffstat (limited to 'src/occ_405/amec')
-rwxr-xr-x  src/occ_405/amec/amec_health.c      28
-rw-r--r--  src/occ_405/amec/amec_sensors_fw.c  30
2 files changed, 57 insertions, 1 deletions
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index ce3637a..91b2a28 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -37,6 +37,7 @@
// Externs
//*************************************************************************/
extern bool G_simics_environment;
+extern bool G_log_gpe1_error;
//*************************************************************************/
// Defines/Enums
@@ -391,6 +392,17 @@ void amec_health_check_dimm_timeout()
continue;
}
+ // To prevent DIMMs from incorrectly being called out, don't log errors if there have
+ // been timeouts with GPE1 tasks not finishing
+ if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermaldimm.temp_timeout)
+ {
+ TRAC_ERR("Timed out reading DIMM temperature due to GPE1 issues");
+ // give notification that GPE1 error should now be logged which will reset the OCC
+ G_log_gpe1_error = TRUE;
+ // no reason to check anymore since all DIMMs are collected from the same GPE
+ break;
+ }
+
TRAC_ERR("Timed out reading DIMM%04X temperature (cur_temp[%d] flags[0x%02X])",
(l_port<<8)|l_dimm, l_fru->cur_temp, l_fru->flags);
@@ -435,6 +447,11 @@ void amec_health_check_dimm_timeout()
l_callouts_count++;
}
} //iterate over all dimms
+ if(G_log_gpe1_error)
+ {
+ // Going to be resetting so no reason to check anymore ports
+ break;
+ }
} //iterate over all ports
if(l_err)
@@ -708,6 +725,17 @@ void amec_health_check_cent_timeout()
continue;
}
+ // To prevent Centaurs from incorrectly being called out, don't log errors if there have
+ // been timeouts with GPE1 tasks not finishing
+ if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermalcent.temp_timeout)
+ {
+ TRAC_ERR("Timed out reading centaur temperature due to GPE1 issues");
+ // give notification that GPE1 error should now be logged which will reset the OCC
+ G_log_gpe1_error = TRUE;
+ // no reason to check anymore since all Centaurs are collected from the same GPE
+ break;
+ }
+
TRAC_ERR("Timed out reading centaur temperature on cent[%d] temp[%d] flags[0x%02X]",
l_cent, l_fru->cur_temp, l_fru->flags);
diff --git a/src/occ_405/amec/amec_sensors_fw.c b/src/occ_405/amec/amec_sensors_fw.c
index 6c5a1c0..e4d524b 100644
--- a/src/occ_405/amec/amec_sensors_fw.c
+++ b/src/occ_405/amec/amec_sensors_fw.c
@@ -49,6 +49,7 @@
/* Globals */
/******************************************************************************/
extern bool G_24x7_disabled;
+bool G_log_gpe1_error = FALSE;
//*************************************************************************
// Code
@@ -203,8 +204,10 @@ void task_gpe_timings(task_t * i_task)
else
{
INCREMENT_ERR_HISTORY(ERRH_GPE1_NOT_IDLE);
+ // Log error and request reset if GPE1 issue has gone on long enough to cause real issues
+ // i.e. timeout collecting memory temperatures
- if(L_consec_trace_count[1] < MAX_CONSEC_TRACE)
+ if( (L_consec_trace_count[1] < MAX_CONSEC_TRACE) || (G_log_gpe1_error) )
{
xsr_sprg0.fields.xsr = in32(GPE_GPE1XIXSR);
xsr_sprg0.fields.sprg0 = in32(GPE_GPE1XISPRG0);
@@ -216,6 +219,31 @@ void task_gpe_timings(task_t * i_task)
xsr_sprg0.fields.xsr, iar_xsr.fields.iar,
ir_edr.fields.ir, ir_edr.fields.edr, xsr_sprg0.fields.sprg0);
L_consec_trace_count[1]++;
+
+ if(G_log_gpe1_error)
+ {
+ TRAC_ERR("GPE1 not idle causing timeouts, need to reset!!!");
+ /* @
+ * @errortype
+ * @moduleid AMEC_UPDATE_FW_SENSORS
+ * @reasoncode GPE_REQUEST_TASK_TIMEOUT
+ * @userdata1 0
+ * @userdata2 0
+ * @userdata4 ERC_AMEC_GPE1_TIMEOUT
+ * @devdesc Tasks on GPE1 failing to complete
+ */
+ l_err = createErrl(AMEC_UPDATE_FW_SENSORS, //modId
+ GPE_REQUEST_TASK_TIMEOUT, //reasoncode
+ ERC_AMEC_GPE1_TIMEOUT, //Extended reason code
+ ERRL_SEV_UNRECOVERABLE, //Severity
+ NULL, //Trace Buf
+ DEFAULT_TRACE_SIZE, //Trace Size
+ 0, //userdata1
+ 0); //userdata2
+
+ // Commit error log and request reset
+ REQUEST_RESET(l_err);
+ }
}
}
}
OpenPOWER on IntegriCloud