summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: mbroyles <mbroyles@us.ibm.com> 2018-06-28 12:23:30 -0500
committer: Martha Broyles <mbroyles@us.ibm.com> 2018-06-29 14:07:25 -0400
commit: bc0c2332263be3b8615c7ff04a3373cfd57f7a57 (patch)
tree: d16174f34f14bcb5d3b34e9d7ca7f47d6d7f21e9
parent: 18583159ffc2ca1b75a5c1491485c7befad93b72 (diff)
download: talos-occ-bc0c2332263be3b8615c7ff04a3373cfd57f7a57.tar.gz
          talos-occ-bc0c2332263be3b8615c7ff04a3373cfd57f7a57.zip
Prevent calling out DIMMs and Centaurs due to GPE issues

Change-Id: If977941c59a60c4e4fcd0d9759a83b3b0d0c5dd9
CQ: SW433604
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/61579
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>

-rwxr-xr-x  src/occ_405/amec/amec_health.c      28
-rw-r--r--  src/occ_405/amec/amec_sensors_fw.c  30
-rwxr-xr-x  src/occ_405/cmdh/cmdh_fsp.h          5
-rwxr-xr-x  src/occ_405/cmdh/cmdh_fsp_cmds.h     2
-rw-r--r--  src/occ_405/occ_service_codes.h      1
-rwxr-xr-x  src/occ_405/occbuildname.c           2
-rwxr-xr-x  src/occ_405/proc/proc_data.c         3
7 files changed, 66 insertions, 5 deletions
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index ce3637a..91b2a28 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -37,6 +37,7 @@
// Externs
//*************************************************************************/
extern bool G_simics_environment;
+extern bool G_log_gpe1_error;
//*************************************************************************/
// Defines/Enums
@@ -391,6 +392,17 @@ void amec_health_check_dimm_timeout()
continue;
}
+ // To prevent DIMMs from incorrectly being called out, don't log errors if there have
+ // been timeouts with GPE1 tasks not finishing
+ if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermaldimm.temp_timeout)
+ {
+ TRAC_ERR("Timed out reading DIMM temperature due to GPE1 issues");
+ // give notification that GPE1 error should now be logged which will reset the OCC
+ G_log_gpe1_error = TRUE;
+ // no reason to check anymore since all DIMMs are collected from the same GPE
+ break;
+ }
+
TRAC_ERR("Timed out reading DIMM%04X temperature (cur_temp[%d] flags[0x%02X])",
(l_port<<8)|l_dimm, l_fru->cur_temp, l_fru->flags);
@@ -435,6 +447,11 @@ void amec_health_check_dimm_timeout()
l_callouts_count++;
}
} //iterate over all dimms
+ if(G_log_gpe1_error)
+ {
+ // Going to be resetting so no reason to check anymore ports
+ break;
+ }
} //iterate over all ports
if(l_err)
@@ -708,6 +725,17 @@ void amec_health_check_cent_timeout()
continue;
}
+ // To prevent Centaurs from incorrectly being called out, don't log errors if there have
+ // been timeouts with GPE1 tasks not finishing
+ if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermalcent.temp_timeout)
+ {
+ TRAC_ERR("Timed out reading centaur temperature due to GPE1 issues");
+ // give notification that GPE1 error should now be logged which will reset the OCC
+ G_log_gpe1_error = TRUE;
+ // no reason to check anymore since all Centaurs are collected from the same GPE
+ break;
+ }
+
TRAC_ERR("Timed out reading centaur temperature on cent[%d] temp[%d] flags[0x%02X]",
l_cent, l_fru->cur_temp, l_fru->flags);
diff --git a/src/occ_405/amec/amec_sensors_fw.c b/src/occ_405/amec/amec_sensors_fw.c
index 6c5a1c0..e4d524b 100644
--- a/src/occ_405/amec/amec_sensors_fw.c
+++ b/src/occ_405/amec/amec_sensors_fw.c
@@ -49,6 +49,7 @@
/* Globals */
/******************************************************************************/
extern bool G_24x7_disabled;
+bool G_log_gpe1_error = FALSE;
//*************************************************************************
// Code
@@ -203,8 +204,10 @@ void task_gpe_timings(task_t * i_task)
else
{
INCREMENT_ERR_HISTORY(ERRH_GPE1_NOT_IDLE);
+ // Log error and request reset if GPE1 issue has gone on long enough to cause real issues
+ // i.e. timeout collecting memory temperatures
- if(L_consec_trace_count[1] < MAX_CONSEC_TRACE)
+ if( (L_consec_trace_count[1] < MAX_CONSEC_TRACE) || (G_log_gpe1_error) )
{
xsr_sprg0.fields.xsr = in32(GPE_GPE1XIXSR);
xsr_sprg0.fields.sprg0 = in32(GPE_GPE1XISPRG0);
@@ -216,6 +219,31 @@ void task_gpe_timings(task_t * i_task)
xsr_sprg0.fields.xsr, iar_xsr.fields.iar,
ir_edr.fields.ir, ir_edr.fields.edr, xsr_sprg0.fields.sprg0);
L_consec_trace_count[1]++;
+
+ if(G_log_gpe1_error)
+ {
+ TRAC_ERR("GPE1 not idle causing timeouts, need to reset!!!");
+ /* @
+ * @errortype
+ * @moduleid AMEC_UPDATE_FW_SENSORS
+ * @reasoncode GPE_REQUEST_TASK_TIMEOUT
+ * @userdata1 0
+ * @userdata2 0
+ * @userdata4 ERC_AMEC_GPE1_TIMEOUT
+ * @devdesc Tasks on GPE1 failing to complete
+ */
+ l_err = createErrl(AMEC_UPDATE_FW_SENSORS, //modId
+ GPE_REQUEST_TASK_TIMEOUT, //reasoncode
+ ERC_AMEC_GPE1_TIMEOUT, //Extended reason code
+ ERRL_SEV_UNRECOVERABLE, //Severity
+ NULL, //Trace Buf
+ DEFAULT_TRACE_SIZE, //Trace Size
+ 0, //userdata1
+ 0); //userdata2
+
+ // Commit error log and request reset
+ REQUEST_RESET(l_err);
+ }
}
}
}
diff --git a/src/occ_405/cmdh/cmdh_fsp.h b/src/occ_405/cmdh/cmdh_fsp.h
index 967a3bc..4570f58 100755
--- a/src/occ_405/cmdh/cmdh_fsp.h
+++ b/src/occ_405/cmdh/cmdh_fsp.h
@@ -310,6 +310,11 @@ extern fsp_cmd_t G_htmgt_cmd_buffer;
extern fsp_rsp_t G_htmgt_rsp_buffer;
extern uint8_t G_rsp_status;
+// Defines for Internal flags used for debug
+// To set flags: tmgtclient -X 0x40 --data 0x1f<4 byte value for G_internal_flags>
+extern uint32_t G_internal_flags;
+#define INT_FLAG_DISABLE_24X7 0x00000001
+
void notifyCmdhWakeupCondition(eCmdhWakeupThreadMask i_cond);
void clearCmdhWakeupCondition(eCmdhWakeupThreadMask i_cond);
int cmdh_thread_wait_for_wakeup(void);
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h
index 7b8e4a5..1f1caa9 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.h
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h
@@ -631,8 +631,6 @@ typedef struct __attribute__ ((packed))
uint8_t checksum[CMDH_FSP_CHECKSUM_SIZE];
}cmdh_dbug_internal_flags_rsp_t;
-extern uint32_t G_internal_flags;
-
//---------------------------------------------------------
// Tunable Parameter Command
//---------------------------------------------------------
diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h
index b212028..ee140d3 100644
--- a/src/occ_405/occ_service_codes.h
+++ b/src/occ_405/occ_service_codes.h
@@ -198,6 +198,7 @@ enum occExtReasonCode
ERC_AMEC_VRM_VDD_TEMP_TIMEOUT = 0x0030,
ERC_AMEC_DIMM_TEMP_TIMEOUT = 0x0031,
ERC_AMEC_CENT_TEMP_TIMEOUT = 0x0032,
+ ERC_AMEC_GPE1_TIMEOUT = 0x0033,
ERC_CMDH_MBOX_REQST_FAILURE = 0x0040,
ERC_CMDH_INTERNAL_FAILURE = 0x0041,
diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c
index cffa109..21c2ecf 100755
--- a/src/occ_405/occbuildname.c
+++ b/src/occ_405/occbuildname.c
@@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =
#else
-volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_180621a\0" /*</BuildName>*/ ;
+volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_180629a\0" /*</BuildName>*/ ;
#endif
diff --git a/src/occ_405/proc/proc_data.c b/src/occ_405/proc/proc_data.c
index f1a0b35..c2eb17f 100755
--- a/src/occ_405/proc/proc_data.c
+++ b/src/occ_405/proc/proc_data.c
@@ -716,7 +716,8 @@ void task_24x7(task_t * i_task)
static uint8_t L_numTicks = 0x00; // never called since OCC started
static bool L_idle_trace = FALSE;
- if (!G_24x7_disabled)
+ // Schedule 24x7 task if it hasn't been disabled
+ if( (!G_24x7_disabled) && !(G_internal_flags & INT_FLAG_DISABLE_24X7) )
{
// Schedule 24x7 task if idle
if (!async_request_is_idle(&G_24x7_request.request))
OpenPOWER on IntegriCloud