From bc0c2332263be3b8615c7ff04a3373cfd57f7a57 Mon Sep 17 00:00:00 2001 From: mbroyles Date: Thu, 28 Jun 2018 12:23:30 -0500 Subject: Prevent calling out DIMMs and Centaurs due to GPE issues Change-Id: If977941c59a60c4e4fcd0d9759a83b3b0d0c5dd9 CQ: SW433604 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/61579 Tested-by: FSP CI Jenkins Reviewed-by: William A. Bryan Reviewed-by: Andres A. Lugo-Reyes Reviewed-by: Martha Broyles --- src/occ_405/amec/amec_health.c | 28 ++++++++++++++++++++++++++++ src/occ_405/amec/amec_sensors_fw.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) (limited to 'src/occ_405/amec') diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index ce3637a..91b2a28 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -37,6 +37,7 @@ // Externs //*************************************************************************/ extern bool G_simics_environment; +extern bool G_log_gpe1_error; //*************************************************************************/ // Defines/Enums @@ -391,6 +392,17 @@ void amec_health_check_dimm_timeout() continue; } + // To prevent DIMMs from incorrectly being called out, don't log errors if there have + // been timeouts with GPE1 tasks not finishing + if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermaldimm.temp_timeout) + { + TRAC_ERR("Timed out reading DIMM temperature due to GPE1 issues"); + // give notification that GPE1 error should now be logged which will reset the OCC + G_log_gpe1_error = TRUE; + // no reason to check anymore since all DIMMs are collected from the same GPE + break; + } + TRAC_ERR("Timed out reading DIMM%04X temperature (cur_temp[%d] flags[0x%02X])", (l_port<<8)|l_dimm, l_fru->cur_temp, l_fru->flags); @@ -435,6 +447,11 @@ void amec_health_check_dimm_timeout() l_callouts_count++; } } //iterate over all dimms + if(G_log_gpe1_error) + { + // Going to be resetting so no reason to check anymore ports + break; + } } //iterate over all ports if(l_err) @@ -708,6 +725,17 @@ void amec_health_check_cent_timeout() continue; } + // To prevent Centaurs from incorrectly being called out, don't log errors if there have + // been timeouts with GPE1 tasks not finishing + if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermalcent.temp_timeout) + { + TRAC_ERR("Timed out reading centaur temperature due to GPE1 issues"); + // give notification that GPE1 error should now be logged which will reset the OCC + G_log_gpe1_error = TRUE; + // no reason to check anymore since all Centaurs are collected from the same GPE + break; + } + TRAC_ERR("Timed out reading centaur temperature on cent[%d] temp[%d] flags[0x%02X]", l_cent, l_fru->cur_temp, l_fru->flags); diff --git a/src/occ_405/amec/amec_sensors_fw.c b/src/occ_405/amec/amec_sensors_fw.c index 6c5a1c0..e4d524b 100644 --- a/src/occ_405/amec/amec_sensors_fw.c +++ b/src/occ_405/amec/amec_sensors_fw.c @@ -49,6 +49,7 @@ /* Globals */ /******************************************************************************/ extern bool G_24x7_disabled; +bool G_log_gpe1_error = FALSE; //************************************************************************* // Code @@ -203,8 +204,10 @@ void task_gpe_timings(task_t * i_task) else { INCREMENT_ERR_HISTORY(ERRH_GPE1_NOT_IDLE); + // Log error and request reset if GPE1 issue has gone on long enough to cause real issues + // i.e. timeout collecting memory temperatures - if(L_consec_trace_count[1] < MAX_CONSEC_TRACE) + if( (L_consec_trace_count[1] < MAX_CONSEC_TRACE) || (G_log_gpe1_error) ) { xsr_sprg0.fields.xsr = in32(GPE_GPE1XIXSR); xsr_sprg0.fields.sprg0 = in32(GPE_GPE1XISPRG0); @@ -216,6 +219,31 @@ void task_gpe_timings(task_t * i_task) xsr_sprg0.fields.xsr, iar_xsr.fields.iar, ir_edr.fields.ir, ir_edr.fields.edr, xsr_sprg0.fields.sprg0); L_consec_trace_count[1]++; + + if(G_log_gpe1_error) + { + TRAC_ERR("GPE1 not idle causing timeouts, need to reset!!!"); + /* @ + * @errortype + * @moduleid AMEC_UPDATE_FW_SENSORS + * @reasoncode GPE_REQUEST_TASK_TIMEOUT + * @userdata1 0 + * @userdata2 0 + * @userdata4 ERC_AMEC_GPE1_TIMEOUT + * @devdesc Tasks on GPE1 failing to complete + */ + l_err = createErrl(AMEC_UPDATE_FW_SENSORS, //modId + GPE_REQUEST_TASK_TIMEOUT, //reasoncode + ERC_AMEC_GPE1_TIMEOUT, //Extended reason code + ERRL_SEV_UNRECOVERABLE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + 0, //userdata1 + 0); //userdata2 + + // Commit error log and request reset + REQUEST_RESET(l_err); + } } } } -- cgit v1.2.1