diff options
author | Chris Cain <cjcain@us.ibm.com> | 2017-08-23 15:18:24 -0500 |
---|---|---|
committer | Christopher J. Cain <cjcain@us.ibm.com> | 2017-08-25 14:41:28 -0400 |
commit | df326632a2cc6be49523b32fd034a95915e76898 (patch) | |
tree | 165d56f3289547d2d9decbf9ca6cee056c00544b /src/occ_405 | |
parent | 3f57751abd8ca0308e3938dc86d5a313b7599ebc (diff) | |
download | talos-occ-df326632a2cc6be49523b32fd034a95915e76898.tar.gz talos-occ-df326632a2cc6be49523b32fd034a95915e76898.zip |
Only call out DIMMs when health monitor time has expired
Previously, the OCC would call out a DIMM after 2 consecutive
I2C failures while trying to read DIMM temperatures. The health monitor
already has code to handle the timeout, so the OCC now just keeps retrying
on failures and leaves the callout to the health monitor.
- Remove the 60-second delay before starting to read DIMM temps,
since SW398808 should resolve the lock problem.
- Add a debug command to retrieve the GPE0/GPE1 trace buffers.
Change-Id: I65156347e24ff8e68414a64aaf7e00ff4c12a2f8
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45073
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Diffstat (limited to 'src/occ_405')
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds.c | 39 | ||||
-rwxr-xr-x | src/occ_405/dimm/dimm.c | 94 | ||||
-rwxr-xr-x | src/occ_405/lock/lock.c | 2 | ||||
-rwxr-xr-x | src/occ_405/occbuildname.c | 2 |
4 files changed, 75 insertions, 62 deletions
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index cb3835c..c6802ee 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -48,12 +48,14 @@ #include "homer.h" #include <centaur_data.h> #include <avsbus.h> -#include "cmdh_dbug_cmd.h" #include "wof.h" #include "sensor_main_memory.h" extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap; extern bool G_vrm_thermal_monitoring; +#include <gpe_export.h> +extern gpe_shared_data_t G_shared_gpe_data; + // This table contains tunable parameter information that can be exposed to // customers (only Master OCC should access/control this table) cmdh_tunable_param_table_t G_mst_tunable_parameter_table[CMDH_DEFAULT_TUNABLE_PARAM_NUM] = @@ -889,9 +891,34 @@ void cmdh_dbug_get_trace (const cmdh_fsp_cmd_t * i_cmd_ptr, cmdh_dbug_get_trace_query_t *l_get_trace_query_ptr = (cmdh_dbug_get_trace_query_t*) i_cmd_ptr; cmdh_dbug_get_trace_resp_t *l_get_trace_resp_ptr = (cmdh_dbug_get_trace_resp_t*) o_rsp_ptr; - const trace_descriptor_array_t* l_trace_ptr = TRAC_get_td((char *)l_get_trace_query_ptr->comp); - l_rc = TRAC_get_buffer_partial(l_trace_ptr, l_get_trace_resp_ptr->data,&l_trace_buffer_size); - l_trace_size = l_trace_buffer_size; + if (memcmp((char *)l_get_trace_query_ptr->comp, "GP", 2) == 0) + { + // Return a GPE0/GPE1 trace buffer + if (l_get_trace_query_ptr->comp[2] == '0') + { + if (G_shared_gpe_data.gpe0_tb_ptr != 0) + { + l_trace_size = G_shared_gpe_data.gpe0_tb_sz; + memcpy(l_get_trace_resp_ptr->data, (uint8_t*)G_shared_gpe_data.gpe0_tb_ptr, (size_t)l_trace_size); + } + } + else if (l_get_trace_query_ptr->comp[2] == '1') + { + if (G_shared_gpe_data.gpe0_tb_ptr != 0) + { + l_trace_size = G_shared_gpe_data.gpe1_tb_sz; + memcpy(l_get_trace_resp_ptr->data, (uint8_t*)G_shared_gpe_data.gpe1_tb_ptr, (size_t)l_trace_size); + } + } + else l_rc = 255; + } + else + { + // Return a 405 trace buffer + const trace_descriptor_array_t* l_trace_ptr = TRAC_get_td((char 
*)l_get_trace_query_ptr->comp); + l_rc = TRAC_get_buffer_partial(l_trace_ptr, l_get_trace_resp_ptr->data,&l_trace_buffer_size); + l_trace_size = l_trace_buffer_size; + } if(l_rc==0) { G_rsp_status = ERRL_RC_SUCCESS; @@ -1924,7 +1951,7 @@ uint8_t cmdh_set_user_pcap_common(uint16_t i_pcap, //Indicate there is new PCAP data available G_master_pcap_data.pcap_data_count++; - // if user pcap was just disabled set source to 0 (no user pcap) + // if user pcap was just disabled set source to 0 (no user pcap) if(i_pcap == 0) { G_master_pcap_data.source = 0; @@ -2089,7 +2116,7 @@ uint8_t cmdh_set_pcap_inband(const uint16_t i_cmd_data_length, uint16_t l_pcap = CONVERT_UINT8_ARRAY_UINT16(l_cmd_ptr->power_cap[0], l_cmd_ptr->power_cap[1]); l_rc = cmdh_set_user_pcap_common(l_pcap, IN_BAND); - + // if successful copy the power cap to the response buffer and set the rsp length if(l_rc == ERRL_RC_SUCCESS) { diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c index 5b3052f..5dac23d 100755 --- a/src/occ_405/dimm/dimm.c +++ b/src/occ_405/dimm/dimm.c @@ -58,11 +58,9 @@ uint8_t G_maxDimmPort = NUM_DIMM_PORTS - 1; bool G_dimm_i2c_reset_required = false; uint32_t G_dimm_i2c_reset_cause = 0; -#define MAX_CONSECUTIVE_DIMM_RESETS 1 - typedef struct { bool disabled; - uint8_t errorCount; + uint8_t errorCount; // # consecutive errors for this DIMM } dimmData_t; dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{{false,0}}}; @@ -263,12 +261,17 @@ void mark_dimm_failed() { const uint8_t port = G_dimm_sm_args.i2cPort; const uint8_t dimm = G_dimm_sm_args.dimm; - INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X " - "(ffdc 0x%08X%08X, completion_state 0x%02X)", - DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount, - WORD_HIGH(G_dimm_sm_args.error.ffdc), - WORD_LOW(G_dimm_sm_args.error.ffdc), - G_dimm_sm_request.request.completion_state); + + // Trace the first 3 consecutive failures for this DIMM + 
if (G_dimm[port][dimm].errorCount < 3) + { + INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X " + "(ffdc 0x%08X%08X, completion_state 0x%02X)", + DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount, + WORD_HIGH(G_dimm_sm_args.error.ffdc), + WORD_LOW(G_dimm_sm_args.error.ffdc), + G_dimm_sm_request.request.completion_state); + } g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].flags |= FRU_SENSOR_STATUS_ERROR; @@ -281,43 +284,20 @@ void mark_dimm_failed() INCREMENT_ERR_HISTORY(ERRH_DIMM_I2C_PORT1); } - if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS) + if (G_dimm[port][dimm].errorCount < 255) { - // Disable collection on this DIMM, collect FFDC and log error - G_dimm[port][dimm].disabled = true; - INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to %d consecutive errors (state=%d)", - DIMM_AND_PORT, G_dimm[port][dimm].errorCount, G_dimm_sm_args.state); - errlHndl_t l_err = NULL; - /* - * @errortype - * @moduleid DIMM_MID_MARK_DIMM_FAILED - * @reasoncode DIMM_GPE_FAILURE - * @userdata1 GPE returned rc code - * @userdata4 ERC_DIMM_COMPLETE_FAILURE - * @devdesc Disabling DIMM due to repeated I2C failures - */ - l_err = createErrl(DIMM_MID_MARK_DIMM_FAILED, - DIMM_GPE_FAILURE, - ERC_DIMM_COMPLETE_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - G_dimm_sm_args.error.rc, - 0); - addUsrDtlsToErrl(l_err, - (uint8_t*)&G_dimm_sm_request.ffdc, - sizeof(G_dimm_sm_request.ffdc), - ERRL_STRUCT_VERSION_1, - ERRL_USR_DTL_BINARY_DATA); - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.dimm_huids[port][dimm], - ERRL_CALLOUT_PRIORITY_HIGH); - //Mark DIMM as logged so we don't log it again - amec_mem_mark_logged(0, dimm, - &G_cent_timeout_logged_bitmap, - &G_dimm_timeout_logged_bitmap.bytes[port]); - commitErrl(&l_err); + ++G_dimm[port][dimm].errorCount; + } + + if (false == G_dimm[port][dimm].disabled) + { + 
if(G_dimm_timeout_logged_bitmap.bytes[port] & (DIMM_SENSOR0 >> dimm)) + { + //Health monitor has already logged a timeout for this DIMM + G_dimm[port][dimm].disabled = true; + INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to health monitor timeout (consecutive errors: %d)", + DIMM_AND_PORT, G_dimm[port][dimm].errorCount); + } } // Reset DIMM I2C engine @@ -471,6 +451,7 @@ uint8_t dimm_reset_sm() case DIMM_STATE_RESET_MASTER: if (DIMM_TICK == 0) { + TRAC_INFO("dimm_reset_sm: Initiating I2C reset of engine %d", G_sysConfigData.dimm_i2c_engine); L_new_dimm_args.i2cEngine = G_sysConfigData.dimm_i2c_engine; if (schedule_dimm_req(DIMM_STATE_RESET_MASTER, L_new_dimm_args)) { @@ -710,6 +691,12 @@ void process_dimm_temp() // Store DIMM temp in sensor sensor_update(&g_amec->proc[0].tempdimm[DIMM_INDEX(port, dimm)], l_dimm_temp); + // Successful temp collected, reset error count + if (G_dimm[port][dimm].errorCount > 2) + { + INTR_TRAC_INFO("process_dimm_temp: successfully read temp for DIMM%04X (after %d consecutive errors)", + DIMM_AND_PORT, G_dimm[port][dimm].errorCount); + } G_dimm[port][dimm].errorCount = 0; } // end process_dimm_temp() @@ -736,18 +723,16 @@ void task_dimm_sm(struct task *i_self) static bool L_readIssued = false; const uint8_t engine = G_sysConfigData.dimm_i2c_engine; static bool L_occ_owns_lock = false; - // 60,000 x 500us (tick time) x 2 (called every other tick) = 60 seconds - static unsigned int L_startup_delay = 60000; - if (L_startup_delay > 0) + static unsigned int L_dimms_enabled = false; + if (!L_dimms_enabled) { - if (--L_startup_delay == 0) - { - TRAC_INFO("task_dimm_sm: Startup delay completed, DIMM temp collection will be started (0x%08X)", G_dimm_present_sensors.words[0]); - G_dimm_enabled_sensors = G_dimm_present_sensors; - } + L_dimms_enabled = true; + TRAC_INFO("task_dimm_sm: DIMM temp collection is being started (0x%08X)", G_dimm_present_sensors.words[0]); + G_dimm_enabled_sensors = G_dimm_present_sensors; } - else if 
(G_mem_monitoring_allowed) + + if (G_mem_monitoring_allowed) { #ifdef DEBUG_LOCK_TESTING SIMULATE_HOST(); @@ -929,6 +914,7 @@ void task_dimm_sm(struct task *i_self) if ((DIMM_TICK == 0) || (DIMM_TICK == 8)) { // If DIMM has huid/sensor then it should be present + // and if not disabled yet, start temp collection if (NIMBUS_DIMM_PRESENT(L_dimmPort,L_dimmIndex) && (G_dimm[L_dimmPort][L_dimmIndex].disabled == false)) { diff --git a/src/occ_405/lock/lock.c b/src/occ_405/lock/lock.c index 973e9a5..2716b01 100755 --- a/src/occ_405/lock/lock.c +++ b/src/occ_405/lock/lock.c @@ -160,7 +160,7 @@ void update_i2c_lock(const lockOperation_e i_op, const uint8_t i_engine) { out32(OCB_OCCFLG_OR, occ_flags.value); - TRAC_IMP("update_i2c_lock: OCC has aquired lock for I2C engine %d", i_engine); + TRAC_IMP("update_i2c_lock: OCC has acquired lock for I2C engine %d", i_engine); } } // end update_i2c_lock() diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c index eddf855..bf1f6f0 100755 --- a/src/occ_405/occbuildname.c +++ b/src/occ_405/occbuildname.c @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = #else -volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170822a\0" /*</BuildName>*/ ; +volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170825a\0" /*</BuildName>*/ ; #endif |