summary refs log tree commit diff stats
path: root/src/occ_405
diff options
context:
space:
mode:
author Chris Cain <cjcain@us.ibm.com> 2017-08-23 15:18:24 -0500
committer Christopher J. Cain <cjcain@us.ibm.com> 2017-08-25 14:41:28 -0400
commit df326632a2cc6be49523b32fd034a95915e76898 (patch)
tree 165d56f3289547d2d9decbf9ca6cee056c00544b /src/occ_405
parent 3f57751abd8ca0308e3938dc86d5a313b7599ebc (diff)
download talos-occ-df326632a2cc6be49523b32fd034a95915e76898.tar.gz
talos-occ-df326632a2cc6be49523b32fd034a95915e76898.zip
Only call out DIMMs when health monitor time has expired
Previously OCC would call out the DIMM if we got 2 consecutive I2C failures trying to read DIMM temperatures. Health monitor already has code to handle timeout, so we will just keep retrying on failures.
- Remove 60 second delay before starting to read DIMM temps since SW398808 should resolve the lock problem.
- Added debug cmd to retrieve the GPE0/GPE1 trace buffers.
Change-Id: I65156347e24ff8e68414a64aaf7e00ff4c12a2f8
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45073
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Diffstat (limited to 'src/occ_405')
-rwxr-xr-x src/occ_405/cmdh/cmdh_fsp_cmds.c 39
-rwxr-xr-x src/occ_405/dimm/dimm.c 94
-rwxr-xr-x src/occ_405/lock/lock.c 2
-rwxr-xr-x src/occ_405/occbuildname.c 2
4 files changed, 75 insertions, 62 deletions
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index cb3835c..c6802ee 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -48,12 +48,14 @@
#include "homer.h"
#include <centaur_data.h>
#include <avsbus.h>
-#include "cmdh_dbug_cmd.h"
#include "wof.h"
#include "sensor_main_memory.h"
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern bool G_vrm_thermal_monitoring;
+#include <gpe_export.h>
+extern gpe_shared_data_t G_shared_gpe_data;
+
// This table contains tunable parameter information that can be exposed to
// customers (only Master OCC should access/control this table)
cmdh_tunable_param_table_t G_mst_tunable_parameter_table[CMDH_DEFAULT_TUNABLE_PARAM_NUM] =
@@ -889,9 +891,34 @@ void cmdh_dbug_get_trace (const cmdh_fsp_cmd_t * i_cmd_ptr,
cmdh_dbug_get_trace_query_t *l_get_trace_query_ptr = (cmdh_dbug_get_trace_query_t*) i_cmd_ptr;
cmdh_dbug_get_trace_resp_t *l_get_trace_resp_ptr = (cmdh_dbug_get_trace_resp_t*) o_rsp_ptr;
- const trace_descriptor_array_t* l_trace_ptr = TRAC_get_td((char *)l_get_trace_query_ptr->comp);
- l_rc = TRAC_get_buffer_partial(l_trace_ptr, l_get_trace_resp_ptr->data,&l_trace_buffer_size);
- l_trace_size = l_trace_buffer_size;
+ if (memcmp((char *)l_get_trace_query_ptr->comp, "GP", 2) == 0)
+ {
+ // Return a GPE0/GPE1 trace buffer
+ if (l_get_trace_query_ptr->comp[2] == '0')
+ {
+ if (G_shared_gpe_data.gpe0_tb_ptr != 0)
+ {
+ l_trace_size = G_shared_gpe_data.gpe0_tb_sz;
+ memcpy(l_get_trace_resp_ptr->data, (uint8_t*)G_shared_gpe_data.gpe0_tb_ptr, (size_t)l_trace_size);
+ }
+ }
+ else if (l_get_trace_query_ptr->comp[2] == '1')
+ {
+ if (G_shared_gpe_data.gpe0_tb_ptr != 0)
+ {
+ l_trace_size = G_shared_gpe_data.gpe1_tb_sz;
+ memcpy(l_get_trace_resp_ptr->data, (uint8_t*)G_shared_gpe_data.gpe1_tb_ptr, (size_t)l_trace_size);
+ }
+ }
+ else l_rc = 255;
+ }
+ else
+ {
+ // Return a 405 trace buffer
+ const trace_descriptor_array_t* l_trace_ptr = TRAC_get_td((char *)l_get_trace_query_ptr->comp);
+ l_rc = TRAC_get_buffer_partial(l_trace_ptr, l_get_trace_resp_ptr->data,&l_trace_buffer_size);
+ l_trace_size = l_trace_buffer_size;
+ }
if(l_rc==0)
{
G_rsp_status = ERRL_RC_SUCCESS;
@@ -1924,7 +1951,7 @@ uint8_t cmdh_set_user_pcap_common(uint16_t i_pcap,
//Indicate there is new PCAP data available
G_master_pcap_data.pcap_data_count++;
- // if user pcap was just disabled set source to 0 (no user pcap)
+ // if user pcap was just disabled set source to 0 (no user pcap)
if(i_pcap == 0)
{
G_master_pcap_data.source = 0;
@@ -2089,7 +2116,7 @@ uint8_t cmdh_set_pcap_inband(const uint16_t i_cmd_data_length,
uint16_t l_pcap = CONVERT_UINT8_ARRAY_UINT16(l_cmd_ptr->power_cap[0],
l_cmd_ptr->power_cap[1]);
l_rc = cmdh_set_user_pcap_common(l_pcap, IN_BAND);
-
+
// if successful copy the power cap to the response buffer and set the rsp length
if(l_rc == ERRL_RC_SUCCESS)
{
diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c
index 5b3052f..5dac23d 100755
--- a/src/occ_405/dimm/dimm.c
+++ b/src/occ_405/dimm/dimm.c
@@ -58,11 +58,9 @@ uint8_t G_maxDimmPort = NUM_DIMM_PORTS - 1;
bool G_dimm_i2c_reset_required = false;
uint32_t G_dimm_i2c_reset_cause = 0;
-#define MAX_CONSECUTIVE_DIMM_RESETS 1
-
typedef struct {
bool disabled;
- uint8_t errorCount;
+ uint8_t errorCount; // # consecutive errors for this DIMM
} dimmData_t;
dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{{false,0}}};
@@ -263,12 +261,17 @@ void mark_dimm_failed()
{
const uint8_t port = G_dimm_sm_args.i2cPort;
const uint8_t dimm = G_dimm_sm_args.dimm;
- INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X "
- "(ffdc 0x%08X%08X, completion_state 0x%02X)",
- DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount,
- WORD_HIGH(G_dimm_sm_args.error.ffdc),
- WORD_LOW(G_dimm_sm_args.error.ffdc),
- G_dimm_sm_request.request.completion_state);
+
+ // Trace the first 3 consecutive failures for this DIMM
+ if (G_dimm[port][dimm].errorCount < 3)
+ {
+ INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X "
+ "(ffdc 0x%08X%08X, completion_state 0x%02X)",
+ DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount,
+ WORD_HIGH(G_dimm_sm_args.error.ffdc),
+ WORD_LOW(G_dimm_sm_args.error.ffdc),
+ G_dimm_sm_request.request.completion_state);
+ }
g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].flags |= FRU_SENSOR_STATUS_ERROR;
@@ -281,43 +284,20 @@ void mark_dimm_failed()
INCREMENT_ERR_HISTORY(ERRH_DIMM_I2C_PORT1);
}
- if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS)
+ if (G_dimm[port][dimm].errorCount < 255)
{
- // Disable collection on this DIMM, collect FFDC and log error
- G_dimm[port][dimm].disabled = true;
- INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to %d consecutive errors (state=%d)",
- DIMM_AND_PORT, G_dimm[port][dimm].errorCount, G_dimm_sm_args.state);
- errlHndl_t l_err = NULL;
- /*
- * @errortype
- * @moduleid DIMM_MID_MARK_DIMM_FAILED
- * @reasoncode DIMM_GPE_FAILURE
- * @userdata1 GPE returned rc code
- * @userdata4 ERC_DIMM_COMPLETE_FAILURE
- * @devdesc Disabling DIMM due to repeated I2C failures
- */
- l_err = createErrl(DIMM_MID_MARK_DIMM_FAILED,
- DIMM_GPE_FAILURE,
- ERC_DIMM_COMPLETE_FAILURE,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- G_dimm_sm_args.error.rc,
- 0);
- addUsrDtlsToErrl(l_err,
- (uint8_t*)&G_dimm_sm_request.ffdc,
- sizeof(G_dimm_sm_request.ffdc),
- ERRL_STRUCT_VERSION_1,
- ERRL_USR_DTL_BINARY_DATA);
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_HUID,
- G_sysConfigData.dimm_huids[port][dimm],
- ERRL_CALLOUT_PRIORITY_HIGH);
- //Mark DIMM as logged so we don't log it again
- amec_mem_mark_logged(0, dimm,
- &G_cent_timeout_logged_bitmap,
- &G_dimm_timeout_logged_bitmap.bytes[port]);
- commitErrl(&l_err);
+ ++G_dimm[port][dimm].errorCount;
+ }
+
+ if (false == G_dimm[port][dimm].disabled)
+ {
+ if(G_dimm_timeout_logged_bitmap.bytes[port] & (DIMM_SENSOR0 >> dimm))
+ {
+ //Health monitor has already logged a timeout for this DIMM
+ G_dimm[port][dimm].disabled = true;
+ INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to health monitor timeout (consecutive errors: %d)",
+ DIMM_AND_PORT, G_dimm[port][dimm].errorCount);
+ }
}
// Reset DIMM I2C engine
@@ -471,6 +451,7 @@ uint8_t dimm_reset_sm()
case DIMM_STATE_RESET_MASTER:
if (DIMM_TICK == 0)
{
+ TRAC_INFO("dimm_reset_sm: Initiating I2C reset of engine %d", G_sysConfigData.dimm_i2c_engine);
L_new_dimm_args.i2cEngine = G_sysConfigData.dimm_i2c_engine;
if (schedule_dimm_req(DIMM_STATE_RESET_MASTER, L_new_dimm_args))
{
@@ -710,6 +691,12 @@ void process_dimm_temp()
// Store DIMM temp in sensor
sensor_update(&g_amec->proc[0].tempdimm[DIMM_INDEX(port, dimm)], l_dimm_temp);
+ // Successful temp collected, reset error count
+ if (G_dimm[port][dimm].errorCount > 2)
+ {
+ INTR_TRAC_INFO("process_dimm_temp: successfully read temp for DIMM%04X (after %d consecutive errors)",
+ DIMM_AND_PORT, G_dimm[port][dimm].errorCount);
+ }
G_dimm[port][dimm].errorCount = 0;
} // end process_dimm_temp()
@@ -736,18 +723,16 @@ void task_dimm_sm(struct task *i_self)
static bool L_readIssued = false;
const uint8_t engine = G_sysConfigData.dimm_i2c_engine;
static bool L_occ_owns_lock = false;
- // 60,000 x 500us (tick time) x 2 (called every other tick) = 60 seconds
- static unsigned int L_startup_delay = 60000;
- if (L_startup_delay > 0)
+ static unsigned int L_dimms_enabled = false;
+ if (!L_dimms_enabled)
{
- if (--L_startup_delay == 0)
- {
- TRAC_INFO("task_dimm_sm: Startup delay completed, DIMM temp collection will be started (0x%08X)", G_dimm_present_sensors.words[0]);
- G_dimm_enabled_sensors = G_dimm_present_sensors;
- }
+ L_dimms_enabled = true;
+ TRAC_INFO("task_dimm_sm: DIMM temp collection is being started (0x%08X)", G_dimm_present_sensors.words[0]);
+ G_dimm_enabled_sensors = G_dimm_present_sensors;
}
- else if (G_mem_monitoring_allowed)
+
+ if (G_mem_monitoring_allowed)
{
#ifdef DEBUG_LOCK_TESTING
SIMULATE_HOST();
@@ -929,6 +914,7 @@ void task_dimm_sm(struct task *i_self)
if ((DIMM_TICK == 0) || (DIMM_TICK == 8))
{
// If DIMM has huid/sensor then it should be present
+ // and if not disabled yet, start temp collection
if (NIMBUS_DIMM_PRESENT(L_dimmPort,L_dimmIndex) &&
(G_dimm[L_dimmPort][L_dimmIndex].disabled == false))
{
diff --git a/src/occ_405/lock/lock.c b/src/occ_405/lock/lock.c
index 973e9a5..2716b01 100755
--- a/src/occ_405/lock/lock.c
+++ b/src/occ_405/lock/lock.c
@@ -160,7 +160,7 @@ void update_i2c_lock(const lockOperation_e i_op, const uint8_t i_engine)
{
out32(OCB_OCCFLG_OR, occ_flags.value);
- TRAC_IMP("update_i2c_lock: OCC has aquired lock for I2C engine %d", i_engine);
+ TRAC_IMP("update_i2c_lock: OCC has acquired lock for I2C engine %d", i_engine);
}
} // end update_i2c_lock()
diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c
index eddf855..bf1f6f0 100755
--- a/src/occ_405/occbuildname.c
+++ b/src/occ_405/occbuildname.c
@@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =
#else
-volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170822a\0" /*</BuildName>*/ ;
+volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170825a\0" /*</BuildName>*/ ;
#endif
OpenPOWER on IntegriCloud