diff options
author | mbroyles <mbroyles@us.ibm.com> | 2019-07-18 14:53:24 -0500 |
---|---|---|
committer | Martha Broyles <mbroyles@us.ibm.com> | 2019-07-19 12:52:42 -0500 |
commit | d467852fe039a980180df22178ae09a89a3ed6d9 (patch) | |
tree | 008d1caec12e95eebc24e30d9a4770b42ea1d058 | |
parent | bae814cdb7dc0206d13bdd4c1b0f531f3da814eb (diff) | |
download | talos-occ-d467852fe039a980180df22178ae09a89a3ed6d9.tar.gz talos-occ-d467852fe039a980180df22178ae09a89a3ed6d9.zip |
Fix incorrect hw callout in Centaur DIMM OT errors
Change-Id: I2a7076f1a328daf18b3eff35cd75895c472a8962
CQ: SW470683
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/80639
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: William A Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
-rwxr-xr-x | src/occ_405/amec/amec_health.c | 31 | ||||
-rw-r--r-- | src/occ_405/amec/amec_sensors_centaur.c | 15 | ||||
-rw-r--r-- | src/occ_405/amec/amec_sensors_ocmb.c | 15 | ||||
-rwxr-xr-x | src/occ_405/dimm/dimm.c | 16 | ||||
-rwxr-xr-x | src/occ_405/occbuildname.c | 2 |
5 files changed, 60 insertions, 19 deletions
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 91b2a28..562038d 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2018 */ +/* Contributors Listed Below - COPYRIGHT 2011,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -142,11 +142,12 @@ void amec_mem_mark_logged(uint8_t i_cent, */ void amec_health_check_dimm_temp() { - uint16_t l_ot_error, l_cur_temp, l_max_temp; + uint16_t l_ot_error, l_max_temp; sensor_t *l_sensor; uint8_t l_dimm; uint8_t l_port; - uint8_t l_max_port; // #ports in nimbus/#centaurs in cumulus + uint8_t l_max_port; // #ports in nimbus/#mem buf in cumulus/OCM + uint8_t l_max_dimm_per_port; // per port in nimbus/per mem buf in cumulus/OCM uint32_t l_callouts_count = 0; uint8_t l_new_callouts; uint64_t l_huid; @@ -155,10 +156,12 @@ void amec_health_check_dimm_temp() if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS) { l_max_port = NUM_DIMM_PORTS; + l_max_dimm_per_port = NUM_DIMMS_PER_I2CPORT; } else // MEM_TYPE_CUMULUS { l_max_port = MAX_NUM_CENTAURS; + l_max_dimm_per_port = NUM_DIMMS_PER_CENTAUR; } // Check to see if any dimms have reached the error temperature that @@ -170,7 +173,6 @@ void amec_health_check_dimm_temp() l_ot_error = g_amec->thermaldimm.ot_error; l_sensor = getSensorByGsid(TEMPDIMMTHRM); - l_cur_temp = l_sensor->sample; l_max_temp = l_sensor->sample_max; //iterate over all dimms @@ -186,14 +188,15 @@ void amec_health_check_dimm_temp() continue; } - TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. current[%d], hist_max[%d], port[%d]", - l_ot_error, - l_cur_temp, - l_max_temp, - l_port); + // if the previous port had errors commit it so this port gets new error log + if(l_err) + { + commitErrl(&l_err); + l_callouts_count = 0; + } //find the dimm(s) that need to be called out for this port - for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++) + for(l_dimm = 0; l_dimm < l_max_dimm_per_port; l_dimm++) { if (!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm))) { @@ -206,15 +209,19 @@ void amec_health_check_dimm_temp() l_dimm, &G_cent_overtemp_logged_bitmap, &G_dimm_overtemp_logged_bitmap.bytes[l_port]); - TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X overtemp - %dC", + TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X being called out for overtemp - %dC", (l_port<<8)|l_dimm, l_fru->cur_temp); - // Create single elog with up to MAX_CALLOUTS + // Create single elog with up to MAX_CALLOUTS for this port if(l_callouts_count < ERRL_MAX_CALLOUTS) { //If we don't have an error log for the callout, create one if(!l_err) { + TRAC_ERR("amec_health_check_dimm_temp: Creating log for port[%d] OT bitmap[0x%02X] logged bitmap[0x%02X]", + l_port, + G_dimm_overtemp_bitmap.bytes[l_port], + G_dimm_overtemp_logged_bitmap.bytes[l_port]); /* @ * @errortype * @moduleid AMEC_HEALTH_CHECK_DIMM_TEMP diff --git a/src/occ_405/amec/amec_sensors_centaur.c b/src/occ_405/amec/amec_sensors_centaur.c index 35fd262..8f946e6 100644 --- a/src/occ_405/amec/amec_sensors_centaur.c +++ b/src/occ_405/amec/amec_sensors_centaur.c @@ -71,7 +71,7 @@ void amec_perfcount_getmc( CentaurMemData * i_sensor_cache, uint8_t i_centaur); // Function Specification // -// Name: amec_update_dimm_dts_sensors +// Name: amec_update_centaur_sensors // // Description: Updates sensors that have data grabbed by the fast core data // task. @@ -116,6 +116,7 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen uint32_t l_sens_status; int32_t l_dimm_temp, l_prev_temp; static uint8_t L_ran_once[MAX_NUM_CENTAURS] = {FALSE}; + static bool L_ot_traced[MAX_NUM_CENTAURS][NUM_DIMMS_PER_CENTAUR] = {{false}}; // Harvest thermal data for all dimms for(k=0; k < NUM_DIMMS_PER_CENTAUR; k++) @@ -236,7 +237,17 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen if(l_dts[k] >= g_amec->thermaldimm.ot_error) { //Set a bit so that this dimm can be called out by the thermal thread - G_dimm_overtemp_bitmap.bytes[i_centaur] |= 1 << k; + G_dimm_overtemp_bitmap.bytes[i_centaur] |= (DIMM_SENSOR0 >> k); + // trace first time OT per DIMM + if( !L_ot_traced[i_centaur][k] ) + { + TRAC_ERR("amec_update_dimm_dts_sensors: centaur[%d] DIMM[%d] reached error temp[%d]. current[%d]", + i_centaur, + k, + g_amec->thermaldimm.ot_error, + l_dts[k]); + L_ot_traced[i_centaur][k] = true; + } } } diff --git a/src/occ_405/amec/amec_sensors_ocmb.c b/src/occ_405/amec/amec_sensors_ocmb.c index 0a9f072..88f277c 100644 --- a/src/occ_405/amec/amec_sensors_ocmb.c +++ b/src/occ_405/amec/amec_sensors_ocmb.c @@ -71,7 +71,7 @@ void amec_perfcount_ocmb_getmc( OcmbMemData * i_sensor_cache, uint8_t i_membuf); // Function Specification // -// Name: amec_update_ocmb_dimm_dts_sensors +// Name: amec_update_ocmb_sensors // // Description: Updates sensors that have data grabbed by the fast core data // task. @@ -119,6 +119,7 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m uint32_t l_hottest_dimm_loc = NUM_DIMMS_PER_OCMB; int32_t l_dimm_temp, l_prev_temp; static uint8_t L_ran_once[MAX_NUM_OCMBS] = {FALSE}; + static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}}; // Harvest thermal data for all dimms for(k=0; k < NUM_DIMMS_PER_OCMB; k++) @@ -253,7 +254,17 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m if(l_dts[k] >= g_amec->thermaldimm.ot_error) { //Set a bit so that this dimm can be called out by the thermal thread - G_dimm_overtemp_bitmap.bytes[i_membuf] |= 1 << k; + G_dimm_overtemp_bitmap.bytes[i_membuf] |= (DIMM_SENSOR0 >> k); + // trace first time OT per DIMM + if( !L_ot_traced[i_membuf][k] ) + { + TRAC_ERR("amec_update_ocmb_dimm_dts_sensors: Mem Buf[%d] DIMM[%d] reached error temp[%d]. current[%d]", + i_membuf, + k, + g_amec->thermaldimm.ot_error, + l_dts[k]); + L_ot_traced[i_membuf][k] = true; + } } } diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c index fd8e6e3..7757d2c 100755 --- a/src/occ_405/dimm/dimm.c +++ b/src/occ_405/dimm/dimm.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2018 */ +/* Contributors Listed Below - COPYRIGHT 2011,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -623,6 +623,7 @@ void process_dimm_temp() const uint8_t port = G_dimm_sm_args.i2cPort; const uint8_t dimm = G_dimm_sm_args.dimm; uint8_t l_dimm_temp = G_dimm_sm_args.temp; + static bool L_ot_traced[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{false}}; #define MIN_VALID_DIMM_TEMP 1 #define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek @@ -687,7 +688,18 @@ void process_dimm_temp() if (l_dimm_temp >= g_amec->thermaldimm.ot_error) { //Set a bit so that this dimm can be called out by the thermal thread - G_dimm_overtemp_bitmap.bytes[port] |= DIMM_SENSOR0 >> dimm; + G_dimm_overtemp_bitmap.bytes[port] |= (DIMM_SENSOR0 >> dimm); + + // trace first time OT per DIMM + if( !L_ot_traced[port][dimm] ) + { + TRAC_ERR("process_dimm_temp: port[%d] DIMM[%d] reached error temp[%d]. current[%d]", + port, + dimm, + g_amec->thermaldimm.ot_error, + l_dimm_temp); + L_ot_traced[port][dimm] = true; + } } l_fru->cur_temp = l_dimm_temp; diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c index 7eb2a15..a179b96 100755 --- a/src/occ_405/occbuildname.c +++ b/src/occ_405/occbuildname.c @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = #else -volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_190712a\0" /*</BuildName>*/ ; +volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_190719a\0" /*</BuildName>*/ ; #endif |