summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author mbroyles <mbroyles@us.ibm.com> 2019-07-18 14:53:24 -0500
committer Martha Broyles <mbroyles@us.ibm.com> 2019-07-19 12:52:42 -0500
commit d467852fe039a980180df22178ae09a89a3ed6d9 (patch)
tree 008d1caec12e95eebc24e30d9a4770b42ea1d058
parent bae814cdb7dc0206d13bdd4c1b0f531f3da814eb (diff)
download talos-occ-d467852fe039a980180df22178ae09a89a3ed6d9.tar.gz
talos-occ-d467852fe039a980180df22178ae09a89a3ed6d9.zip
Fix incorrect hw callout in Centaur DIMM OT errors
Change-Id: I2a7076f1a328daf18b3eff35cd75895c472a8962
CQ: SW470683
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/80639
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: William A Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
-rwxr-xr-x src/occ_405/amec/amec_health.c 31
-rw-r--r-- src/occ_405/amec/amec_sensors_centaur.c 15
-rw-r--r-- src/occ_405/amec/amec_sensors_ocmb.c 15
-rwxr-xr-x src/occ_405/dimm/dimm.c 16
-rwxr-xr-x src/occ_405/occbuildname.c 2
5 files changed, 60 insertions, 19 deletions
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index 91b2a28..562038d 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2018 */
+/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -142,11 +142,12 @@ void amec_mem_mark_logged(uint8_t i_cent,
*/
void amec_health_check_dimm_temp()
{
- uint16_t l_ot_error, l_cur_temp, l_max_temp;
+ uint16_t l_ot_error, l_max_temp;
sensor_t *l_sensor;
uint8_t l_dimm;
uint8_t l_port;
- uint8_t l_max_port; // #ports in nimbus/#centaurs in cumulus
+ uint8_t l_max_port; // #ports in nimbus/#mem buf in cumulus/OCM
+ uint8_t l_max_dimm_per_port; // per port in nimbus/per mem buf in cumulus/OCM
uint32_t l_callouts_count = 0;
uint8_t l_new_callouts;
uint64_t l_huid;
@@ -155,10 +156,12 @@ void amec_health_check_dimm_temp()
if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS)
{
l_max_port = NUM_DIMM_PORTS;
+ l_max_dimm_per_port = NUM_DIMMS_PER_I2CPORT;
}
else // MEM_TYPE_CUMULUS
{
l_max_port = MAX_NUM_CENTAURS;
+ l_max_dimm_per_port = NUM_DIMMS_PER_CENTAUR;
}
// Check to see if any dimms have reached the error temperature that
@@ -170,7 +173,6 @@ void amec_health_check_dimm_temp()
l_ot_error = g_amec->thermaldimm.ot_error;
l_sensor = getSensorByGsid(TEMPDIMMTHRM);
- l_cur_temp = l_sensor->sample;
l_max_temp = l_sensor->sample_max;
//iterate over all dimms
@@ -186,14 +188,15 @@ void amec_health_check_dimm_temp()
continue;
}
- TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. current[%d], hist_max[%d], port[%d]",
- l_ot_error,
- l_cur_temp,
- l_max_temp,
- l_port);
+ // if the previous port had errors commit it so this port gets new error log
+ if(l_err)
+ {
+ commitErrl(&l_err);
+ l_callouts_count = 0;
+ }
//find the dimm(s) that need to be called out for this port
- for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++)
+ for(l_dimm = 0; l_dimm < l_max_dimm_per_port; l_dimm++)
{
if (!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm)))
{
@@ -206,15 +209,19 @@ void amec_health_check_dimm_temp()
l_dimm,
&G_cent_overtemp_logged_bitmap,
&G_dimm_overtemp_logged_bitmap.bytes[l_port]);
- TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X overtemp - %dC",
+ TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X being called out for overtemp - %dC",
(l_port<<8)|l_dimm, l_fru->cur_temp);
- // Create single elog with up to MAX_CALLOUTS
+ // Create single elog with up to MAX_CALLOUTS for this port
if(l_callouts_count < ERRL_MAX_CALLOUTS)
{
//If we don't have an error log for the callout, create one
if(!l_err)
{
+ TRAC_ERR("amec_health_check_dimm_temp: Creating log for port[%d] OT bitmap[0x%02X] logged bitmap[0x%02X]",
+ l_port,
+ G_dimm_overtemp_bitmap.bytes[l_port],
+ G_dimm_overtemp_logged_bitmap.bytes[l_port]);
/* @
* @errortype
* @moduleid AMEC_HEALTH_CHECK_DIMM_TEMP
diff --git a/src/occ_405/amec/amec_sensors_centaur.c b/src/occ_405/amec/amec_sensors_centaur.c
index 35fd262..8f946e6 100644
--- a/src/occ_405/amec/amec_sensors_centaur.c
+++ b/src/occ_405/amec/amec_sensors_centaur.c
@@ -71,7 +71,7 @@ void amec_perfcount_getmc( CentaurMemData * i_sensor_cache, uint8_t i_centaur);
// Function Specification
//
-// Name: amec_update_dimm_dts_sensors
+// Name: amec_update_centaur_sensors
//
// Description: Updates sensors that have data grabbed by the fast core data
// task.
@@ -116,6 +116,7 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen
uint32_t l_sens_status;
int32_t l_dimm_temp, l_prev_temp;
static uint8_t L_ran_once[MAX_NUM_CENTAURS] = {FALSE};
+ static bool L_ot_traced[MAX_NUM_CENTAURS][NUM_DIMMS_PER_CENTAUR] = {{false}};
// Harvest thermal data for all dimms
for(k=0; k < NUM_DIMMS_PER_CENTAUR; k++)
@@ -236,7 +237,17 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen
if(l_dts[k] >= g_amec->thermaldimm.ot_error)
{
//Set a bit so that this dimm can be called out by the thermal thread
- G_dimm_overtemp_bitmap.bytes[i_centaur] |= 1 << k;
+ G_dimm_overtemp_bitmap.bytes[i_centaur] |= (DIMM_SENSOR0 >> k);
+ // trace first time OT per DIMM
+ if( !L_ot_traced[i_centaur][k] )
+ {
+ TRAC_ERR("amec_update_dimm_dts_sensors: centaur[%d] DIMM[%d] reached error temp[%d]. current[%d]",
+ i_centaur,
+ k,
+ g_amec->thermaldimm.ot_error,
+ l_dts[k]);
+ L_ot_traced[i_centaur][k] = true;
+ }
}
}
diff --git a/src/occ_405/amec/amec_sensors_ocmb.c b/src/occ_405/amec/amec_sensors_ocmb.c
index 0a9f072..88f277c 100644
--- a/src/occ_405/amec/amec_sensors_ocmb.c
+++ b/src/occ_405/amec/amec_sensors_ocmb.c
@@ -71,7 +71,7 @@ void amec_perfcount_ocmb_getmc( OcmbMemData * i_sensor_cache, uint8_t i_membuf);
// Function Specification
//
-// Name: amec_update_ocmb_dimm_dts_sensors
+// Name: amec_update_ocmb_sensors
//
// Description: Updates sensors that have data grabbed by the fast core data
// task.
@@ -119,6 +119,7 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m
uint32_t l_hottest_dimm_loc = NUM_DIMMS_PER_OCMB;
int32_t l_dimm_temp, l_prev_temp;
static uint8_t L_ran_once[MAX_NUM_OCMBS] = {FALSE};
+ static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}};
// Harvest thermal data for all dimms
for(k=0; k < NUM_DIMMS_PER_OCMB; k++)
@@ -253,7 +254,17 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m
if(l_dts[k] >= g_amec->thermaldimm.ot_error)
{
//Set a bit so that this dimm can be called out by the thermal thread
- G_dimm_overtemp_bitmap.bytes[i_membuf] |= 1 << k;
+ G_dimm_overtemp_bitmap.bytes[i_membuf] |= (DIMM_SENSOR0 >> k);
+ // trace first time OT per DIMM
+ if( !L_ot_traced[i_membuf][k] )
+ {
+ TRAC_ERR("amec_update_ocmb_dimm_dts_sensors: Mem Buf[%d] DIMM[%d] reached error temp[%d]. current[%d]",
+ i_membuf,
+ k,
+ g_amec->thermaldimm.ot_error,
+ l_dts[k]);
+ L_ot_traced[i_membuf][k] = true;
+ }
}
}
diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c
index fd8e6e3..7757d2c 100755
--- a/src/occ_405/dimm/dimm.c
+++ b/src/occ_405/dimm/dimm.c
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2018 */
+/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -623,6 +623,7 @@ void process_dimm_temp()
const uint8_t port = G_dimm_sm_args.i2cPort;
const uint8_t dimm = G_dimm_sm_args.dimm;
uint8_t l_dimm_temp = G_dimm_sm_args.temp;
+ static bool L_ot_traced[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{false}};
#define MIN_VALID_DIMM_TEMP 1
#define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek
@@ -687,7 +688,18 @@ void process_dimm_temp()
if (l_dimm_temp >= g_amec->thermaldimm.ot_error)
{
//Set a bit so that this dimm can be called out by the thermal thread
- G_dimm_overtemp_bitmap.bytes[port] |= DIMM_SENSOR0 >> dimm;
+ G_dimm_overtemp_bitmap.bytes[port] |= (DIMM_SENSOR0 >> dimm);
+
+ // trace first time OT per DIMM
+ if( !L_ot_traced[port][dimm] )
+ {
+ TRAC_ERR("process_dimm_temp: port[%d] DIMM[%d] reached error temp[%d]. current[%d]",
+ port,
+ dimm,
+ g_amec->thermaldimm.ot_error,
+ l_dimm_temp);
+ L_ot_traced[port][dimm] = true;
+ }
}
l_fru->cur_temp = l_dimm_temp;
diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c
index 7eb2a15..a179b96 100755
--- a/src/occ_405/occbuildname.c
+++ b/src/occ_405/occbuildname.c
@@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =
#else
-volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_190712a\0" /*</BuildName>*/ ;
+volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_190719a\0" /*</BuildName>*/ ;
#endif
OpenPOWER on IntegriCloud