summaryrefslogtreecommitdiffstats
path: root/src/occ_405/amec/amec_health.c
diff options
context:
space:
mode:
authorWael El-Essawy <welessa@us.ibm.com>2016-08-29 19:33:20 -0500
committerWael El-Essawy <welessa@us.ibm.com>2016-09-16 11:19:15 -0400
commit8a7d7b2d13098453380fbdf69c0136515ba33b06 (patch)
tree9ce20ff68625d0a6d4b5d118514a48949232e173 /src/occ_405/amec/amec_health.c
parent444caf2b8e674263f27963b3c9739e48aa793d00 (diff)
downloadtalos-occ-8a7d7b2d13098453380fbdf69c0136515ba33b06.tar.gz
talos-occ-8a7d7b2d13098453380fbdf69c0136515ba33b06.zip
Memory Temperature Control Loop (memory throttling)
* Memory throttling due to over temp * Throttle when reach timeout getting new temperature readings * Log error for temperature exceeding ERROR threshold Change-Id: I089c88aadba84e7296ad87b8cb87fa8c045ff912 RTC: 131188 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/28933 Reviewed-by: Wael El-Essawy <welessa@us.ibm.com> Tested-by: Wael El-Essawy <welessa@us.ibm.com>
Diffstat (limited to 'src/occ_405/amec/amec_health.c')
-rwxr-xr-xsrc/occ_405/amec/amec_health.c302
1 files changed, 180 insertions, 122 deletions
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index 7eaa84e..116dd38 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -46,13 +46,13 @@
//*************************************************************************/
// Have we already called out the dimm for overtemp (bitmap of dimms)?
-cent_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0};
+dimm_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0};
// Have we already called out the dimm for timeout (bitmap of dimms)?
-cent_sensor_flags_t G_dimm_timeout_logged_bitmap = {0};
+dimm_sensor_flags_t G_dimm_timeout_logged_bitmap = {0};
// Are any dimms currently in the timedout state (bitmap of dimm)?
-cent_sensor_flags_t G_dimm_temp_expired_bitmap = {0};
+dimm_sensor_flags_t G_dimm_temp_expired_bitmap = {0};
// Have we already called out the centaur for timeout (bitmap of centaurs)?
uint8_t G_cent_timeout_logged_bitmap = 0;
@@ -116,6 +116,18 @@ void amec_mem_mark_logged(uint8_t i_cent,
}
}
+
+/*
+ * Function Specification
+ *
+ * Name: amec_health_check_dimm_temp
+ *
+ * Description: Check if centaur's-dimm/rdimm-modules temperature exceeds the
+ * error temperature as defined in thermal control thresholds
+ * (ERROR field for Centaur/DIMM FRU Type)
+ *
+ * End Function Specification
+ */
void amec_health_check_dimm_temp()
{
/*------------------------------------------------------------------------*/
@@ -123,16 +135,26 @@ void amec_health_check_dimm_temp()
/*------------------------------------------------------------------------*/
uint16_t l_ot_error, l_cur_temp, l_max_temp;
sensor_t *l_sensor;
- uint32_t l_cent, l_dimm;
+ uint8_t l_dimm; // per centaur/port dimms in cumulus/nimbus
+ uint8_t l_index; // tracks centaurs/ports in cumulus/nimbus
+ uint8_t l_max_index; // #centaurs/ports in cumulus/nimbus
uint32_t l_callouts_count = 0;
uint8_t l_new_callouts;
uint64_t l_huid;
errlHndl_t l_err = NULL;
-
/*------------------------------------------------------------------------*/
/* Code */
/*------------------------------------------------------------------------*/
+ if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS)
+ {
+ l_max_index = NUM_I2C_PORTS;
+ }
+ else // MEM_TYPE_CUMULUS
+ {
+ l_max_index = MAX_NUM_CENTAURS;
+ }
+
// Check to see if any dimms have reached the error temperature that
// haven't been called out already
if(G_dimm_overtemp_bitmap.bigword == G_dimm_overtemp_logged_bitmap.bigword)
@@ -149,12 +171,12 @@ void amec_health_check_dimm_temp()
l_cur_temp,
l_max_temp);
- //iterate over all centaurs
- for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
+ //iterate over all centaurs/ports (the per-dimm loop is nested below)
+ for(l_index = 0; l_index < l_max_index; l_index++)
{
//only callout a dimm if it hasn't been called out already
- l_new_callouts = G_dimm_overtemp_bitmap.bytes[l_cent] ^
- G_dimm_overtemp_logged_bitmap.bytes[l_cent];
+ l_new_callouts = G_dimm_overtemp_bitmap.bytes[l_index] ^
+ G_dimm_overtemp_logged_bitmap.bytes[l_index];
//skip to next centaur if no new callouts for this one
if(!l_new_callouts)
@@ -165,17 +187,18 @@ void amec_health_check_dimm_temp()
//find the dimm(s) that need to be called out behind this centaur
for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++)
{
- if(!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm)))
+ if(!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm)) &&
+ G_dimm_overtemp_bitmap.bytes[l_index])
{
continue;
}
- l_huid = amec_mem_get_huid(l_cent, l_dimm);
+ l_huid = amec_mem_get_huid(l_index, l_dimm);
- amec_mem_mark_logged(l_cent,
+ amec_mem_mark_logged(l_index,
l_dimm,
&G_cent_overtemp_logged_bitmap,
- &G_dimm_overtemp_logged_bitmap.bytes[l_cent]);
+ &G_dimm_overtemp_logged_bitmap.bytes[l_index]);
//If we don't have an error log for the callout, create one
if(!l_err)
@@ -236,113 +259,21 @@ void amec_health_check_dimm_temp()
}
}
-void amec_health_check_cent_temp()
-{
- /*------------------------------------------------------------------------*/
- /* Local Variables */
- /*------------------------------------------------------------------------*/
- uint16_t l_ot_error, l_cur_temp, l_max_temp;
- sensor_t *l_sensor;
- uint32_t l_cent;
- uint32_t l_callouts_count = 0;
- uint8_t l_new_callouts;
- uint64_t l_huid;
- errlHndl_t l_err = NULL;
-
- /*------------------------------------------------------------------------*/
- /* Code */
- /*------------------------------------------------------------------------*/
-
- // Check to see if any centaurs have reached the error temperature that
- // haven't been called out already
- l_new_callouts = G_cent_overtemp_bitmap ^ G_cent_overtemp_logged_bitmap;
- if(!l_new_callouts)
- {
- return;
- }
-
- l_ot_error = g_amec->thermalcent.ot_error;
- l_sensor = getSensorByGsid(TEMP2MSCENT);
- l_cur_temp = l_sensor->sample;
- l_max_temp = l_sensor->sample_max;
- TRAC_ERR("amec_health_check_cent_temp: Centaur reached error temp[%d]. cur_max[%d], hist_max[%d] bitmap[0x%02X]",
- l_ot_error,
- l_cur_temp,
- l_max_temp,
- l_new_callouts);
-
- //find the centaur(s) that need to be called out
- for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
- {
- if(!(l_new_callouts & (CENTAUR0_PRESENT_MASK >> l_cent)))
- {
- continue;
- }
-
- l_huid = amec_mem_get_huid(l_cent, 0xff);
-
- amec_mem_mark_logged(l_cent,
- 0xff,
- &G_cent_overtemp_logged_bitmap,
- &G_dimm_overtemp_logged_bitmap.bytes[l_cent]);
-
- //If we don't have an error log for the callout, create one
- if(!l_err)
- {
- /* @
- * @errortype
- * @moduleid AMEC_HEALTH_CHECK_CENT_TEMP
- * @reasoncode CENT_ERROR_TEMP
- * @userdata1 Maximum centaur temperature
- * @userdata2 Centaur temperature threshold
- * @userdata4 OCC_NO_EXTENDED_RC
- * @devdesc Centaur memory controller(s) exceeded maximum safe
- * temperature.
- */
- l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TEMP, //modId
- CENT_ERROR_TEMP, //reasoncode
- OCC_NO_EXTENDED_RC, //Extended reason code
- ERRL_SEV_PREDICTIVE, //Severity
- NULL, //Trace Buf
- DEFAULT_TRACE_SIZE, //Trace Size
- l_max_temp, //userdata1
- l_ot_error); //userdata2
-
- // Callout the "over temperature" procedure
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_COMPONENT_ID,
- ERRL_COMPONENT_ID_OVER_TEMPERATURE,
- ERRL_CALLOUT_PRIORITY_HIGH);
- l_callouts_count = 1;
- }
-
- // Callout centaur
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_HUID,
- l_huid,
- ERRL_CALLOUT_PRIORITY_MED);
-
- l_callouts_count++;
-
- //If we've reached the max # of callouts for an error log
- //commit the error log
- if(l_callouts_count == ERRL_MAX_CALLOUTS)
- {
- commitErrl(&l_err);
- }
-
- }//iterate over centaurs
-
- if(l_err)
- {
- commitErrl(&l_err);
- }
-}
-
+/*
+ * Function Specification
+ *
+ * Name: amec_health_check_dimm_timeout
+ *
+ * Description: Check for centaur-dimm/rdimm-modules timeout condition
+ * as defined in thermal control thresholds
+ * (MAX_READ_TIMEOUT field for Centaur/DIMM FRU Type)
+ *
+ * End Function Specification
+ */
void amec_health_check_dimm_timeout()
{
- static cent_sensor_flags_t L_temp_update_bitmap_prev = {0};
- cent_sensor_flags_t l_need_inc, l_need_clr, l_temp_update_bitmap;
+ static dimm_sensor_flags_t L_temp_update_bitmap_prev = {0};
+ dimm_sensor_flags_t l_need_inc, l_need_clr, l_temp_update_bitmap;
uint8_t l_dimm, l_cent;
fru_temp_t* l_fru;
errlHndl_t l_err = NULL;
@@ -363,7 +294,7 @@ void amec_health_check_dimm_timeout()
G_dimm_temp_updated_bitmap.bigword = 0;
//check if we need to increment any timers (haven't been updated in the last second)
- l_need_inc.bigword = G_cent_enabled_sensors.bigword & ~l_temp_update_bitmap.bigword;
+ l_need_inc.bigword = G_dimm_enabled_sensors.bigword & ~l_temp_update_bitmap.bigword;
//check if we need to clear any timers (updated now but not updated previously)
l_need_clr.bigword = l_temp_update_bitmap.bigword & ~L_temp_update_bitmap_prev.bigword;
@@ -448,8 +379,8 @@ void amec_health_check_dimm_timeout()
continue;
}
- TRAC_ERR("Timed out reading dimm temperature on cent[%d] dimm[%d] temp[%d] flags[0x%02X]",
- l_cent, l_dimm, l_fru->cur_temp, l_fru->flags);
+ TRAC_ERR("Timed out reading dimm temperature on cent/port[%d] dimm[%d] temp[%d] flags[0x%02X]",
+ l_cent, l_dimm, l_fru->cur_temp, l_fru->flags);
if(!l_err)
{
@@ -540,7 +471,7 @@ void amec_health_check_dimm_timeout()
//info trace each time we recover
if(L_ran_once)
{
- TRAC_INFO("DIMM temperature collection has resumed on cent[%d] dimm[%d] temp[%d]",
+ TRAC_INFO("DIMM temperature collection has resumed on cent/port[%d] dimm[%d] temp[%d]",
l_cent, l_dimm, l_fru->cur_temp);
}
@@ -550,6 +481,133 @@ void amec_health_check_dimm_timeout()
L_ran_once = TRUE;
}
+
+
+/*
+ * Function Specification
+ *
+ * Name: amec_health_check_cent_temp
+ *
+ * Description: Check if any centaur's temperature exceeds the error
+ * temperature as defined in thermal control thresholds
+ * (ERROR field for Centaur FRU Type)
+ *
+ * End Function Specification
+ */
+void amec_health_check_cent_temp()
+{
+ /*------------------------------------------------------------------------*/
+ /* Local Variables */
+ /*------------------------------------------------------------------------*/
+ uint16_t l_ot_error, l_cur_temp, l_max_temp; // threshold, current sample, historical max sample
+ sensor_t *l_sensor;
+ uint32_t l_cent; // loop index over centaurs
+ uint32_t l_callouts_count = 0; // callouts added to the error log currently being built
+ uint8_t l_new_callouts; // bitmap of centaurs that still need a callout
+ uint64_t l_huid;
+ errlHndl_t l_err = NULL; // error log being built; created on the first callout
+
+ /*------------------------------------------------------------------------*/
+ /* Code */
+ /*------------------------------------------------------------------------*/
+
+ // Check to see if any centaurs have reached the error temperature that
+ // haven't been called out already (XOR of the overtemp and logged bitmaps).
+ // NOTE(review): XOR also flags bits set only in the logged bitmap - confirm
+ // that an overtemp bit is never cleared once it has been logged.
+ l_new_callouts = G_cent_overtemp_bitmap ^ G_cent_overtemp_logged_bitmap;
+ if(!l_new_callouts)
+ {
+ return; // nothing new to report
+ }
+
+ l_ot_error = g_amec->thermalcent.ot_error;
+ l_sensor = getSensorByGsid(TEMP2MSCENT);
+ l_cur_temp = l_sensor->sample;
+ l_max_temp = l_sensor->sample_max;
+ TRAC_ERR("amec_health_check_cent_temp: Centaur reached error temp[%d]. cur_max[%d], hist_max[%d] bitmap[0x%02X]",
+ l_ot_error,
+ l_cur_temp,
+ l_max_temp,
+ l_new_callouts);
+
+ //find the centaur(s) that need to be called out; centaur N is tested with CENTAUR0_PRESENT_MASK >> N
+ for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
+ {
+ if(!(l_new_callouts & (CENTAUR0_PRESENT_MASK >> l_cent)))
+ {
+ continue; // no new callout for this centaur
+ }
+
+ l_huid = amec_mem_get_huid(l_cent, 0xff); // presumably 0xff as the dimm argument selects the centaur itself - TODO confirm
+
+ amec_mem_mark_logged(l_cent,
+ 0xff,
+ &G_cent_overtemp_logged_bitmap,
+ &G_dimm_overtemp_logged_bitmap.bytes[l_cent]);
+
+ //If we don't have an error log for the callout, create one
+ if(!l_err)
+ {
+ /* @
+ * @errortype
+ * @moduleid AMEC_HEALTH_CHECK_CENT_TEMP
+ * @reasoncode CENT_ERROR_TEMP
+ * @userdata1 Maximum centaur temperature
+ * @userdata2 Centaur temperature threshold
+ * @userdata4 OCC_NO_EXTENDED_RC
+ * @devdesc Centaur memory controller(s) exceeded maximum safe
+ * temperature.
+ */
+ l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TEMP, //modId
+ CENT_ERROR_TEMP, //reasoncode
+ OCC_NO_EXTENDED_RC, //Extended reason code
+ ERRL_SEV_PREDICTIVE, //Severity
+ NULL, //Trace Buf
+ DEFAULT_TRACE_SIZE, //Trace Size
+ l_max_temp, //userdata1
+ l_ot_error); //userdata2
+
+ // Callout the "over temperature" procedure
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_COMPONENT_ID,
+ ERRL_COMPONENT_ID_OVER_TEMPERATURE,
+ ERRL_CALLOUT_PRIORITY_HIGH);
+ l_callouts_count = 1; // the procedure callout counts toward the per-log limit
+ }
+
+ // Callout centaur
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ l_huid,
+ ERRL_CALLOUT_PRIORITY_MED);
+
+ l_callouts_count++;
+
+ //If we've reached the max # of callouts for an error log
+ //commit the error log
+ // NOTE(review): the next callout only opens a fresh log if commitErrl
+ // clears l_err - confirm against the commitErrl implementation.
+ if(l_callouts_count == ERRL_MAX_CALLOUTS)
+ {
+ commitErrl(&l_err);
+ }
+
+ }//iterate over centaurs
+
+ if(l_err) // commit a log still pending after the loop
+ {
+ commitErrl(&l_err);
+ }
+}
+
+/*
+ * Function Specification
+ *
+ * Name: amec_health_check_cent_timeout
+ *
+ * Description: Check for centaur timeout condition
+ * as defined in thermal control thresholds
+ * (MAX_READ_TIMEOUT field for Centaur FRU Type)
+ *
+ * End Function Specification
+ */
void amec_health_check_cent_timeout()
{
static uint8_t L_temp_update_bitmap_prev = 0;
OpenPOWER on IntegriCloud