diff options
author | Wael El-Essawy <welessa@us.ibm.com> | 2016-08-29 19:33:20 -0500 |
---|---|---|
committer | Wael El-Essawy <welessa@us.ibm.com> | 2016-09-16 11:19:15 -0400 |
commit | 8a7d7b2d13098453380fbdf69c0136515ba33b06 (patch) | |
tree | 9ce20ff68625d0a6d4b5d118514a48949232e173 /src/occ_405/amec/amec_health.c | |
parent | 444caf2b8e674263f27963b3c9739e48aa793d00 (diff) | |
download | talos-occ-8a7d7b2d13098453380fbdf69c0136515ba33b06.tar.gz talos-occ-8a7d7b2d13098453380fbdf69c0136515ba33b06.zip |
Memory Temperature Control Loop (memory throttling)
* Memory throttling due to over-temperature
* Throttle when the timeout for getting new temperature readings is reached
* Log an error when a temperature exceeds the ERROR threshold
Change-Id: I089c88aadba84e7296ad87b8cb87fa8c045ff912
RTC: 131188
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/28933
Reviewed-by: Wael El-Essawy <welessa@us.ibm.com>
Tested-by: Wael El-Essawy <welessa@us.ibm.com>
Diffstat (limited to 'src/occ_405/amec/amec_health.c')
-rwxr-xr-x | src/occ_405/amec/amec_health.c | 302 |
1 files changed, 180 insertions, 122 deletions
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 7eaa84e..116dd38 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -46,13 +46,13 @@ //*************************************************************************/ // Have we already called out the dimm for overtemp (bitmap of dimms)? -cent_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0}; +dimm_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0}; // Have we already called out the dimm for timeout (bitmap of dimms)? -cent_sensor_flags_t G_dimm_timeout_logged_bitmap = {0}; +dimm_sensor_flags_t G_dimm_timeout_logged_bitmap = {0}; // Are any dimms currently in the timedout state (bitmap of dimm)? -cent_sensor_flags_t G_dimm_temp_expired_bitmap = {0}; +dimm_sensor_flags_t G_dimm_temp_expired_bitmap = {0}; // Have we already called out the centaur for timeout (bitmap of centaurs)? uint8_t G_cent_timeout_logged_bitmap = 0; @@ -116,6 +116,18 @@ void amec_mem_mark_logged(uint8_t i_cent, } } + +/* + * Function Specification + * + * Name: amec_health_check_dimm_temp + * + * Description: Check if centaur's-dimm/rdimm-modules temperature exceeds the + * error temperature as defined in thermal control thresholds + * (ERROR field for Centaur/DIMM FRU Type) + * + * End Function Specification + */ void amec_health_check_dimm_temp() { /*------------------------------------------------------------------------*/ @@ -123,16 +135,26 @@ void amec_health_check_dimm_temp() /*------------------------------------------------------------------------*/ uint16_t l_ot_error, l_cur_temp, l_max_temp; sensor_t *l_sensor; - uint32_t l_cent, l_dimm; + uint8_t l_dimm; // per centaur/port dimms in cumulus/nimbus + uint8_t l_index; // tracks centaurs/ports in cumulus/nimbus + uint8_t l_max_index; // #centaurs/ports in cumulus/nimbus uint32_t l_callouts_count = 0; uint8_t l_new_callouts; uint64_t l_huid; errlHndl_t l_err = NULL; - 
/*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ + if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS) + { + l_max_index = NUM_I2C_PORTS; + } + else // MEM_TYPE_CUMULUS + { + l_max_index = MAX_NUM_CENTAURS; + } + // Check to see if any dimms have reached the error temperature that // haven't been called out already if(G_dimm_overtemp_bitmap.bigword == G_dimm_overtemp_logged_bitmap.bigword) @@ -149,12 +171,12 @@ void amec_health_check_dimm_temp() l_cur_temp, l_max_temp); - //iterate over all centaurs - for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++) + //iterate over all dimms + for(l_index = 0; l_index < l_max_index; l_index++) { //only callout a dimm if it hasn't been called out already - l_new_callouts = G_dimm_overtemp_bitmap.bytes[l_cent] ^ - G_dimm_overtemp_logged_bitmap.bytes[l_cent]; + l_new_callouts = G_dimm_overtemp_bitmap.bytes[l_index] ^ + G_dimm_overtemp_logged_bitmap.bytes[l_index]; //skip to next centaur if no new callouts for this one if(!l_new_callouts) @@ -165,17 +187,18 @@ void amec_health_check_dimm_temp() //find the dimm(s) that need to be called out behind this centaur for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++) { - if(!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm))) + if(!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm)) && + G_dimm_overtemp_bitmap.bytes[l_index]) { continue; } - l_huid = amec_mem_get_huid(l_cent, l_dimm); + l_huid = amec_mem_get_huid(l_index, l_dimm); - amec_mem_mark_logged(l_cent, + amec_mem_mark_logged(l_index, l_dimm, &G_cent_overtemp_logged_bitmap, - &G_dimm_overtemp_logged_bitmap.bytes[l_cent]); + &G_dimm_overtemp_logged_bitmap.bytes[l_index]); //If we don't have an error log for the callout, create one if(!l_err) @@ -236,113 +259,21 @@ void amec_health_check_dimm_temp() } } -void amec_health_check_cent_temp() -{ - /*------------------------------------------------------------------------*/ - /* Local 
Variables */ - /*------------------------------------------------------------------------*/ - uint16_t l_ot_error, l_cur_temp, l_max_temp; - sensor_t *l_sensor; - uint32_t l_cent; - uint32_t l_callouts_count = 0; - uint8_t l_new_callouts; - uint64_t l_huid; - errlHndl_t l_err = NULL; - - /*------------------------------------------------------------------------*/ - /* Code */ - /*------------------------------------------------------------------------*/ - - // Check to see if any centaurs have reached the error temperature that - // haven't been called out already - l_new_callouts = G_cent_overtemp_bitmap ^ G_cent_overtemp_logged_bitmap; - if(!l_new_callouts) - { - return; - } - - l_ot_error = g_amec->thermalcent.ot_error; - l_sensor = getSensorByGsid(TEMP2MSCENT); - l_cur_temp = l_sensor->sample; - l_max_temp = l_sensor->sample_max; - TRAC_ERR("amec_health_check_cent_temp: Centaur reached error temp[%d]. cur_max[%d], hist_max[%d] bitmap[0x%02X]", - l_ot_error, - l_cur_temp, - l_max_temp, - l_new_callouts); - - //find the centaur(s) that need to be called out - for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++) - { - if(!(l_new_callouts & (CENTAUR0_PRESENT_MASK >> l_cent))) - { - continue; - } - - l_huid = amec_mem_get_huid(l_cent, 0xff); - - amec_mem_mark_logged(l_cent, - 0xff, - &G_cent_overtemp_logged_bitmap, - &G_dimm_overtemp_logged_bitmap.bytes[l_cent]); - - //If we don't have an error log for the callout, create one - if(!l_err) - { - /* @ - * @errortype - * @moduleid AMEC_HEALTH_CHECK_CENT_TEMP - * @reasoncode CENT_ERROR_TEMP - * @userdata1 Maximum centaur temperature - * @userdata2 Centaur temperature threshold - * @userdata4 OCC_NO_EXTENDED_RC - * @devdesc Centaur memory controller(s) exceeded maximum safe - * temperature. 
- */ - l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TEMP, //modId - CENT_ERROR_TEMP, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - l_max_temp, //userdata1 - l_ot_error); //userdata2 - - // Callout the "over temperature" procedure - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_OVER_TEMPERATURE, - ERRL_CALLOUT_PRIORITY_HIGH); - l_callouts_count = 1; - } - - // Callout centaur - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - l_huid, - ERRL_CALLOUT_PRIORITY_MED); - - l_callouts_count++; - - //If we've reached the max # of callouts for an error log - //commit the error log - if(l_callouts_count == ERRL_MAX_CALLOUTS) - { - commitErrl(&l_err); - } - - }//iterate over centaurs - - if(l_err) - { - commitErrl(&l_err); - } -} - +/* + * Function Specification + * + * Name: amec_health_check_dimm_timeout + * + * Description: Check for centaur-dimm/rdimm-modules timeout condition + * as defined in thermal control thresholds + * (MAX_READ_TIMEOUT field for Centaur/DIMM FRU Type) + * + * End Function Specification + */ void amec_health_check_dimm_timeout() { - static cent_sensor_flags_t L_temp_update_bitmap_prev = {0}; - cent_sensor_flags_t l_need_inc, l_need_clr, l_temp_update_bitmap; + static dimm_sensor_flags_t L_temp_update_bitmap_prev = {0}; + dimm_sensor_flags_t l_need_inc, l_need_clr, l_temp_update_bitmap; uint8_t l_dimm, l_cent; fru_temp_t* l_fru; errlHndl_t l_err = NULL; @@ -363,7 +294,7 @@ void amec_health_check_dimm_timeout() G_dimm_temp_updated_bitmap.bigword = 0; //check if we need to increment any timers (haven't been updated in the last second) - l_need_inc.bigword = G_cent_enabled_sensors.bigword & ~l_temp_update_bitmap.bigword; + l_need_inc.bigword = G_dimm_enabled_sensors.bigword & ~l_temp_update_bitmap.bigword; //check if we need to clear any timers (updated now but not updated previously) l_need_clr.bigword = 
l_temp_update_bitmap.bigword & ~L_temp_update_bitmap_prev.bigword; @@ -448,8 +379,8 @@ void amec_health_check_dimm_timeout() continue; } - TRAC_ERR("Timed out reading dimm temperature on cent[%d] dimm[%d] temp[%d] flags[0x%02X]", - l_cent, l_dimm, l_fru->cur_temp, l_fru->flags); + TRAC_ERR("Timed out reading dimm temperature on cent/port[%d] dimm[%d] temp[%d] flags[0x%02X]", + l_cent, l_dimm, l_fru->cur_temp, l_fru->flags); if(!l_err) { @@ -540,7 +471,7 @@ void amec_health_check_dimm_timeout() //info trace each time we recover if(L_ran_once) { - TRAC_INFO("DIMM temperature collection has resumed on cent[%d] dimm[%d] temp[%d]", + TRAC_INFO("DIMM temperature collection has resumed on cent/port[%d] dimm[%d] temp[%d]", l_cent, l_dimm, l_fru->cur_temp); } @@ -550,6 +481,133 @@ void amec_health_check_dimm_timeout() L_ran_once = TRUE; } + + +/* + * Function Specification + * + * Name: amec_health_check_cent_dimm_temp + * + * Description: Check if the centaur's dimm chips temperature exceeds the error + * temperature as defined in thermal control thresholds + * (ERROR field for Centaur FRU Type) + * + * End Function Specification + */ +void amec_health_check_cent_temp() +{ + /*------------------------------------------------------------------------*/ + /* Local Variables */ + /*------------------------------------------------------------------------*/ + uint16_t l_ot_error, l_cur_temp, l_max_temp; + sensor_t *l_sensor; + uint32_t l_cent; + uint32_t l_callouts_count = 0; + uint8_t l_new_callouts; + uint64_t l_huid; + errlHndl_t l_err = NULL; + + /*------------------------------------------------------------------------*/ + /* Code */ + /*------------------------------------------------------------------------*/ + + // Check to see if any centaurs have reached the error temperature that + // haven't been called out already + l_new_callouts = G_cent_overtemp_bitmap ^ G_cent_overtemp_logged_bitmap; + if(!l_new_callouts) + { + return; + } + + l_ot_error = 
g_amec->thermalcent.ot_error; + l_sensor = getSensorByGsid(TEMP2MSCENT); + l_cur_temp = l_sensor->sample; + l_max_temp = l_sensor->sample_max; + TRAC_ERR("amec_health_check_cent_temp: Centaur reached error temp[%d]. cur_max[%d], hist_max[%d] bitmap[0x%02X]", + l_ot_error, + l_cur_temp, + l_max_temp, + l_new_callouts); + + //find the centaur(s) that need to be called out + for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++) + { + if(!(l_new_callouts & (CENTAUR0_PRESENT_MASK >> l_cent))) + { + continue; + } + + l_huid = amec_mem_get_huid(l_cent, 0xff); + + amec_mem_mark_logged(l_cent, + 0xff, + &G_cent_overtemp_logged_bitmap, + &G_dimm_overtemp_logged_bitmap.bytes[l_cent]); + + //If we don't have an error log for the callout, create one + if(!l_err) + { + /* @ + * @errortype + * @moduleid AMEC_HEALTH_CHECK_CENT_TEMP + * @reasoncode CENT_ERROR_TEMP + * @userdata1 Maximum centaur temperature + * @userdata2 Centaur temperature threshold + * @userdata4 OCC_NO_EXTENDED_RC + * @devdesc Centaur memory controller(s) exceeded maximum safe + * temperature. 
+ */ + l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TEMP, //modId + CENT_ERROR_TEMP, //reasoncode + OCC_NO_EXTENDED_RC, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + l_max_temp, //userdata1 + l_ot_error); //userdata2 + + // Callout the "over temperature" procedure + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_OVER_TEMPERATURE, + ERRL_CALLOUT_PRIORITY_HIGH); + l_callouts_count = 1; + } + + // Callout centaur + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_HUID, + l_huid, + ERRL_CALLOUT_PRIORITY_MED); + + l_callouts_count++; + + //If we've reached the max # of callouts for an error log + //commit the error log + if(l_callouts_count == ERRL_MAX_CALLOUTS) + { + commitErrl(&l_err); + } + + }//iterate over centaurs + + if(l_err) + { + commitErrl(&l_err); + } +} + +/* + * Function Specification + * + * Name: amec_health_check_cent_timeout + * + * Description: Check for centaur timeout condition + * as defined in thermal control thresholds + * (MAX_READ_TIMEOUT field for Centaur FRU Type) + * + * End Function Specification + */ void amec_health_check_cent_timeout() { static uint8_t L_temp_update_bitmap_prev = 0; |