diff options
author | Chris Cain <cjcain@us.ibm.com> | 2018-03-27 17:50:57 -0500 |
---|---|---|
committer | Christopher J. Cain <cjcain@us.ibm.com> | 2018-03-28 14:43:25 -0400 |
commit | b3a2f75d837fd671f13dacb2464c36a5fc8fc69d (patch) | |
tree | af9d67a1bac2b5fce6b81030ab7c0ba98bea4490 | |
parent | bd605ba0a030b3490f0edebd8fb704722b6eab0d (diff) | |
download | talos-occ-b3a2f75d837fd671f13dacb2464c36a5fc8fc69d.tar.gz talos-occ-b3a2f75d837fd671f13dacb2464c36a5fc8fc69d.zip |
Fix DIMM overtemp bitmap and trace updates
Change-Id: Ia0f998573316280f253eb3bc495f5c414c092461
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/56344
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
-rwxr-xr-x | src/occ_405/amec/amec_data.c | 28 | ||||
-rwxr-xr-x | src/occ_405/amec/amec_health.c | 25 | ||||
-rwxr-xr-x | src/occ_405/dimm/dimm.c | 4 |
3 files changed, 31 insertions, 26 deletions
diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c index 5857373..6a51503 100755 --- a/src/occ_405/amec/amec_data.c +++ b/src/occ_405/amec/amec_data.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2017 */ +/* Contributors Listed Below - COPYRIGHT 2011,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -157,6 +157,7 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) cmdh_thrm_thresholds_set_t *l_frudata = NULL; uint8_t l_dvfs_temp = 0; uint8_t l_error = 0; + bool l_pm_limits = false; /*------------------------------------------------------------------------*/ /* Code */ @@ -182,6 +183,9 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) } else { + l_pm_limits = true; + TRAC_INFO("AMEC_data_write_thrm_thresholds: Using PM limits"); + l_dvfs_temp = l_frudata[DATA_FRU_PROC].pm_dvfs; if(i_mode == OCC_MODE_TURBO) { @@ -200,11 +204,11 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) // Store the temperature timeout value g_amec->thermalproc.temp_timeout = l_frudata[DATA_FRU_PROC].max_read_timeout; - TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for processor", - l_dvfs_temp); + TRAC_INFO("AMEC_data_write_thrm_thresholds: Processor setpoints - DVFS: %u, Error: %u", + l_dvfs_temp, l_error); // Store the Centaur thermal data - if ((i_mode == OCC_MODE_NOMINAL) || (G_sysConfigData.system_type.kvm)) + if (!l_pm_limits) { // use normal thresholds for Nominal or OPAL l_dvfs_temp = l_frudata[DATA_FRU_CENTAUR].dvfs; @@ -231,11 +235,11 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) // Store the temperature timeout value g_amec->thermalcent.temp_timeout = l_frudata[DATA_FRU_CENTAUR].max_read_timeout; - TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for Centaur", - l_dvfs_temp); + TRAC_INFO("AMEC_data_write_thrm_thresholds: Centaur setpoints - 
DVFS: %u, Error: %u", + l_dvfs_temp, l_error); // Store the DIMM thermal data - if ((i_mode == OCC_MODE_NOMINAL) || (G_sysConfigData.system_type.kvm)) + if (!l_pm_limits) { // use normal thresholds for Nominal or OPAL l_dvfs_temp = l_frudata[DATA_FRU_DIMM].dvfs; @@ -261,8 +265,8 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) // Store the temperature timeout value g_amec->thermaldimm.temp_timeout = l_frudata[DATA_FRU_DIMM].max_read_timeout; - TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for DIMM", - l_dvfs_temp); + TRAC_INFO("AMEC_data_write_thrm_thresholds: DIMM setpoints - DVFS: %u, Error: %u", + l_dvfs_temp, l_error); g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM_OT_STATUS].error_count; @@ -270,7 +274,7 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) g_amec->vrhotproc.setpoint); // Store the VRM Vdd thermal data - if ((i_mode == OCC_MODE_NOMINAL) || (G_sysConfigData.system_type.kvm)) + if (!l_pm_limits) { // use normal thresholds for Nominal or OPAL l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].dvfs; @@ -296,8 +300,8 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) // Store the temperature timeout value g_amec->thermalvdd.temp_timeout = l_frudata[DATA_FRU_VRM_VDD].max_read_timeout; - TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRM Vdd", - l_dvfs_temp); + TRAC_INFO("AMEC_data_write_thrm_thresholds: VRM Vdd setpoints - DVFS: %u, Error: %u", + l_dvfs_temp, l_error); } while(0); diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 60d5a81..b80d043 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2017 */ +/* Contributors Listed Below - COPYRIGHT 2011,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -171,10 +171,6 @@ void amec_health_check_dimm_temp() l_sensor = getSensorByGsid(TEMPDIMMTHRM); l_cur_temp = l_sensor->sample; l_max_temp = l_sensor->sample_max; - TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. cur_max[%d], hist_max[%d]", - l_ot_error, - l_cur_temp, - l_max_temp); //iterate over all dimms for(l_port = 0; l_port < l_max_port; l_port++) @@ -184,16 +180,21 @@ void amec_health_check_dimm_temp() G_dimm_overtemp_logged_bitmap.bytes[l_port]; //skip to next port if no new callouts for this one - if(!l_new_callouts) + if (!l_new_callouts || (G_dimm_overtemp_bitmap.bytes[l_port] == 0)) { continue; } + TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. current[%d], hist_max[%d], port[%d]", + l_ot_error, + l_cur_temp, + l_max_temp, + l_port); + //find the dimm(s) that need to be called out for this port for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++) { - if(!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm)) && - G_dimm_overtemp_bitmap.bytes[l_port]) + if (!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm))) { continue; } @@ -217,8 +218,8 @@ void amec_health_check_dimm_temp() * @errortype * @moduleid AMEC_HEALTH_CHECK_DIMM_TEMP * @reasoncode DIMM_ERROR_TEMP - * @userdata1 Maximum dimm temperature - * @userdata2 Dimm temperature threshold + * @userdata1 Maximum DIMM temperature + * @userdata2 DIMM temperature threshold * @userdata4 OCC_NO_EXTENDED_RC * @devdesc Memory DIMM(s) exceeded maximum safe * temperature. 
@@ -321,7 +322,7 @@ void amec_health_check_dimm_timeout() if(G_dimm_temp_expired_bitmap.bytes[l_port]) { G_dimm_temp_expired_bitmap.bytes[l_port] = 0; - TRAC_INFO("All dimm sensors for centaur %d have been updated", l_port); + TRAC_INFO("All DIMM sensors for port %d have been updated", l_port); } continue; } @@ -519,7 +520,7 @@ void amec_health_check_cent_temp() l_sensor = getSensorByGsid(TEMPCENT); l_cur_temp = l_sensor->sample; l_max_temp = l_sensor->sample_max; - TRAC_ERR("amec_health_check_cent_temp: Centaur reached error temp[%d]. cur_max[%d], hist_max[%d] bitmap[0x%02X]", + TRAC_ERR("amec_health_check_cent_temp: Centaur reached error temp[%d]. current[%d], hist_max[%d], bitmap[0x%02X]", l_ot_error, l_cur_temp, l_max_temp, diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c index bcdfb6c..40af9de 100755 --- a/src/occ_405/dimm/dimm.c +++ b/src/occ_405/dimm/dimm.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2017 */ +/* Contributors Listed Below - COPYRIGHT 2011,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -684,7 +684,7 @@ void process_dimm_temp() if (l_dimm_temp >= g_amec->thermaldimm.ot_error) { //Set a bit so that this dimm can be called out by the thermal thread - G_dimm_overtemp_bitmap.bytes[port] |= 1 << dimm; + G_dimm_overtemp_bitmap.bytes[port] |= DIMM_SENSOR0 >> dimm; } l_fru->cur_temp = l_dimm_temp; |