diff options
author | William Bryan <wilbryan@us.ibm.com> | 2016-06-20 16:24:41 -0500 |
---|---|---|
committer | William A. Bryan <wilbryan@us.ibm.com> | 2016-06-29 14:27:02 -0400 |
commit | ad4295664e98414db0dcbf0d37fa5bde6dae80ca (patch) | |
tree | dd9080bc7c2f8eab147703c8a4e0a4b3833ec58e | |
parent | be72a02c54979ecee2a57649c6f9dd49ca5f2525 (diff) | |
download | talos-occ-ad4295664e98414db0dcbf0d37fa5bde6dae80ca.tar.gz talos-occ-ad4295664e98414db0dcbf0d37fa5bde6dae80ca.zip |
Fix DIMM temperature error handling for poll response
RTC:155187
Change-Id: I38039dc18de9bfc5b9194f63b3b869bf7c16991f
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/26067
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r-- | src/occ_405/amec/amec_controller.c | 7 | ||||
-rwxr-xr-x | src/occ_405/amec/amec_freq.c | 17 | ||||
-rwxr-xr-x | src/occ_405/amec/amec_health.c | 63 | ||||
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds.c | 7 |
4 files changed, 51 insertions, 43 deletions
diff --git a/src/occ_405/amec/amec_controller.c b/src/occ_405/amec/amec_controller.c index d0ecfc3..fe27587 100644 --- a/src/occ_405/amec/amec_controller.c +++ b/src/occ_405/amec/amec_controller.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2015 */ +/* Contributors Listed Below - COPYRIGHT 2011,2016 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -29,11 +29,12 @@ #include <occ_common.h> #include <sensor.h> #include <amec_sys.h> +#include <centaur_data.h> //************************************************************************* // Externs //************************************************************************* -extern uint8_t G_dimm_temp_expired_bitmap; +extern cent_sensor_flags_t G_dimm_temp_expired_bitmap; extern uint8_t G_cent_temp_expired_bitmap; //************************************************************************* // Macros @@ -178,7 +179,7 @@ void amec_controller_dimm_thermal() // Get TEMP2MSDIMM sensor value l_sensor = getSensorByGsid(TEMP2MSDIMM); - if(G_dimm_temp_expired_bitmap) + if(G_dimm_temp_expired_bitmap.bigword) { //we were not able to read one or more dimm temperatures. //Assume temperature is at the setpoint plus 1 degree C. diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c index 0afe676..93a43fb 100755 --- a/src/occ_405/amec/amec_freq.c +++ b/src/occ_405/amec/amec_freq.c @@ -1,11 +1,11 @@ /* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ -/* $Source: src/occ/amec/amec_freq.c $ */ +/* $Source: src/occ_405/amec/amec_freq.c $ */ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2015 */ +/* Contributors Listed Below - COPYRIGHT 2011,2016 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -48,12 +48,13 @@ #include <amec_data.h> #include <amec_freq.h> #include "pss_constants.h" +#include <centaur_data.h> //************************************************************************* // Externs //************************************************************************* extern uint8_t G_cent_temp_expired_bitmap; -extern uint8_t G_dimm_temp_expired_bitmap; +extern cent_sensor_flags_t G_dimm_temp_expired_bitmap; //************************************************************************* // Defines/Enums @@ -78,7 +79,7 @@ const uint32_t G_pmc_ffdc_scom_addrs[PMC_FFDC_SCOM_ADDRS_SIZE] = PMC_LFIR_ERR_MASK_REG, OCB_OCCLFIR, PBA_FIR, - TOD_VALUE_REG + TOD_VALUE_REG }; //FFDC OCI addresses as requested by Greg Still in defect SW247927 @@ -618,14 +619,12 @@ void amec_slv_mem_voting_box(void) if(!L_throttle_traced) { L_throttle_traced = TRUE; -// @TODO - TEMP - No dimm temp Colection yet -/* - TRAC_INFO("Memory is being throttled. reason[%d] vote[%d] cent_expired[0x%02x] dimm_expired[0x%02x]", + TRAC_INFO("Memory is being throttled. reason[%d] vote[%d] cent_expired[0x%02x] dimm_expired[0x%08x%08x]", l_reason, l_vote, G_cent_temp_expired_bitmap, - G_dimm_temp_expired_bitmap); -*/ + G_dimm_temp_expired_bitmap.words[0], + G_dimm_temp_expired_bitmap.words[1]); } } else diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 1e061bf..a559f04 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -23,9 +23,9 @@ /* */ /* IBM_PROLOG_END_TAG */ -//************************************************************************* +//*************************************************************************/ // Includes -//************************************************************************* +//*************************************************************************/ #include "amec_health.h" #include "amec_sys.h" #include "amec_service_codes.h" @@ -33,17 +33,17 @@ #include <centaur_data.h> #include <proc_data.h> -//************************************************************************* +//*************************************************************************/ // Externs -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Defines/Enums -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Globals -//************************************************************************* +//*************************************************************************/ // Have we already called out the dimm for overtemp (bitmap of dimms)? cent_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0}; @@ -51,9 +51,8 @@ cent_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0}; // Have we already called out the dimm for timeout (bitmap of dimms)? cent_sensor_flags_t G_dimm_timeout_logged_bitmap = {0}; -// Are any dimms currently in the timedout state (bitmap of centaurs)? -// Note: this only tells you which centaur, not which dimm. -uint8_t G_dimm_temp_expired_bitmap = 0; +// Are any dimms currently in the timedout state (bitmap of dimm)? +cent_sensor_flags_t G_dimm_temp_expired_bitmap = {0}; // Have we already called out the centaur for timeout (bitmap of centaurs)? uint8_t G_cent_timeout_logged_bitmap = 0; @@ -67,13 +66,13 @@ uint8_t G_cent_temp_expired_bitmap = 0; // Array to store the update tag of each core's temperature sensor uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0}; -//************************************************************************* +//*************************************************************************/ // Function Declarations -//************************************************************************* +//*************************************************************************/ -//************************************************************************* +//*************************************************************************/ // Functions -//************************************************************************* +//*************************************************************************/ uint64_t amec_mem_get_huid(uint8_t i_cent, uint8_t i_dimm) { uint64_t l_huid; @@ -380,16 +379,17 @@ void amec_health_check_dimm_timeout() break; } - //iterate across all centaurs incrementing dimm sensor timers as needed + //iterate across all centaurs/ports incrementing dimm sensor timers as needed for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++) { //any dimm timers behind this centaur need incrementing? if(!l_need_inc.bytes[l_cent]) { - //all dimm sensors were updated for this centaur. Clear the dimm timeout bit for this centaur. - if(G_dimm_temp_expired_bitmap & (CENTAUR0_PRESENT_MASK >> l_cent)) + // All dimm sensors were updated for this centaur/port + // Trace this fact and clear the expired byte for all DIMMs on this centaur/port + if(G_dimm_temp_expired_bitmap.bytes[l_cent]) { - G_dimm_temp_expired_bitmap &= ~(CENTAUR0_PRESENT_MASK >> l_cent); + G_dimm_temp_expired_bitmap.bytes[l_cent] = 0; TRAC_INFO("All dimm sensors for centaur %d have been updated", l_cent); } continue; @@ -398,9 +398,14 @@ void amec_health_check_dimm_timeout() //There's at least one dimm requiring an increment, find the dimm for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++) { - //not this one, go to next one + //not this one, check if we need to clear the dimm timeout and go to the next one if(!(l_need_inc.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm))) { + // Clear this one if needed + if(G_dimm_temp_expired_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm)) + { + G_dimm_temp_expired_bitmap.bytes[l_cent] &= ~(DIMM_SENSOR0 >> l_dimm); + } continue; } @@ -430,12 +435,12 @@ void amec_health_check_dimm_timeout() continue; } - //temperature has expired. Notify control algorithms which centaur. - if(!(G_dimm_temp_expired_bitmap & (CENTAUR0_PRESENT_MASK >> l_cent))) + //temperature has expired. Notify control algorithms which DIMM + if(!(G_dimm_temp_expired_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm))) { - G_dimm_temp_expired_bitmap |= CENTAUR0_PRESENT_MASK >> l_cent; - TRAC_ERR("Timed out reading dimm temperature sensor on cent %d.", - l_cent); + G_dimm_temp_expired_bitmap.bytes[l_cent] |= (DIMM_SENSOR0 >> l_dimm); + TRAC_ERR("Timed out reading dimm temperature sensor on cent %d dimm %d.", + l_cent, l_dimm); } //If we've already logged an error for this FRU go to the next one. @@ -460,7 +465,7 @@ void amec_health_check_dimm_timeout() * */ l_err = createErrl(AMEC_HEALTH_CHECK_DIMM_TIMEOUT, //modId - FRU_TEMP_TIMEOUT, //reasoncode + FRU_TEMP_TIMEOUT, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf @@ -496,7 +501,7 @@ void amec_health_check_dimm_timeout() &G_dimm_timeout_logged_bitmap.bytes[l_cent]); } //iterate over all dimms - } //iterate over all centaurs + } //iterate over all centaurs/ports if(l_err) { @@ -509,7 +514,7 @@ void amec_health_check_dimm_timeout() break; } - //iterate across all centaurs clearing dimm sensor timers as needed + //iterate across all centaurs/ports clearing dimm sensor timers as needed for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++) { @@ -541,7 +546,7 @@ void amec_health_check_dimm_timeout() } }//iterate over all dimms - }//iterate over all centaurs + }//iterate over all centaurs/ports }while(0); L_ran_once = TRUE; } diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 3b7da38..48d1d5d 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -46,6 +46,9 @@ #include "amec_master_smh.h" #include <proc_data.h> #include "homer.h" +#include <centaur_data.h> + +extern cent_sensor_flags_t G_dimm_temp_expired_bitmap; // This table contains tunable parameter information that can be exposed to // customers (only Master OCC should access/control this table) @@ -261,7 +264,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) l_tempSensorList[l_sensorHeader.count].id = g_amec->proc[0].memctl[l_port].centaur.dimm_temps[l_dimm].temp_sid; //If a dimm timed out long enough, we should return 0xFFFF for that sensor. - if (G_dimm_timeout_logged_bitmap.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm)) + if (G_dimm_temp_expired_bitmap.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm)) { l_tempSensorList[l_sensorHeader.count].value = 0xFFFF; } @@ -300,7 +303,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) { l_tempSensorList[l_sensorHeader.count].id = g_amec->proc[0].memctl[l_cent].centaur.dimm_temps[l_dimm].temp_sid; //If a dimm timed out long enough, we should return 0xFFFF for that sensor. - if (G_dimm_timeout_logged_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm)) + if (G_dimm_temp_expired_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm)) { l_tempSensorList[l_sensorHeader.count].value = 0xFFFF; } |