summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: William Bryan <wilbryan@us.ibm.com> 2016-06-20 16:24:41 -0500
committer: William A. Bryan <wilbryan@us.ibm.com> 2016-06-29 14:27:02 -0400
commit: ad4295664e98414db0dcbf0d37fa5bde6dae80ca (patch)
tree: dd9080bc7c2f8eab147703c8a4e0a4b3833ec58e
parent: be72a02c54979ecee2a57649c6f9dd49ca5f2525 (diff)
download: talos-occ-ad4295664e98414db0dcbf0d37fa5bde6dae80ca.tar.gz
download: talos-occ-ad4295664e98414db0dcbf0d37fa5bde6dae80ca.zip
Fix DIMM temperature error handling for poll response
RTC:155187
Change-Id: I38039dc18de9bfc5b9194f63b3b869bf7c16991f
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/26067
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r-- src/occ_405/amec/amec_controller.c 7
-rwxr-xr-x src/occ_405/amec/amec_freq.c 17
-rwxr-xr-x src/occ_405/amec/amec_health.c 63
-rwxr-xr-x src/occ_405/cmdh/cmdh_fsp_cmds.c 7
4 files changed, 51 insertions, 43 deletions
diff --git a/src/occ_405/amec/amec_controller.c b/src/occ_405/amec/amec_controller.c
index d0ecfc3..fe27587 100644
--- a/src/occ_405/amec/amec_controller.c
+++ b/src/occ_405/amec/amec_controller.c
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2015 */
+/* Contributors Listed Below - COPYRIGHT 2011,2016 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -29,11 +29,12 @@
#include <occ_common.h>
#include <sensor.h>
#include <amec_sys.h>
+#include <centaur_data.h>
//*************************************************************************
// Externs
//*************************************************************************
-extern uint8_t G_dimm_temp_expired_bitmap;
+extern cent_sensor_flags_t G_dimm_temp_expired_bitmap;
extern uint8_t G_cent_temp_expired_bitmap;
//*************************************************************************
// Macros
@@ -178,7 +179,7 @@ void amec_controller_dimm_thermal()
// Get TEMP2MSDIMM sensor value
l_sensor = getSensorByGsid(TEMP2MSDIMM);
- if(G_dimm_temp_expired_bitmap)
+ if(G_dimm_temp_expired_bitmap.bigword)
{
//we were not able to read one or more dimm temperatures.
//Assume temperature is at the setpoint plus 1 degree C.
diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c
index 0afe676..93a43fb 100755
--- a/src/occ_405/amec/amec_freq.c
+++ b/src/occ_405/amec/amec_freq.c
@@ -1,11 +1,11 @@
/* IBM_PROLOG_BEGIN_TAG */
/* This is an automatically generated prolog. */
/* */
-/* $Source: src/occ/amec/amec_freq.c $ */
+/* $Source: src/occ_405/amec/amec_freq.c $ */
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2015 */
+/* Contributors Listed Below - COPYRIGHT 2011,2016 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -48,12 +48,13 @@
#include <amec_data.h>
#include <amec_freq.h>
#include "pss_constants.h"
+#include <centaur_data.h>
//*************************************************************************
// Externs
//*************************************************************************
extern uint8_t G_cent_temp_expired_bitmap;
-extern uint8_t G_dimm_temp_expired_bitmap;
+extern cent_sensor_flags_t G_dimm_temp_expired_bitmap;
//*************************************************************************
// Defines/Enums
@@ -78,7 +79,7 @@ const uint32_t G_pmc_ffdc_scom_addrs[PMC_FFDC_SCOM_ADDRS_SIZE] =
PMC_LFIR_ERR_MASK_REG,
OCB_OCCLFIR,
PBA_FIR,
- TOD_VALUE_REG
+ TOD_VALUE_REG
};
//FFDC OCI addresses as requested by Greg Still in defect SW247927
@@ -618,14 +619,12 @@ void amec_slv_mem_voting_box(void)
if(!L_throttle_traced)
{
L_throttle_traced = TRUE;
-// @TODO - TEMP - No dimm temp Colection yet
-/*
- TRAC_INFO("Memory is being throttled. reason[%d] vote[%d] cent_expired[0x%02x] dimm_expired[0x%02x]",
+ TRAC_INFO("Memory is being throttled. reason[%d] vote[%d] cent_expired[0x%02x] dimm_expired[0x%08x%08x]",
l_reason,
l_vote,
G_cent_temp_expired_bitmap,
- G_dimm_temp_expired_bitmap);
-*/
+ G_dimm_temp_expired_bitmap.words[0],
+ G_dimm_temp_expired_bitmap.words[1]);
}
}
else
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index 1e061bf..a559f04 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -23,9 +23,9 @@
/* */
/* IBM_PROLOG_END_TAG */
-//*************************************************************************
+//*************************************************************************/
// Includes
-//*************************************************************************
+//*************************************************************************/
#include "amec_health.h"
#include "amec_sys.h"
#include "amec_service_codes.h"
@@ -33,17 +33,17 @@
#include <centaur_data.h>
#include <proc_data.h>
-//*************************************************************************
+//*************************************************************************/
// Externs
-//*************************************************************************
+//*************************************************************************/
-//*************************************************************************
+//*************************************************************************/
// Defines/Enums
-//*************************************************************************
+//*************************************************************************/
-//*************************************************************************
+//*************************************************************************/
// Globals
-//*************************************************************************
+//*************************************************************************/
// Have we already called out the dimm for overtemp (bitmap of dimms)?
cent_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0};
@@ -51,9 +51,8 @@ cent_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0};
// Have we already called out the dimm for timeout (bitmap of dimms)?
cent_sensor_flags_t G_dimm_timeout_logged_bitmap = {0};
-// Are any dimms currently in the timedout state (bitmap of centaurs)?
-// Note: this only tells you which centaur, not which dimm.
-uint8_t G_dimm_temp_expired_bitmap = 0;
+// Are any dimms currently in the timedout state (bitmap of dimm)?
+cent_sensor_flags_t G_dimm_temp_expired_bitmap = {0};
// Have we already called out the centaur for timeout (bitmap of centaurs)?
uint8_t G_cent_timeout_logged_bitmap = 0;
@@ -67,13 +66,13 @@ uint8_t G_cent_temp_expired_bitmap = 0;
// Array to store the update tag of each core's temperature sensor
uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0};
-//*************************************************************************
+//*************************************************************************/
// Function Declarations
-//*************************************************************************
+//*************************************************************************/
-//*************************************************************************
+//*************************************************************************/
// Functions
-//*************************************************************************
+//*************************************************************************/
uint64_t amec_mem_get_huid(uint8_t i_cent, uint8_t i_dimm)
{
uint64_t l_huid;
@@ -380,16 +379,17 @@ void amec_health_check_dimm_timeout()
break;
}
- //iterate across all centaurs incrementing dimm sensor timers as needed
+ //iterate across all centaurs/ports incrementing dimm sensor timers as needed
for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
{
//any dimm timers behind this centaur need incrementing?
if(!l_need_inc.bytes[l_cent])
{
- //all dimm sensors were updated for this centaur. Clear the dimm timeout bit for this centaur.
- if(G_dimm_temp_expired_bitmap & (CENTAUR0_PRESENT_MASK >> l_cent))
+ // All dimm sensors were updated for this centaur/port
+ // Trace this fact and clear the expired byte for all DIMMs on this centaur/port
+ if(G_dimm_temp_expired_bitmap.bytes[l_cent])
{
- G_dimm_temp_expired_bitmap &= ~(CENTAUR0_PRESENT_MASK >> l_cent);
+ G_dimm_temp_expired_bitmap.bytes[l_cent] = 0;
TRAC_INFO("All dimm sensors for centaur %d have been updated", l_cent);
}
continue;
@@ -398,9 +398,14 @@ void amec_health_check_dimm_timeout()
//There's at least one dimm requiring an increment, find the dimm
for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++)
{
- //not this one, go to next one
+ //not this one, check if we need to clear the dimm timeout and go to the next one
if(!(l_need_inc.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm)))
{
+ // Clear this one if needed
+ if(G_dimm_temp_expired_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm))
+ {
+ G_dimm_temp_expired_bitmap.bytes[l_cent] &= ~(DIMM_SENSOR0 >> l_dimm);
+ }
continue;
}
@@ -430,12 +435,12 @@ void amec_health_check_dimm_timeout()
continue;
}
- //temperature has expired. Notify control algorithms which centaur.
- if(!(G_dimm_temp_expired_bitmap & (CENTAUR0_PRESENT_MASK >> l_cent)))
+ //temperature has expired. Notify control algorithms which DIMM
+ if(!(G_dimm_temp_expired_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm)))
{
- G_dimm_temp_expired_bitmap |= CENTAUR0_PRESENT_MASK >> l_cent;
- TRAC_ERR("Timed out reading dimm temperature sensor on cent %d.",
- l_cent);
+ G_dimm_temp_expired_bitmap.bytes[l_cent] |= (DIMM_SENSOR0 >> l_dimm);
+ TRAC_ERR("Timed out reading dimm temperature sensor on cent %d dimm %d.",
+ l_cent, l_dimm);
}
//If we've already logged an error for this FRU go to the next one.
@@ -460,7 +465,7 @@ void amec_health_check_dimm_timeout()
*
*/
l_err = createErrl(AMEC_HEALTH_CHECK_DIMM_TIMEOUT, //modId
- FRU_TEMP_TIMEOUT, //reasoncode
+ FRU_TEMP_TIMEOUT, //reasoncode
OCC_NO_EXTENDED_RC, //Extended reason code
ERRL_SEV_PREDICTIVE, //Severity
NULL, //Trace Buf
@@ -496,7 +501,7 @@ void amec_health_check_dimm_timeout()
&G_dimm_timeout_logged_bitmap.bytes[l_cent]);
} //iterate over all dimms
- } //iterate over all centaurs
+ } //iterate over all centaurs/ports
if(l_err)
{
@@ -509,7 +514,7 @@ void amec_health_check_dimm_timeout()
break;
}
- //iterate across all centaurs clearing dimm sensor timers as needed
+ //iterate across all centaurs/ports clearing dimm sensor timers as needed
for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
{
@@ -541,7 +546,7 @@ void amec_health_check_dimm_timeout()
}
}//iterate over all dimms
- }//iterate over all centaurs
+ }//iterate over all centaurs/ports
}while(0);
L_ran_once = TRUE;
}
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 3b7da38..48d1d5d 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -46,6 +46,9 @@
#include "amec_master_smh.h"
#include <proc_data.h>
#include "homer.h"
+#include <centaur_data.h>
+
+extern cent_sensor_flags_t G_dimm_temp_expired_bitmap;
// This table contains tunable parameter information that can be exposed to
// customers (only Master OCC should access/control this table)
@@ -261,7 +264,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
l_tempSensorList[l_sensorHeader.count].id = g_amec->proc[0].memctl[l_port].centaur.dimm_temps[l_dimm].temp_sid;
//If a dimm timed out long enough, we should return 0xFFFF for that sensor.
- if (G_dimm_timeout_logged_bitmap.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm))
+ if (G_dimm_temp_expired_bitmap.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm))
{
l_tempSensorList[l_sensorHeader.count].value = 0xFFFF;
}
@@ -300,7 +303,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
{
l_tempSensorList[l_sensorHeader.count].id = g_amec->proc[0].memctl[l_cent].centaur.dimm_temps[l_dimm].temp_sid;
//If a dimm timed out long enough, we should return 0xFFFF for that sensor.
- if (G_dimm_timeout_logged_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm))
+ if (G_dimm_temp_expired_bitmap.bytes[l_cent] & (DIMM_SENSOR0 >> l_dimm))
{
l_tempSensorList[l_sensorHeader.count].value = 0xFFFF;
}
OpenPOWER on IntegriCloud