summary refs log tree commit diff stats
path: root/src/occ_405/amec
diff options
context:
space:
mode:
Diffstat (limited to 'src/occ_405/amec')
-rw-r--r-- src/occ_405/amec/amec_controller.c 236
-rw-r--r-- src/occ_405/amec/amec_controller.h 4
-rwxr-xr-x src/occ_405/amec/amec_data.c 92
-rwxr-xr-x src/occ_405/amec/amec_freq.c 24
-rw-r--r-- src/occ_405/amec/amec_freq.h 5
-rwxr-xr-x src/occ_405/amec/amec_health.c 126
-rw-r--r-- src/occ_405/amec/amec_init.c 15
-rw-r--r-- src/occ_405/amec/amec_sensors_ocmb.c 364
-rwxr-xr-x src/occ_405/amec/amec_sys.h 27
9 files changed, 660 insertions, 233 deletions
diff --git a/src/occ_405/amec/amec_controller.c b/src/occ_405/amec/amec_controller.c
index 1f02eda..a74ad2b 100644
--- a/src/occ_405/amec/amec_controller.c
+++ b/src/occ_405/amec/amec_controller.c
@@ -36,6 +36,8 @@
//*************************************************************************
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern uint16_t G_cent_temp_expired_bitmap;
+extern uint8_t G_ocm_dts_type_expired_bitmap;
+
//*************************************************************************
// Macros
//*************************************************************************
@@ -246,8 +248,8 @@ void amec_controller_vrm_vdd_thermal()
// Description: This function implements the Proportional Controller for the
// DIMM thermal control. Although it doesn't return any
// results, it populates the thermal vote in the field
-// g_amec->thermaldimm.speed_request.
-//
+// g_amec->thermaldimm.speed_request, g_amec->thermalmcdimm.speed_request,
+// g_amec->thermalpmic.speed_request and g_amec->thermalmcext.speed_request.
// Task Flags:
//
// End Function Specification
@@ -256,82 +258,197 @@ void amec_controller_dimm_thermal()
/*------------------------------------------------------------------------*/
/* Local Variables */
/*------------------------------------------------------------------------*/
+ uint8_t i = 0;
+ uint8_t l_max_dimm_types = 0;
+ const uint16_t l_dimm_types[4] = {DATA_FRU_DIMM,
+ DATA_FRU_MEMCTRL_DRAM,
+ DATA_FRU_PMIC,
+ DATA_FRU_MEMCTRL_EXT};
uint16_t l_thermal_winner = 0;
uint16_t l_residue = 0;
uint16_t l_old_residue = 0;
+ uint16_t l_throttle_temp = 0;
+ uint16_t l_Pgain = 0;
+ uint16_t * l_speed_request = NULL;
+ uint16_t * l_total_res = NULL;
int16_t l_error = 0;
int16_t l_mem_speed = 0;
int16_t l_throttle_chg = 0;
int32_t l_throttle = 0;
sensor_t * l_sensor = NULL;
+ bool l_timeout = false;
/*------------------------------------------------------------------------*/
/* Code */
/*------------------------------------------------------------------------*/
- // Get TEMPDIMMTHRM sensor value
- l_sensor = getSensorByGsid(TEMPDIMMTHRM);
-
- if(G_dimm_temp_expired_bitmap.dw[0] ||
- G_dimm_temp_expired_bitmap.dw[1])
- {
- //we were not able to read one or more dimm temperatures.
- //Assume temperature is at the setpoint plus 1 degree C.
- l_thermal_winner = g_amec->thermaldimm.setpoint + 10;
- }
- else
- {
- // Use the highest temperature of all DIMMs in 0.1 degrees C
- l_thermal_winner = l_sensor->sample * 10;
- }
-
- // Check if there is an error
- if (g_amec->thermaldimm.setpoint == l_thermal_winner)
- return;
-
- // Calculate the thermal control error
- l_error = g_amec->thermaldimm.setpoint - l_thermal_winner;
-
- // Proportional Controller for the thermal control loop based on DIMM
- // temperatures
- l_throttle = (int32_t) l_error * g_amec->thermaldimm.Pgain;
- l_residue = (uint16_t) l_throttle;
- l_throttle_chg = (int16_t) (l_throttle >> 16);
-
- if ((int16_t) l_throttle_chg > AMEC_MEMORY_SPEED_CHANGE_LIMIT)
+ // loop for the number of different fru types the "dimm" sensors can be
+ // to determine memory throttle based on each type
+ if(MEM_TYPE_OCM == G_sysConfigData.mem_type)
{
- l_throttle_chg = AMEC_MEMORY_SPEED_CHANGE_LIMIT;
+ // all 4 types are possible:
+ l_max_dimm_types = 4;
}
else
{
- if ((int16_t) l_throttle_chg < (-AMEC_MEMORY_SPEED_CHANGE_LIMIT))
- {
- l_throttle_chg = -AMEC_MEMORY_SPEED_CHANGE_LIMIT;
- }
+ // can only be the one DATA_FRU_DIMM type which must be listed first in l_dimm_types
+ l_max_dimm_types = 1;
}
- // Calculate the new thermal speed request for DIMMs
- l_mem_speed = g_amec->thermaldimm.speed_request +
- (int16_t) l_throttle_chg * AMEC_MEMORY_STEP_SIZE;
-
- // Proceed with residue summation to correctly follow set-point
- l_old_residue = g_amec->thermaldimm.total_res;
- g_amec->thermaldimm.total_res += l_residue;
- if (g_amec->thermaldimm.total_res < l_old_residue)
+ for(i= 0; i < l_max_dimm_types; i++)
{
- l_mem_speed += AMEC_MEMORY_STEP_SIZE;
- }
-
- // Enforce actuator saturation limits
- if (l_mem_speed > AMEC_MEMORY_MAX_STEP)
- l_mem_speed = AMEC_MEMORY_MAX_STEP;
- if (l_mem_speed < AMEC_MEMORY_MIN_STEP)
- l_mem_speed = AMEC_MEMORY_MIN_STEP;
-
- // Generate the new thermal speed request
- g_amec->thermaldimm.speed_request = (uint16_t) l_mem_speed;
-
- // Update the Memory OT Throttle Sensor
- if(g_amec->thermaldimm.speed_request < AMEC_MEMORY_MAX_STEP)
+ l_timeout = false; // default this type did not timeout
+
+ // setup vars specific for type being processed
+ if(l_dimm_types[i] == DATA_FRU_DIMM)
+ {
+ // use control values for DATA_FRU_DIMM type
+ l_throttle_temp = g_amec->thermaldimm.setpoint;
+ l_Pgain = g_amec->thermaldimm.Pgain;
+ l_speed_request = &g_amec->thermaldimm.speed_request;
+ l_total_res = &g_amec->thermaldimm.total_res;
+
+ // Get the highest DIMM temperature in 0.1 degrees C
+ l_sensor = getSensorByGsid(TEMPDIMMTHRM);
+ l_thermal_winner = l_sensor->sample * 10;
+
+ // check for time out
+ if(G_dimm_temp_expired_bitmap.dw[0] || G_dimm_temp_expired_bitmap.dw[1])
+ {
+ if(MEM_TYPE_OCM != G_sysConfigData.mem_type)
+ {
+ // non-OCM can only have DIMM type so timeout must be for DIMM
+ l_timeout = true;
+ }
+ else if(G_ocm_dts_type_expired_bitmap & OCM_DTS_TYPE_DIMM_MASK) // MEM_TYPE_OCM
+ {
+ l_timeout = true;
+ }
+ }
+ } // end if DATA_FRU_DIMM
+ else if(l_dimm_types[i] == DATA_FRU_MEMCTRL_DRAM)
+ {
+ // use control values for DATA_FRU_MEMCTRL_DRAM type
+ l_throttle_temp = g_amec->thermalmcdimm.setpoint;
+ l_Pgain = g_amec->thermalmcdimm.Pgain;
+ l_speed_request = &g_amec->thermalmcdimm.speed_request;
+ l_total_res = &g_amec->thermalmcdimm.total_res;
+
+ // Get the highest Memctrl+DRAM temperature in 0.1 degrees C
+ l_sensor = getSensorByGsid(TEMPMCDIMMTHRM);
+ l_thermal_winner = l_sensor->sample * 10;
+
+ // check if this type timed out
+ if(G_ocm_dts_type_expired_bitmap & OCM_DTS_TYPE_MEMCTRL_DRAM_MASK)
+ {
+ l_timeout = true;
+ }
+ }
+ else if(l_dimm_types[i] == DATA_FRU_PMIC)
+ {
+ // use control values for DATA_FRU_PMIC type
+ l_throttle_temp = g_amec->thermalpmic.setpoint;
+ l_Pgain = g_amec->thermalpmic.Pgain;
+ l_speed_request = &g_amec->thermalpmic.speed_request;
+ l_total_res = &g_amec->thermalpmic.total_res;
+
+ // Get the highest PMIC temperature in 0.1 degrees C
+ l_sensor = getSensorByGsid(TEMPPMICTHRM);
+ l_thermal_winner = l_sensor->sample * 10;
+
+ // check if this type timed out
+ if(G_ocm_dts_type_expired_bitmap & OCM_DTS_TYPE_PMIC_MASK)
+ {
+ l_timeout = true;
+ }
+ }
+ else if(l_dimm_types[i] == DATA_FRU_MEMCTRL_EXT)
+ {
+ // use control values for DATA_FRU_MEMCTRL_EXT type
+ l_throttle_temp = g_amec->thermalmcext.setpoint;
+ l_Pgain = g_amec->thermalmcext.Pgain;
+ l_speed_request = &g_amec->thermalmcext.speed_request;
+ l_total_res = &g_amec->thermalmcext.total_res;
+
+ // Get the highest external mem controller temperature in 0.1 degrees C
+ l_sensor = getSensorByGsid(TEMPMCEXTTHRM);
+ l_thermal_winner = l_sensor->sample * 10;
+
+ // check if this type timed out
+ if(G_ocm_dts_type_expired_bitmap & OCM_DTS_TYPE_MEMCTRL_EXT_MASK)
+ {
+ l_timeout = true;
+ }
+ }
+ else
+ {
+ // should never happen -- code bug
+ TRAC_ERR("amec_controller_dimm_thermal: Invalid DIMM sensor type[0x%02X] at idx[%d]",
+ l_dimm_types[i],
+ i);
+ continue;
+ }
+
+ // start common code for all types to determine throttle level
+ // Adjust the temperature if there was a time out reading this sensor fru type
+ if(l_timeout)
+ {
+ //Assume temperature is at the throttle temp plus 1 degree C.
+ l_thermal_winner = l_throttle_temp + 10;
+ }
+
+ // Check if this type is being used and the temp differs from the throttle point
+ if( (!l_thermal_winner) || (l_throttle_temp == l_thermal_winner) )
+ continue;
+
+ // Calculate the thermal control error
+ l_error = l_throttle_temp - l_thermal_winner;
+
+ // Proportional Controller for the thermal control loop based on memory temperatures
+ l_throttle = (int32_t) l_error * l_Pgain;
+ l_residue = (uint16_t) l_throttle;
+ l_throttle_chg = (int16_t) (l_throttle >> 16);
+
+ if ((int16_t) l_throttle_chg > AMEC_MEMORY_SPEED_CHANGE_LIMIT)
+ {
+ l_throttle_chg = AMEC_MEMORY_SPEED_CHANGE_LIMIT;
+ }
+ else
+ {
+ if ((int16_t) l_throttle_chg < (-AMEC_MEMORY_SPEED_CHANGE_LIMIT))
+ {
+ l_throttle_chg = -AMEC_MEMORY_SPEED_CHANGE_LIMIT;
+ }
+ }
+
+ // Calculate the new thermal speed request
+ l_mem_speed = *l_speed_request +
+ (int16_t) l_throttle_chg * AMEC_MEMORY_STEP_SIZE;
+
+ // Proceed with residue summation to correctly follow set-point
+ l_old_residue = *l_total_res;
+ *l_total_res += l_residue;
+ if (*l_total_res < l_old_residue)
+ {
+ l_mem_speed += AMEC_MEMORY_STEP_SIZE;
+ }
+
+ // Enforce actuator saturation limits
+ if (l_mem_speed > AMEC_MEMORY_MAX_STEP)
+ l_mem_speed = AMEC_MEMORY_MAX_STEP;
+ if (l_mem_speed < AMEC_MEMORY_MIN_STEP)
+ l_mem_speed = AMEC_MEMORY_MIN_STEP;
+
+ // Save the new thermal speed request for this memory sensor type
+ *l_speed_request = (uint16_t) l_mem_speed;
+
+ } // end for loop processing each memory sensor type
+
+ // Done processing all types now determine if any of them are driving throttling
+ // and update the Memory OT Throttle Sensor
+ if( (g_amec->thermaldimm.speed_request < AMEC_MEMORY_MAX_STEP) ||
+ (g_amec->thermalmcdimm.speed_request < AMEC_MEMORY_MAX_STEP) ||
+ (g_amec->thermalpmic.speed_request < AMEC_MEMORY_MAX_STEP) ||
+ (g_amec->thermalmcext.speed_request < AMEC_MEMORY_MAX_STEP) )
{
// Memory speed is less than max indicate throttle due to OT
sensor_update(AMECSENSOR_PTR(MEMOTTHROT), 1);
@@ -340,7 +457,6 @@ void amec_controller_dimm_thermal()
{
sensor_update(AMECSENSOR_PTR(MEMOTTHROT), 0);
}
-
}
diff --git a/src/occ_405/amec/amec_controller.h b/src/occ_405/amec/amec_controller.h
index 97e379c..8808a03 100644
--- a/src/occ_405/amec/amec_controller.h
+++ b/src/occ_405/amec/amec_controller.h
@@ -117,8 +117,8 @@ void amec_controller_membuf_thermal();
* This function implements a Proportional Controller for the
* thermal control loop based on DIMM temperatures. Although it
* doesn't return any results, it populates the thermal vote in
- * the field g_amec->thermaldimm.speed_request.
- *
+ * the fields g_amec->thermaldimm.speed_request, g_amec->thermalmcdimm.speed_request,
+ * g_amec->thermalpmic.speed_request and g_amec->thermalmcext.speed_request
*/
void amec_controller_dimm_thermal();
diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c
index 43ff5b3..6aa6b78 100755
--- a/src/occ_405/amec/amec_data.c
+++ b/src/occ_405/amec/amec_data.c
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2018 */
+/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -268,6 +268,96 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode)
TRAC_INFO("AMEC_data_write_thrm_thresholds: DIMM setpoints - DVFS: %u, Error: %u",
l_dvfs_temp, l_error);
+ // Store the Memctrl+DIMM thermal data (all values come from the DATA_FRU_MEMCTRL_DRAM entry)
+ if (!l_pm_limits)
+ {
+ // use normal thresholds for Nominal or OPAL
+ l_dvfs_temp = l_frudata[DATA_FRU_MEMCTRL_DRAM].dvfs;
+ l_error = l_frudata[DATA_FRU_MEMCTRL_DRAM].error;
+ }
+ else
+ {
+ l_dvfs_temp = l_frudata[DATA_FRU_MEMCTRL_DRAM].pm_dvfs;
+ if(i_mode == OCC_MODE_TURBO)
+ {
+ //Need to log an error if we throttle in static turbo mode (for mfg)
+ l_error = l_dvfs_temp;
+ }
+ else
+ {
+ l_error = l_frudata[DATA_FRU_MEMCTRL_DRAM].pm_error;
+ }
+ }
+ // Store the DVFS thermal setpoint in 0.1 degrees C
+ g_amec->thermalmcdimm.setpoint = l_dvfs_temp * 10;
+ // Store the error temperature for OT detection
+ g_amec->thermalmcdimm.ot_error = l_error;
+ // Store the temperature timeout value for this FRU type (Memctrl+DRAM entry, not the plain DIMM entry)
+ g_amec->thermalmcdimm.temp_timeout = l_frudata[DATA_FRU_MEMCTRL_DRAM].max_read_timeout;
+
+ TRAC_INFO("AMEC_data_write_thrm_thresholds: MC+DIMM setpoints - DVFS: %u, Error: %u",
+ l_dvfs_temp, l_error);
+
+ // Store the PMIC thermal data
+ if (!l_pm_limits)
+ {
+ // use normal thresholds for Nominal or OPAL
+ l_dvfs_temp = l_frudata[DATA_FRU_PMIC].dvfs;
+ l_error = l_frudata[DATA_FRU_PMIC].error;
+ }
+ else
+ {
+ l_dvfs_temp = l_frudata[DATA_FRU_PMIC].pm_dvfs;
+ if(i_mode == OCC_MODE_TURBO)
+ {
+ //Need to log an error if we throttle in static turbo mode (for mfg)
+ l_error = l_dvfs_temp;
+ }
+ else
+ {
+ l_error = l_frudata[DATA_FRU_PMIC].pm_error;
+ }
+ }
+ // Store the DVFS thermal setpoint in 0.1 degrees C
+ g_amec->thermalpmic.setpoint = l_dvfs_temp * 10;
+ // Store the error temperature for OT detection
+ g_amec->thermalpmic.ot_error = l_error;
+ // Store the temperature timeout value
+ g_amec->thermalpmic.temp_timeout = l_frudata[DATA_FRU_PMIC].max_read_timeout;
+
+ TRAC_INFO("AMEC_data_write_thrm_thresholds: PMIC setpoints - DVFS: %u, Error: %u",
+ l_dvfs_temp, l_error);
+
+ // Store the external mem ctrl thermal data
+ if (!l_pm_limits)
+ {
+ // use normal thresholds for Nominal or OPAL
+ l_dvfs_temp = l_frudata[DATA_FRU_MEMCTRL_EXT].dvfs;
+ l_error = l_frudata[DATA_FRU_MEMCTRL_EXT].error;
+ }
+ else
+ {
+ l_dvfs_temp = l_frudata[DATA_FRU_MEMCTRL_EXT].pm_dvfs;
+ if(i_mode == OCC_MODE_TURBO)
+ {
+ //Need to log an error if we throttle in static turbo mode (for mfg)
+ l_error = l_dvfs_temp;
+ }
+ else
+ {
+ l_error = l_frudata[DATA_FRU_MEMCTRL_EXT].pm_error;
+ }
+ }
+ // Store the DVFS thermal setpoint in 0.1 degrees C
+ g_amec->thermalmcext.setpoint = l_dvfs_temp * 10;
+ // Store the error temperature for OT detection
+ g_amec->thermalmcext.ot_error = l_error;
+ // Store the temperature timeout value
+ g_amec->thermalmcext.temp_timeout = l_frudata[DATA_FRU_MEMCTRL_EXT].max_read_timeout;
+
+ TRAC_INFO("AMEC_data_write_thrm_thresholds: External MC setpoints - DVFS: %u, Error: %u",
+ l_dvfs_temp, l_error);
+
// Store the VRM Vdd thermal data
if (!l_pm_limits)
{
diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c
index 4275b68..ea33a3e 100755
--- a/src/occ_405/amec/amec_freq.c
+++ b/src/occ_405/amec/amec_freq.c
@@ -760,6 +760,30 @@ void amec_slv_mem_voting_box(void)
kvm_reason = MEMORY_OVER_TEMP;
}
+ // Check vote from Mem ctrl+DIMM thermal control loop
+ if (l_vote > g_amec->thermalmcdimm.speed_request)
+ {
+ l_vote = g_amec->thermalmcdimm.speed_request;
+ l_reason = AMEC_MEM_VOTING_REASON_MCDIMM;
+ kvm_reason = MEMORY_OVER_TEMP;
+ }
+
+ // Check vote from Pmic thermal control loop
+ if (l_vote > g_amec->thermalpmic.speed_request)
+ {
+ l_vote = g_amec->thermalpmic.speed_request;
+ l_reason = AMEC_MEM_VOTING_REASON_PMIC;
+ kvm_reason = MEMORY_OVER_TEMP;
+ }
+
+ // Check vote from external mem controller thermal control loop
+ if (l_vote > g_amec->thermalmcext.speed_request)
+ {
+ l_vote = g_amec->thermalmcext.speed_request;
+ l_reason = AMEC_MEM_VOTING_REASON_MC_EXT;
+ kvm_reason = MEMORY_OVER_TEMP;
+ }
+
// Check if memory autoslewing is enabled
if (g_amec->mnfg_parms.mem_autoslew)
{
diff --git a/src/occ_405/amec/amec_freq.h b/src/occ_405/amec/amec_freq.h
index 7fe8a26..8df060a 100644
--- a/src/occ_405/amec/amec_freq.h
+++ b/src/occ_405/amec/amec_freq.h
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2015 */
+/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -100,6 +100,9 @@ typedef enum
AMEC_MEM_VOTING_REASON_CENT = 0x01,
AMEC_MEM_VOTING_REASON_DIMM = 0x02,
AMEC_MEM_VOTING_REASON_SLEW = 0x03,
+ AMEC_MEM_VOTING_REASON_MCDIMM = 0x04,
+ AMEC_MEM_VOTING_REASON_PMIC = 0x05,
+ AMEC_MEM_VOTING_REASON_MC_EXT = 0x06,
}amec_mem_voting_reason_t;
// This is memory throttle reason code encoded in OPAL dynamic data
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index 0d99f20..cdb7d6b 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -56,6 +56,9 @@ dimm_sensor_flags_t G_dimm_timeout_logged_bitmap = {{0}};
// Are any dimms currently in the timedout state (bitmap of dimm)?
dimm_sensor_flags_t G_dimm_temp_expired_bitmap = {{0}};
+// Timedout state of OCMB "DIMM" sensors by fru type (bitmap of DTS type)
+uint8_t G_ocm_dts_type_expired_bitmap = 0;
+
// Have we already called out the centaur for timeout (bitmap of centaurs)?
uint16_t G_cent_timeout_logged_bitmap = 0;
@@ -177,10 +180,6 @@ void amec_health_check_dimm_temp()
return;
}
- l_ot_error = g_amec->thermaldimm.ot_error;
- l_sensor = getSensorByGsid(TEMPDIMMTHRM);
- l_max_temp = l_sensor->sample_max;
-
//iterate over all dimms
for(l_port = 0; l_port < l_max_port; l_port++)
{
@@ -211,14 +210,51 @@ void amec_health_check_dimm_temp()
fru_temp_t* l_fru;
l_fru = &g_amec->proc[0].memctl[l_port].centaur.dimm_temps[l_dimm];
+ switch(l_fru->temp_fru_type)
+ {
+ case DATA_FRU_DIMM:
+ l_ot_error = g_amec->thermaldimm.ot_error;
+ l_sensor = getSensorByGsid(TEMPDIMMTHRM);
+ l_max_temp = l_sensor->sample_max;
+ break;
+
+ case DATA_FRU_MEMCTRL_DRAM:
+ l_ot_error = g_amec->thermalmcdimm.ot_error;
+ l_sensor = getSensorByGsid(TEMPMCDIMMTHRM);
+ l_max_temp = l_sensor->sample_max;
+ break;
+
+ case DATA_FRU_PMIC:
+ l_ot_error = g_amec->thermalpmic.ot_error;
+ l_sensor = getSensorByGsid(TEMPPMICTHRM);
+ l_max_temp = l_sensor->sample_max;
+ break;
+
+ case DATA_FRU_MEMCTRL_EXT:
+ l_ot_error = g_amec->thermalmcext.ot_error;
+ l_sensor = getSensorByGsid(TEMPMCEXTTHRM);
+ l_max_temp = l_sensor->sample_max;
+ break;
+
+ default:
+ // this is a code bug: trace it and let the error be logged for debug
+ TRAC_ERR("amec_health_check_dimm_temp: sensor[%04X] marked as OT has invalid type[%d]",
+ (l_port<<8)|l_dimm, l_fru->temp_fru_type);
+ l_ot_error = 0xff;
+ l_max_temp = 0xff;
+ break;
+ }
+ TRAC_ERR("amec_health_check_dimm_temp: sensor[%04X] type[0x%02X] reached error temp[%d] current[%d]",
+ (l_port<<8)|l_dimm, l_fru->temp_fru_type, l_ot_error, l_fru->cur_temp);
+
amec_mem_mark_logged(l_port,
l_dimm,
&G_cent_overtemp_logged_bitmap,
&G_dimm_overtemp_logged_bitmap.bytes[l_port]);
- TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X being called out for overtemp - %dC",
- (l_port<<8)|l_dimm, l_fru->cur_temp);
- // Create single elog with up to MAX_CALLOUTS for this port
+ // Create single elog with up to MAX_CALLOUTS
+ // this will be generic regardless of temperature sensor type, the callouts will be correct
+ // and the traces will point to specific types/thresholds
if(l_callouts_count < ERRL_MAX_CALLOUTS)
{
//If we don't have an error log for the callout, create one
@@ -290,12 +326,15 @@ void amec_health_check_dimm_timeout()
{
static dimm_sensor_flags_t L_temp_update_bitmap_prev = {{0}};
dimm_sensor_flags_t l_need_inc, l_need_clr, l_temp_update_bitmap;
- uint8_t l_dimm, l_port;
+ uint8_t l_dimm, l_port, l_temp_timeout;
fru_temp_t* l_fru;
errlHndl_t l_err = NULL;
uint32_t l_callouts_count = 0;
uint64_t l_huid;
static bool L_ran_once = FALSE;
+ uint8_t l_max_port = 0; // #ports in nimbus/#mem buffs in cumulus/OCM
+ uint8_t l_max_dimm_per_port = 0; // per port in nimbus/per mem buff in cumulus/OCM
+ uint8_t l_ocm_dts_type_expired_bitmap = 0;
do
{
@@ -331,8 +370,6 @@ void amec_health_check_dimm_timeout()
break;
}
- uint8_t l_max_port; // #ports in nimbus/#mem buffs in cumulus/OCM
- uint8_t l_max_dimm_per_port; // per port in nimbus/per mem buff in cumulus/OCM
if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS)
{
l_max_port = NUM_DIMM_PORTS;
@@ -400,17 +437,42 @@ void amec_health_check_dimm_timeout()
}
//check if the temperature reading is still useable
- if(g_amec->thermaldimm.temp_timeout == 0xff ||
- l_fru->sample_age < g_amec->thermaldimm.temp_timeout)
+ if(l_fru->temp_fru_type == DATA_FRU_DIMM)
+ {
+ l_temp_timeout = g_amec->thermaldimm.temp_timeout;
+ }
+
+ else if(l_fru->temp_fru_type == DATA_FRU_MEMCTRL_DRAM)
+ {
+ l_temp_timeout = g_amec->thermalmcdimm.temp_timeout;
+ }
+
+ else if(l_fru->temp_fru_type == DATA_FRU_PMIC)
+ {
+ l_temp_timeout = g_amec->thermalpmic.temp_timeout;
+ }
+
+ else if(l_fru->temp_fru_type == DATA_FRU_MEMCTRL_EXT)
+ {
+ l_temp_timeout = g_amec->thermalmcext.temp_timeout;
+ }
+
+ else // invalid type or not used, ignore
+ l_temp_timeout = 0xff;
+
+ if(l_temp_timeout == 0xff ||
+ l_fru->sample_age < l_temp_timeout)
{
continue;
}
- //temperature has expired. Notify control algorithms which DIMM
+ //temperature has expired. Notify control algorithms which DIMM DTS and type
if(!(G_dimm_temp_expired_bitmap.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm)))
{
G_dimm_temp_expired_bitmap.bytes[l_port] |= (DIMM_SENSOR0 >> l_dimm);
- TRAC_ERR("Timed out reading DIMM%04X temperature sensor", (l_port<<8)|l_dimm);
+ TRAC_ERR("Timed out reading DIMM%04X temperature sensor type[0x%02X]",
+ (l_port<<8)|l_dimm,
+ l_fru->temp_fru_type);
}
//If we've already logged an error for this FRU go to the next one.
@@ -421,7 +483,7 @@ void amec_health_check_dimm_timeout()
// To prevent DIMMs from incorrectly being called out, don't log errors if there have
// been timeouts with GPE1 tasks not finishing
- if(G_error_history[ERRH_GPE1_NOT_IDLE] > g_amec->thermaldimm.temp_timeout)
+ if(G_error_history[ERRH_GPE1_NOT_IDLE] > l_temp_timeout)
{
TRAC_ERR("Timed out reading DIMM temperature due to GPE1 issues");
// give notification that GPE1 error should now be logged which will reset the OCC
@@ -460,7 +522,7 @@ void amec_health_check_dimm_timeout()
ERRL_SEV_PREDICTIVE, //Severity
NULL, //Trace Buf
DEFAULT_TRACE_SIZE, //Trace Size
- g_amec->thermaldimm.temp_timeout, //userdata1
+ l_temp_timeout, //userdata1
0); //userdata2
}
@@ -527,6 +589,38 @@ void amec_health_check_dimm_timeout()
}//iterate over all dimms
}//iterate over all centaurs/ports
}while(0);
+
+ // For OCM the "DIMM" dts are used for different types. Need to determine what type the
+ // "DIMM" DTS readings are for so the control loop will handle timeout based on correct type
+ if(MEM_TYPE_OCM == G_sysConfigData.mem_type)
+ {
+ if(G_dimm_temp_expired_bitmap.dw[0] || G_dimm_temp_expired_bitmap.dw[1])
+ {
+ // at least one sensor expired. Set type for each expired sensor
+ //iterate across all OCMBs
+ for(l_port = 0; l_port < l_max_port; l_port++)
+ {
+ //iterate over all "dimm" DTS readings
+ for(l_dimm = 0; l_dimm < l_max_dimm_per_port; l_dimm++)
+ {
+ if(G_dimm_temp_expired_bitmap.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm))
+ {
+ // found an expired sensor
+ l_ocm_dts_type_expired_bitmap |= g_amec->proc[0].memctl[l_port].centaur.dimm_temps[l_dimm].dts_type_mask;
+ }
+ }//iterate over all dimms
+ }//iterate over all OCMBs
+ } // if temp expired
+
+ // check if there is a change to any type expired
+ if(G_ocm_dts_type_expired_bitmap != l_ocm_dts_type_expired_bitmap)
+ {
+ TRAC_INFO("DIMM DTS type expired bitmap changed from[0x%04X] to[0x%04X]",
+ G_ocm_dts_type_expired_bitmap, l_ocm_dts_type_expired_bitmap);
+ G_ocm_dts_type_expired_bitmap = l_ocm_dts_type_expired_bitmap;
+ }
+ } // if mem type OCM
+
L_ran_once = TRUE;
} // end amec_health_check_dimm_timeout()
diff --git a/src/occ_405/amec/amec_init.c b/src/occ_405/amec/amec_init.c
index 3d3faa6..6b51b1d 100644
--- a/src/occ_405/amec/amec_init.c
+++ b/src/occ_405/amec/amec_init.c
@@ -267,6 +267,21 @@ void amec_init_gamec_struct(void)
g_amec->thermalcent.Pgain = 30000;
g_amec->thermalcent.speed_request = AMEC_MEMORY_MAX_STEP;
+ // Initialize thermal controller based on temperature sensor covering both mem ctrl and DIMM
+ g_amec->thermalmcdimm.setpoint = 850; //In 0.1 degrees C -> 850 = 85.0 C
+ g_amec->thermalmcdimm.Pgain = 30000;
+ g_amec->thermalmcdimm.speed_request = AMEC_MEMORY_MAX_STEP;
+
+ // Initialize thermal controller based on PMIC temperatures
+ g_amec->thermalpmic.setpoint = 850; //In 0.1 degrees C -> 850 = 85.0 C
+ g_amec->thermalpmic.Pgain = 30000;
+ g_amec->thermalpmic.speed_request = AMEC_MEMORY_MAX_STEP;
+
+ // Initialize thermal controller based on external mem controller temperatures
+ g_amec->thermalmcext.setpoint = 850; //In 0.1 degrees C -> 850 = 85.0 C
+ g_amec->thermalmcext.Pgain = 30000;
+ g_amec->thermalmcext.speed_request = AMEC_MEMORY_MAX_STEP;
+
// Initialize component power caps
g_amec->pcap.active_proc_pcap = 0;
g_amec->pcap.active_mem_level = 0;
diff --git a/src/occ_405/amec/amec_sensors_ocmb.c b/src/occ_405/amec/amec_sensors_ocmb.c
index a2cd4a6..8ffbefa 100644
--- a/src/occ_405/amec/amec_sensors_ocmb.c
+++ b/src/occ_405/amec/amec_sensors_ocmb.c
@@ -110,21 +110,21 @@ void amec_update_ocmb_sensors(uint8_t i_membuf)
// End Function Specification
void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_membuf)
{
+// confirmed ok to use same values for all types (internal mc, dimm, external mc, pmic...)
#define MIN_VALID_DIMM_TEMP 1
#define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek 04/23/2019
-#define MAX_MEM_TEMP_CHANGE 2
+#define MAX_MEM_TEMP_CHANGE 4
- uint32_t k, l_hottest_dimm_temp;
+ uint32_t k;
uint16_t l_dts[NUM_DIMMS_PER_OCMB] = {0};
- uint32_t l_hottest_dimm_loc = NUM_DIMMS_PER_OCMB;
int32_t l_dimm_temp, l_prev_temp;
static uint8_t L_ran_once[MAX_NUM_OCMBS] = {FALSE};
- static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}};
- // Harvest thermal data for all dimms
+ // Harvest thermal data for memory thermal sensors that are enabled and being used
for(k=0; k < NUM_DIMMS_PER_OCMB; k++)
{
- if(!CENTAUR_SENSOR_ENABLED(i_membuf, k))
+ if( (!CENTAUR_SENSOR_ENABLED(i_membuf, k)) ||
+ (g_amec->proc[0].memctl[i_membuf].centaur.dimm_temps[k].temp_fru_type == DATA_FRU_NOT_USED) )
{
continue;
}
@@ -250,48 +250,17 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m
}
}
- //Check if at or above the error temperature
- if(l_dts[k] >= g_amec->thermaldimm.ot_error)
- {
- //Set a bit so that this dimm can be called out by the thermal thread
- G_dimm_overtemp_bitmap.bytes[i_membuf] |= (DIMM_SENSOR0 >> k);
- // trace first time OT per DIMM
- if( !L_ot_traced[i_membuf][k] )
- {
- TRAC_ERR("amec_update_ocmb_dimm_dts_sensors: Mem Buf[%d] DIMM[%d] reached error temp[%d]. current[%d]",
- i_membuf,
- k,
- g_amec->thermaldimm.ot_error,
- l_dts[k]);
- L_ot_traced[i_membuf][k] = true;
- }
- }
+ //Check for over temperature must be done by type and will be checked
+ // in amec_update_ocmb_temp_sensors() which happens after all OCMBs have been read
}
- // Find hottest temperature from all DIMMs for this centaur
- for(l_hottest_dimm_temp = 0, k = 0; k < NUM_DIMMS_PER_OCMB; k++)
+ // update the current temperatures
+ for(k = 0; k < NUM_DIMMS_PER_OCMB; k++)
{
- if(l_dts[k] > l_hottest_dimm_temp)
- {
- l_hottest_dimm_temp = l_dts[k];
- l_hottest_dimm_loc = k;
- }
g_amec->proc[0].memctl[i_membuf].centaur.dimm_temps[k].cur_temp = l_dts[k];
}
- amec_centaur_t* l_centaur_ptr = &g_amec->proc[0].memctl[i_membuf].centaur;
-
- //only update location if hottest dimm temp is greater than previous maximum
- if(l_hottest_dimm_temp > l_centaur_ptr->tempdimmax.sample_max)
- {
- sensor_update(&l_centaur_ptr->locdimmax, l_hottest_dimm_loc);
- }
-
- //update the max dimm temperature sensor for this centaur
- sensor_update(&l_centaur_ptr->tempdimmax, l_hottest_dimm_temp);
-
L_ran_once[i_membuf] = TRUE;
- AMEC_DBG("Centaur[%d]: HotDimm=%d\n",i_membuf,l_hottest_dimm_temp);
}
// Function Specification
@@ -317,129 +286,223 @@ void amec_update_ocmb_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_membuf
fru_temp_t* l_fru = &g_amec->proc[0].memctl[i_membuf].centaur.centaur_hottest;
- l_prev_temp = l_fru->cur_temp;
- if(!l_prev_temp)
+ // Internal DTS sensor is either for internal memctrl or not being used
+ // ignore the internal sensor if it isn't marked for internal memctrl
+ if(l_fru->temp_fru_type == DATA_FRU_CENTAUR)
{
- l_prev_temp = l_sens_temp;
- }
-
- //Check DTS status bits
- if(i_sensor_cache->status.fields.ubdts0_valid &&
- (!i_sensor_cache->status.fields.ubdts0_err))
- {
- //make sure temperature is within a 'reasonable' range.
- if(l_sens_temp < MIN_VALID_MEMBUF_TEMP ||
- l_sens_temp > MAX_VALID_MEMBUF_TEMP)
- {
- //set a flag so that if we end up logging an error we have something to debug why
- l_fru->flags |= FRU_TEMP_OUT_OF_RANGE;
- l_dts = l_prev_temp;
- }
- else
- {
- //don't allow temp to change more than is reasonable since last read
- if(l_sens_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE))
- {
- l_dts = l_prev_temp + MAX_MEM_TEMP_CHANGE;
- if(!l_fru->flags)
- {
- TRAC_INFO("membuf temp rose faster than reasonable: membuf[%d] prev[%d] cur[%d]",
- i_membuf, l_prev_temp, l_sens_temp);
- l_fru->flags |= FRU_TEMP_FAST_CHANGE;
- }
- }
- else if (l_sens_temp < (l_prev_temp - MAX_MEM_TEMP_CHANGE))
- {
- l_dts = l_prev_temp - MAX_MEM_TEMP_CHANGE;
- if(!l_fru->flags)
- {
- TRAC_INFO("membuf temp fell faster than reasonable: cent[%d] prev[%d] cur[%d]",
- i_membuf, l_prev_temp, l_sens_temp);
- l_fru->flags |= FRU_TEMP_FAST_CHANGE;
- }
- }
- else //reasonable amount of change occurred
- {
- l_dts = l_sens_temp;
- l_fru->flags &= ~FRU_TEMP_FAST_CHANGE;
- }
-
- //Notify thermal thread that temperature has been updated
- G_cent_temp_updated_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
-
- //clear error flags
- l_fru->flags &= FRU_TEMP_FAST_CHANGE;
- }
+ l_prev_temp = l_fru->cur_temp;
+ if(!l_prev_temp)
+ {
+ l_prev_temp = l_sens_temp;
+ }
+
+ //Check DTS status bits
+ if(i_sensor_cache->status.fields.ubdts0_valid &&
+ (!i_sensor_cache->status.fields.ubdts0_err))
+ {
+ //make sure temperature is within a 'reasonable' range.
+ if(l_sens_temp < MIN_VALID_MEMBUF_TEMP ||
+ l_sens_temp > MAX_VALID_MEMBUF_TEMP)
+ {
+ //set a flag so that if we end up logging an error we have something to debug why
+ l_fru->flags |= FRU_TEMP_OUT_OF_RANGE;
+ l_dts = l_prev_temp;
+ }
+ else
+ {
+ //don't allow temp to change more than is reasonable since last read
+ if(l_sens_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE))
+ {
+ l_dts = l_prev_temp + MAX_MEM_TEMP_CHANGE;
+ if(!l_fru->flags)
+ {
+ TRAC_INFO("membuf temp rose faster than reasonable: membuf[%d] prev[%d] cur[%d]",
+ i_membuf, l_prev_temp, l_sens_temp);
+ l_fru->flags |= FRU_TEMP_FAST_CHANGE;
+ }
+ }
+ else if (l_sens_temp < (l_prev_temp - MAX_MEM_TEMP_CHANGE))
+ {
+ l_dts = l_prev_temp - MAX_MEM_TEMP_CHANGE;
+ if(!l_fru->flags)
+ {
+ TRAC_INFO("membuf temp fell faster than reasonable: cent[%d] prev[%d] cur[%d]",
+ i_membuf, l_prev_temp, l_sens_temp);
+ l_fru->flags |= FRU_TEMP_FAST_CHANGE;
+ }
+ }
+ else //reasonable amount of change occurred
+ {
+ l_dts = l_sens_temp;
+ l_fru->flags &= ~FRU_TEMP_FAST_CHANGE;
+ }
+
+ //Notify thermal thread that temperature has been updated
+ G_cent_temp_updated_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
+
+ //clear error flags
+ l_fru->flags &= FRU_TEMP_FAST_CHANGE;
+ }
+ }
+ else //status was INVALID
+ {
+ if(L_ran_once[i_membuf])
+ {
+ //Trace the error if we haven't traced it already for this sensor
+ if(!(l_fru->flags & FRU_SENSOR_STATUS_INVALID) &&
+ i_sensor_cache->status.fields.ubdts0_err)
+ {
+ TRAC_ERR("Membuf %d temp sensor error.", i_membuf);
+ }
+
+ l_fru->flags |= FRU_SENSOR_STATUS_INVALID;
+ }
+
+ //use last temperature
+ l_dts = l_prev_temp;
+ }
+
+ L_ran_once[i_membuf] = TRUE;
+
+ //Check if at or above the error temperature
+ if(l_dts >= g_amec->thermalcent.ot_error)
+ {
+ //Set a bit so that this dimm can be called out by the thermal thread
+ G_cent_overtemp_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
+ }
+
+ // Update Interim Data - later this will get picked up to form centaur sensor
+ l_fru->cur_temp = l_dts;
+
+ AMEC_DBG("Membuf[%d]: HotMembuf=%d\n",i_membuf,l_dts);
}
- else //status was INVALID
+ else // internal sensor not being used
{
- if(L_ran_once[i_membuf])
- {
- //Trace the error if we haven't traced it already for this sensor
- if(!(l_fru->flags & FRU_SENSOR_STATUS_INVALID) &&
- i_sensor_cache->status.fields.ubdts0_err)
- {
- TRAC_ERR("Membuf %d temp sensor error.", i_membuf);
- }
+ // make sure temperature is 0 indicating not present
+ l_fru->cur_temp = 0;
- l_fru->flags |= FRU_SENSOR_STATUS_INVALID;
- }
+ //Notify thermal thread that temperature has been updated so no timeout error is logged
+ G_cent_temp_updated_bitmap |= CENTAUR0_PRESENT_MASK >> i_membuf;
- //use last temperature
- l_dts = l_prev_temp;
+ //clear error flags
+ l_fru->flags = 0;
}
-
- L_ran_once[i_membuf] = TRUE;
-
- //Check if at or above the error temperature
- if(l_dts >= g_amec->thermalcent.ot_error)
- {
- //Set a bit so that this dimm can be called out by the thermal thread
- G_cent_overtemp_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
- }
-
- // Update Interim Data - later this will get picked up to form centaur sensor
- g_amec->proc[0].memctl[i_membuf].centaur.centaur_hottest.cur_temp = l_dts;
-
- AMEC_DBG("Membuf[%d]: HotMembuf=%d\n",i_membuf,l_dts);
}
// Function Specification
//
// Name: amec_update_ocmb_temp_sensors
//
-// Description: Updates thermal sensors that have data grabbed by the centaur.
+// Description: Updates the summary thermal sensors with the hottest temperature (across all OCMBs)
+// for each memory FRU type and flags any DTS sensor at or above its error temperature
//
// Thread: RealTime Loop
//
// End Function Specification
void amec_update_ocmb_temp_sensors(void)
{
- uint32_t k;
+ uint32_t k, l_dimm;
uint32_t l_hot_dimm = 0;
uint32_t l_hot_mb = 0;
+ uint32_t l_hot_mb_dimm = 0;
+ uint32_t l_hot_pmic = 0;
+ uint32_t l_hot_ext_mb = 0;
+ uint8_t l_ot_error = 0;
+ uint8_t l_cur_temp = 0;
+ uint8_t l_fru_type = DATA_FRU_NOT_USED;
+ static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}};
- // -----------------------------------------------------------
- // Find hottest temperature from all membufs for this Proc chip
- // Find hottest temperature from all DIMMs for this Proc chip
- // -----------------------------------------------------------
for(k=0; k < MAX_NUM_OCMBS; k++)
{
- if(g_amec->proc[0].memctl[k].centaur.centaur_hottest.cur_temp > l_hot_mb)
+ // Find hottest temperature from all internal membufs for this Proc chip
+ // make sure the type is "CENTAUR" i.e. internal memory controller temp
+ if( (g_amec->proc[0].memctl[k].centaur.centaur_hottest.temp_fru_type == DATA_FRU_CENTAUR) &&
+ (g_amec->proc[0].memctl[k].centaur.centaur_hottest.cur_temp > l_hot_mb) )
{
l_hot_mb = g_amec->proc[0].memctl[k].centaur.centaur_hottest.cur_temp;
}
- if(g_amec->proc[0].memctl[k].centaur.tempdimmax.sample > l_hot_dimm)
+
+ // process each of the thermal sensors (stored as "dimm" temps)
+ // based on what type they are for and finding the hottest for each type
+ for(l_dimm=0; l_dimm < NUM_DIMMS_PER_OCMB; l_dimm++)
{
- l_hot_dimm = g_amec->proc[0].memctl[k].centaur.tempdimmax.sample;
- }
- }
+ l_fru_type = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].temp_fru_type;
+ l_cur_temp = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+
+ switch(l_fru_type)
+ {
+ case DATA_FRU_DIMM:
+ l_ot_error = g_amec->thermaldimm.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_dimm)
+ {
+ l_hot_dimm = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_MEMCTRL_DRAM:
+ l_ot_error = g_amec->thermalmcdimm.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_mb_dimm)
+ {
+ l_hot_mb_dimm = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_PMIC:
+ l_ot_error = g_amec->thermalpmic.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_pmic)
+ {
+ l_hot_pmic = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_MEMCTRL_EXT:
+ l_ot_error = g_amec->thermalmcext.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_ext_mb)
+ {
+ l_hot_ext_mb = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_NOT_USED:
+ default:
+ // ignore reading
+ l_ot_error = 0;
+ break;
+ } // end switch fru type
+
+ // check if this "DIMM" sensor is over its error temperature
+ if( l_ot_error && (l_cur_temp >= l_ot_error) )
+ {
+ //Set a bit so that this sensor can be called out by the thermal thread
+ G_dimm_overtemp_bitmap.bytes[k] |= (DIMM_SENSOR0 >> l_dimm);
+ // trace first time OT per DIMM DTS sensor
+ if( !L_ot_traced[k][l_dimm] )
+ {
+ TRAC_ERR("amec_update_ocmb_temp_sensors: OCMB[%d] DTS[%d] type[0x%02X] reached error temp[%d]. current[%d]",
+ k,
+ l_dimm,
+ l_fru_type,
+ l_ot_error,
+ l_cur_temp);
+ L_ot_traced[k][l_dimm] = true;
+ }
+ }
+ } // end for each "dimm" thermal sensor
+ } // end for each OCMB
+
sensor_update(&g_amec->proc[0].tempcent,l_hot_mb);
AMEC_DBG("HotMembuf=%d\n",l_hot_mb);
sensor_update(&g_amec->proc[0].tempdimmthrm,l_hot_dimm);
AMEC_DBG("HotDimm=%d\n",l_hot_dimm);
+ sensor_update(&g_amec->proc[0].tempmcdimmthrm,l_hot_mb_dimm);
+ AMEC_DBG("HotMCDimm=%d\n",l_hot_mb_dimm);
+
+ sensor_update(&g_amec->proc[0].temppmicthrm,l_hot_pmic);
+ AMEC_DBG("HotPmic=%d\n",l_hot_pmic);
+
+ sensor_update(&g_amec->proc[0].tempmcextthrm,l_hot_ext_mb);
+ AMEC_DBG("HotExternalMembuf=%d\n",l_hot_ext_mb);
}
@@ -510,24 +573,27 @@ void amec_perfcount_ocmb_getmc( OcmbMemData * i_sensor_cache,
g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memread2ms = tempreg;
- // Go after second MC performance counter (power ups and activations)
- tempu = l_sensor_cache->mba_act;
- templ = l_sensor_cache->mba_powerups;
-
- // ------------------------------------------------------------
- // Sensor: MRDMx (0.01 Mrps) Memory read requests per sec
- // ------------------------------------------------------------
- tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memread2ms;
- tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memread2ms;
- sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mrd)), tempreg);
-
- // -------------------------------------------------------------
- // Sensor: MWRMx (0.01 Mrps) Memory write requests per sec
- // -------------------------------------------------------------
- tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memwrite2ms;
- tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memwrite2ms;
- sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mwr)), tempreg);
-
+ // Due to limited SRAM, sensors are only supported for the first 12 memory buffers
+ if(i_membuf < 12)
+ {
+ // Go after second MC performance counter (power ups and activations)
+ tempu = l_sensor_cache->mba_act;
+ templ = l_sensor_cache->mba_powerups;
+
+ // ------------------------------------------------------------
+ // Sensor: MRDMx (0.01 Mrps) Memory read requests per sec
+ // ------------------------------------------------------------
+ tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memread2ms;
+ tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memread2ms;
+ sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mrd)), tempreg);
+
+ // -------------------------------------------------------------
+ // Sensor: MWRMx (0.01 Mrps) Memory write requests per sec
+ // -------------------------------------------------------------
+ tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memwrite2ms;
+ tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memwrite2ms;
+ sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mwr)), tempreg);
+ }
return;
}
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index c19cd53..f17b42b 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -146,6 +146,7 @@ typedef struct
amec_cent_mem_speed_t last_mem_speed_sent;
} amec_portpair_t;
+// bit masks for fru_temp_t flags
#define FRU_SENSOR_STATUS_STALLED 0x01
#define FRU_SENSOR_STATUS_ERROR 0x02
#define FRU_SENSOR_STATUS_VALID_OLD 0x04
@@ -154,6 +155,12 @@ typedef struct
#define FRU_TEMP_FAST_CHANGE 0x20
#define FRU_SENSOR_CENT_NEST_FIR6 0x40 //centaur only
+// OpenCAPI memory only bit masks for fru_temp_t dts_type_mask
+#define OCM_DTS_TYPE_DIMM_MASK 0x01
+#define OCM_DTS_TYPE_MEMCTRL_DRAM_MASK 0x02
+#define OCM_DTS_TYPE_PMIC_MASK 0x04
+#define OCM_DTS_TYPE_MEMCTRL_EXT_MASK 0x08
+
typedef struct
{
uint8_t cur_temp;
@@ -161,6 +168,10 @@ typedef struct
uint8_t flags;
// Sensor ID for reporting temperature to BMC and FSP
uint32_t temp_sid;
+ // Indicates what eConfigDataFruType this temperature is for
+ uint8_t temp_fru_type;
+ // OpenCAPI memory only: OCM_DTS_TYPE_*_MASK bit(s) indicating which DTS type this temperature is for
+ uint8_t dts_type_mask;
}fru_temp_t;
typedef struct
@@ -191,7 +202,6 @@ typedef struct
// Sensor ID for reporting temperature to BMC and FSP
uint32_t temp_sid;
-
} amec_centaur_t;
typedef struct
@@ -469,8 +479,11 @@ typedef struct
vectorSensor_t util_vector;
// Memory Summary Sensors
- sensor_t tempcent;
- sensor_t tempdimmthrm;
+ sensor_t tempcent; // hottest of all DATA_FRU_CENTAUR monitored by this OCC
+ sensor_t tempdimmthrm; // hottest of all DATA_FRU_DIMM monitored by this OCC
+ sensor_t tempmcdimmthrm; // hottest of all DATA_FRU_MEMCTRL_DRAM monitored by this OCC
+ sensor_t temppmicthrm; // hottest of all DATA_FRU_PMIC monitored by this OCC
+ sensor_t tempmcextthrm; // hottest of all DATA_FRU_MEMCTRL_EXT monitored by this OCC
sensor_t mempwrthrot;
sensor_t memotthrot;
@@ -628,10 +641,16 @@ typedef struct
//---------------------------------------------------------
// Thermal Controller based on processor temperatures
amec_controller_t thermalproc;
- // Thermal Controller based on Centaur temperatures
+ // Thermal Controller based on Centaur (internal mc) temperatures
amec_controller_t thermalcent;
// Thermal Controller based on DIMM temperatures
amec_controller_t thermaldimm;
+ // Thermal Controller based on temperature sensors covering both Memctrl+DIMM
+ amec_controller_t thermalmcdimm;
+ // Thermal Controller based on PMIC temperatures
+ amec_controller_t thermalpmic;
+ // Thermal Controller based on external mem controller temperatures
+ amec_controller_t thermalmcext;
// Thermal Controller based on VRM Vdd temperatures
amec_controller_t thermalvdd;
OpenPOWER on IntegriCloud