summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rwxr-xr-x src/occ_405/amec/amec_analytics.c 4
-rw-r--r-- src/occ_405/amec/amec_controller.c 4
-rwxr-xr-x src/occ_405/amec/amec_health.c 9
-rw-r--r-- src/occ_405/amec/amec_sensors_centaur.c 2
-rwxr-xr-x src/occ_405/amec/amec_sys.h 2
-rwxr-xr-x src/occ_405/dcom/dcom.h 2
-rwxr-xr-x src/occ_405/dimm/dimm.c 120
-rwxr-xr-x src/occ_405/sensor/sensor_enum.h 2
-rwxr-xr-x src/occ_405/sensor/sensor_info.c 2
-rwxr-xr-x src/occ_405/sensor/sensor_table.c 4
-rwxr-xr-x src/occ_405/thread/chom.c 4
11 files changed, 121 insertions, 34 deletions
diff --git a/src/occ_405/amec/amec_analytics.c b/src/occ_405/amec/amec_analytics.c
index 29e8fe9..d7dd1f8 100755
--- a/src/occ_405/amec/amec_analytics.c
+++ b/src/occ_405/amec/amec_analytics.c
@@ -465,7 +465,7 @@ void amec_analytics_main(void)
break;
case 4:
- tempreg = (g_amec->proc[0].temp2msdimm.sample) << 8; // upper byte
+ tempreg = (g_amec->proc[0].temp16msdimm.sample) << 8; // upper byte
break;
case 5:
@@ -473,7 +473,7 @@ void amec_analytics_main(void)
break;
case 6:
- // tempreg=(g_amec->proc[2].temp2msdimm.sample)<<8; // upper byte
+ // tempreg=(g_amec->proc[2].temp16msdimm.sample)<<8; // upper byte
tempreg = 0;
break;
diff --git a/src/occ_405/amec/amec_controller.c b/src/occ_405/amec/amec_controller.c
index fe27587..3a91087 100644
--- a/src/occ_405/amec/amec_controller.c
+++ b/src/occ_405/amec/amec_controller.c
@@ -176,8 +176,8 @@ void amec_controller_dimm_thermal()
/*------------------------------------------------------------------------*/
/* Code */
/*------------------------------------------------------------------------*/
- // Get TEMP2MSDIMM sensor value
- l_sensor = getSensorByGsid(TEMP2MSDIMM);
+ // Get TEMP16MSDIMM sensor value
+ l_sensor = getSensorByGsid(TEMP16MSDIMM);
if(G_dimm_temp_expired_bitmap.bigword)
{
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index a559f04..7eaa84e 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -86,13 +86,12 @@ uint64_t amec_mem_get_huid(uint8_t i_cent, uint8_t i_dimm)
{
//we're being asked for a dimm huid
l_huid = G_sysConfigData.dimm_huids[i_cent][i_dimm];
- if(!l_huid)
+ if((l_huid == 0) && (MEM_TYPE_CUMULUS == G_sysConfigData.mem_type))
{
- //if we don't have a valid dimm huid, use the
- //centaur huid.
- //TODO: this will not work for ISDIMMS.
+ //if we don't have a valid dimm huid, use the centaur huid.
l_huid = G_sysConfigData.centaur_huids[i_cent];
}
+ // else NIMBUS huid of 0 indicates not present (should never get called)
}
return l_huid;
}
@@ -142,7 +141,7 @@ void amec_health_check_dimm_temp()
}
l_ot_error = g_amec->thermaldimm.ot_error;
- l_sensor = getSensorByGsid(TEMP2MSDIMM);
+ l_sensor = getSensorByGsid(TEMP16MSDIMM);
l_cur_temp = l_sensor->sample;
l_max_temp = l_sensor->sample_max;
TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. cur_max[%d], hist_max[%d]",
diff --git a/src/occ_405/amec/amec_sensors_centaur.c b/src/occ_405/amec/amec_sensors_centaur.c
index 0eb7187..7be2ef8 100644
--- a/src/occ_405/amec/amec_sensors_centaur.c
+++ b/src/occ_405/amec/amec_sensors_centaur.c
@@ -414,7 +414,7 @@ void amec_update_centaur_temp_sensors(void)
l_hot = g_amec->proc[0].memctl[k].centaur.tempdimmax.sample;
}
}
- sensor_update(&g_amec->proc[0].temp2msdimm,l_hot);
+ sensor_update(&g_amec->proc[0].temp16msdimm,l_hot);
AMEC_DBG("HotDimm=%d\n",l_hot);
}
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index 9e920cb..f405fa7 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -463,7 +463,7 @@ typedef struct
// Memory Summary Sensors
sensor_t temp2mscent;
- sensor_t temp2msdimm;
+ sensor_t temp16msdimm;
sensor_t memsp2ms_tls;
// Calculations & Interim Data
diff --git a/src/occ_405/dcom/dcom.h b/src/occ_405/dcom/dcom.h
index 1ff263d..954d03b 100755
--- a/src/occ_405/dcom/dcom.h
+++ b/src/occ_405/dcom/dcom.h
@@ -213,7 +213,7 @@ typedef struct __attribute__ ((packed))
uint16_t pwrpx250usp0cy[MAX_CORES]; // [260]
uint16_t todclock[NUM_TOD_SENSORS]; // [308]
uint16_t temp2mscent; // [314]
- uint16_t temp2msdimm; // [316]
+ uint16_t temp16msdimm; // [316]
uint16_t util2msp0; // [318]
uint16_t ips2msp0; // [320]
uint16_t nutil3sp0cy[MAX_CORES]; // [322]
diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c
index fd308a9..9453514 100755
--- a/src/occ_405/dimm/dimm.c
+++ b/src/occ_405/dimm/dimm.c
@@ -42,6 +42,7 @@
#include "amec_sys.h"
#include "lock.h"
#include "common.h"
+#include "centaur_data.h"
extern bool G_mem_monitoring_allowed;
extern task_t G_task_table[TASK_END];
@@ -53,12 +54,14 @@ bool G_dimm_i2c_reset_required = false;
uint32_t G_dimm_i2c_reset_cause = 0;
#define MAX_CONSECUTIVE_DIMM_RESETS 1
+// On Nimbus, we are using the centaur number as the I2C port (keep same structure)
+// There can be 8 DIMMs under a Centaur and 8 DIMMs per I2C port (max of 2 ports)
+#define NUM_DIMMS_PER_I2CPORT NUM_DIMMS_PER_CENTAUR
typedef struct {
bool disabled;
uint8_t errorCount;
- uint64_t lastReading;
} dimmData_t;
-dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_CENTAUR] = {{{false,0}}};
+dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{{false,0}}};
// If still no i2c interrupt after MAX_TICK_COUNT_WAIT, then try next operation anyway
#define MAX_TICK_COUNT_WAIT 2
@@ -171,7 +174,7 @@ void memory_init()
// Initialization was successful. Set task flags to allow memory
// tasks to run and also prevent from doing initialization again.
G_task_table[mem_task].flags = MEMORY_DATA_RTL_FLAGS;
- //G_task_table[TASK_ID_CENTAUR_CONTROL].flags = MEMORY_CONTROL_RTL_FLAGS;
+ //G_task_table[mem_task].flags = MEMORY_CONTROL_RTL_FLAGS;
}
}
}
@@ -202,7 +205,7 @@ void update_hottest_dimm()
int pIndex, dIndex;
for (pIndex = 0; pIndex < G_maxDimmPorts; ++pIndex)
{
- for (dIndex = 0; dIndex < NUM_DIMMS_PER_CENTAUR; ++dIndex)
+ for (dIndex = 0; dIndex < NUM_DIMMS_PER_I2CPORT; ++dIndex)
{
if (g_amec->proc[0].memctl[pIndex].centaur.dimm_temps[dIndex].cur_temp > hottest)
{
@@ -215,20 +218,24 @@ void update_hottest_dimm()
DIMM_DBG("update_hottest_dimm: hottest DIMM temp for this sample: %dC (loc=%d)", hottest, hottest_loc);
if(hottest > g_amec->proc[0].memctl[0].centaur.tempdimmax.sample_max)
{
- // Save hottest DIMM location ever sampled
+ // Save hottest DIMM location ever sampled. There is no location for the temp16msdimm
+ // sensor, so just store it in memctl[0] location.
DIMM_DBG("update_hottest_dimm: Hottest DIMM ever sampled was DIMM%d %dC (prior %dC)",
hottest_loc, hottest, g_amec->proc[0].memctl[0].centaur.tempdimmax.sample_max);
+ // Store the hottest DIMM location in locdimmax sensor
sensor_update(&g_amec->proc[0].memctl[0].centaur.locdimmax, hottest_loc);
}
- // Nimbus has no Centaurs, but store hottest temp in memctl[0]
+ // Store the hottest DIMM temp in tempdimmax sensor
sensor_update(&g_amec->proc[0].memctl[0].centaur.tempdimmax, hottest);
+ // Store the hottest DIMM temp in temp16msdimm sensor
+ sensor_update(&g_amec->proc[0].temp16msdimm, hottest);
}
// Update current I2C port/DIMM index to next potential DIMM
void use_next_dimm(uint8_t * i_port, uint8_t * i_dimm)
{
- if (++*i_dimm == NUM_DIMMS_PER_CENTAUR)
+ if (++*i_dimm == NUM_DIMMS_PER_I2CPORT)
{
// Finished all DIMMs for current port, switch to new port
*i_port = 1 - *i_port;
@@ -257,6 +264,8 @@ void mark_dimm_failed()
WORD_LOW(G_dimm_sm_args.error.ffdc),
G_dimm_sm_request.request.completion_state);
+ g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].flags |= FRU_SENSOR_STATUS_ERROR;
+
if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS)
{
// Disable collection on this DIMM, collect FFDC and log error
@@ -597,6 +606,92 @@ void SIMULATE_HOST()
// Function Specification
//
+// Name: process_dimm_temp
+//
+// Description: Validate and store DIMM temperature. Takes the port, DIMM index and raw reading from G_dimm_sm_args, substitutes the previous reading for out-of-range values, limits the per-sample change to MAX_MEM_TEMP_CHANGE, updates the FRU flags and the updated/over-temperature bitmaps, stores the result in cur_temp and clears the DIMM's errorCount. Takes no parameters and returns nothing.
+//
+// End Function Specification
+void process_dimm_temp()
+{
+ const uint8_t port = G_dimm_sm_args.i2cPort;
+ const uint8_t dimm = G_dimm_sm_args.dimm;
+ uint8_t l_dimm_temp = G_dimm_sm_args.temp; // raw reading; may be replaced or clamped below before it is stored
+
+#define MIN_VALID_DIMM_TEMP 1 // readings below this are treated as out of range
+#define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek; readings above this are treated as out of range
+#define MAX_MEM_TEMP_CHANGE 16 // largest rise or fall accepted between two consecutive stored readings
+
+ // Last DIMM read completed, update sensor and clear error count
+ DIMM_DBG("process_dimm_temp: Successfully read DIMM%04X temperature: %dC, tick %d",
+ DIMM_AND_PORT, l_dimm_temp, DIMM_TICK);
+
+ fru_temp_t* l_fru = &g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm]; // per-DIMM record holding cur_temp and flags
+
+ uint8_t l_prev_temp = l_fru->cur_temp;
+ if (l_prev_temp == 0) // a stored temperature of 0 is treated as "no previous reading"
+ {
+ l_prev_temp = l_dimm_temp; // seed with the new reading so the rate-of-change checks below cannot trigger
+ }
+
+ //make sure temperature is within a 'reasonable' range.
+ if (l_dimm_temp < MIN_VALID_DIMM_TEMP ||
+ l_dimm_temp > MAX_VALID_DIMM_TEMP)
+ {
+ //set a flag so that if we end up logging an error we have something to debug why
+ l_fru->flags |= FRU_TEMP_OUT_OF_RANGE;
+ l_dimm_temp = l_prev_temp; // NOTE(review): if cur_temp was 0, l_prev_temp was seeded with this same out-of-range reading, so the invalid value is what gets stored below - confirm this is intended
+ }
+ else
+ {
+ //don't allow temp to change more than is reasonable
+ if (l_dimm_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE)) // uint8_t operands promote to int, so the sum does not wrap
+ {
+ if (!l_fru->flags) // trace only when no flag at all is currently set on this DIMM
+ {
+ TRAC_INFO("dimm temp rose faster than reasonable: DIMM%04X prev[%d] cur[%d]",
+ DIMM_AND_PORT, l_prev_temp, l_dimm_temp);
+ l_fru->flags |= FRU_TEMP_FAST_CHANGE;
+ }
+ l_dimm_temp = l_prev_temp + MAX_MEM_TEMP_CHANGE; // clamp the rise to the maximum allowed step
+ }
+ else if (l_dimm_temp < (l_prev_temp - MAX_MEM_TEMP_CHANGE)) // evaluated as int: a negative difference simply makes this false
+ {
+ if (!l_fru->flags) // trace only when no flag at all is currently set on this DIMM
+ {
+ TRAC_INFO("dimm temp fell faster than reasonable: DIMM%04X prev[%d] cur[%d]",
+ DIMM_AND_PORT, l_prev_temp, l_dimm_temp);
+ l_fru->flags |= FRU_TEMP_FAST_CHANGE;
+ }
+ l_dimm_temp = l_prev_temp - MAX_MEM_TEMP_CHANGE; // clamp the fall to the maximum allowed step
+ }
+ else //reasonable amount of change occurred
+ {
+ l_fru->flags &= ~FRU_TEMP_FAST_CHANGE;
+ }
+
+ //Notify thermal thread that temperature has been updated
+ G_dimm_temp_updated_bitmap.bytes[port] |= DIMM_SENSOR0 >> dimm;
+
+ //clear other error flags (keeps only the FRU_TEMP_FAST_CHANGE bit; every other flag bit is cleared)
+ l_fru->flags &= FRU_TEMP_FAST_CHANGE;
+ }
+
+ //Check if at or above the error temperature
+ if (l_dimm_temp >= g_amec->thermaldimm.ot_error)
+ {
+ //Set a bit so that this dimm can be called out by the thermal thread
+ G_dimm_overtemp_bitmap.bytes[port] |= 1 << dimm; // NOTE(review): uses 1 << dimm while the updated bitmap above uses DIMM_SENSOR0 >> dimm - confirm the two bitmaps are meant to use different bit orders
+ }
+
+ l_fru->cur_temp = l_dimm_temp; // store the validated/clamped value as the current temperature
+ G_dimm[port][dimm].errorCount = 0; // reset this DIMM's error count now that a reading has been processed
+
+} // end process_dimm_temp()
+
+
+
+// Function Specification
+//
// Name: task_dimm_sm
//
// Description: DIMM State Machine - Called every other tick to collect all of
@@ -722,15 +817,8 @@ void task_dimm_sm(struct task *i_self)
case DIMM_STATE_READ_TEMP:
if (L_readIssued)
{
- const uint8_t port = G_dimm_sm_args.i2cPort;
- const uint8_t dimm = G_dimm_sm_args.dimm;
-
- // Last DIMM read completed, update sensor and clear error count
- DIMM_DBG("task_dimm_sm: Successfully read DIMM%04X temperature: %dC, tick %d",
- DIMM_AND_PORT, G_dimm_sm_args.temp, DIMM_TICK);
- g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].cur_temp = G_dimm_sm_args.temp;
- G_dimm[port][dimm].lastReading = ((ssx_timebase_get())/(SSX_TIMEBASE_FREQUENCY_HZ/1000000));
- G_dimm[port][dimm].errorCount = 0;
+ // Validate and store temperature
+ process_dimm_temp();
// Move on to next DIMM
use_next_dimm(&L_dimmPort, &L_dimmIndex);
diff --git a/src/occ_405/sensor/sensor_enum.h b/src/occ_405/sensor/sensor_enum.h
index e9a394a..5a6f44a 100755
--- a/src/occ_405/sensor/sensor_enum.h
+++ b/src/occ_405/sensor/sensor_enum.h
@@ -721,7 +721,7 @@ enum e_gsid
MLP2P0M7,
TEMP2MSCENT,
- TEMP2MSDIMM,
+ TEMP16MSDIMM,
MEMSP2MS,
// ------------------------------------------------------
diff --git a/src/occ_405/sensor/sensor_info.c b/src/occ_405/sensor/sensor_info.c
index b94bd30..08e0e54 100755
--- a/src/occ_405/sensor/sensor_info.c
+++ b/src/occ_405/sensor/sensor_info.c
@@ -280,7 +280,7 @@ const sensor_info_t G_sensor_info[] =
/* ==MemSummarySensors== NameString Units Type Location Number Freq ScaleFactor */
SENSOR_INFO_T_ENTRY( TEMP2MSCENT, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ),
- SENSOR_INFO_T_ENTRY( TEMP2MSDIMM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ),
+ SENSOR_INFO_T_ENTRY( TEMP16MSDIMM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ),
SENSOR_INFO_T_ENTRY( MEMSP2MS, "%\0", AMEC_SENSOR_TYPE_PERF, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ),
/* ==PartSummarySensors== NameString Units Type Location Number Freq ScaleFactor */
diff --git a/src/occ_405/sensor/sensor_table.c b/src/occ_405/sensor/sensor_table.c
index c4978e1..409b7af 100755
--- a/src/occ_405/sensor/sensor_table.c
+++ b/src/occ_405/sensor/sensor_table.c
@@ -362,7 +362,7 @@ const sensor_ptr_t G_amec_sensor_list[] =
SENSOR_PTR( TEMP2MSCENT, &g_amec_sys.proc[0].temp2mscent),
- SENSOR_PTR( TEMP2MSDIMM, &g_amec_sys.proc[0].temp2msdimm),
+ SENSOR_PTR( TEMP16MSDIMM, &g_amec_sys.proc[0].temp16msdimm),
SENSOR_PTR( MEMSP2MS, &g_amec_sys.proc[0].memsp2ms_tls),
// ------------------------------------------------------
@@ -511,7 +511,7 @@ const minisensor_ptr_t G_amec_mini_sensor_list[] INIT_SECTION =
PORTPAIR_MINI_SENSOR_PTRS_NULL(M4WR2MSP0M),
MINI_SENSOR_PTR( TEMP2MSCENT, &G_dcom_slv_outbox_tx.temp2mscent),
- MINI_SENSOR_PTR( TEMP2MSDIMM, &G_dcom_slv_outbox_tx.temp2msdimm),
+ MINI_SENSOR_PTR( TEMP16MSDIMM, &G_dcom_slv_outbox_tx.temp16msdimm),
MINI_SENSOR_PTR( MEMSP2MS, NULL),
// ------------------------------------------------------
diff --git a/src/occ_405/thread/chom.c b/src/occ_405/thread/chom.c
index ffa652e..06065a0 100755
--- a/src/occ_405/thread/chom.c
+++ b/src/occ_405/thread/chom.c
@@ -248,9 +248,9 @@ void chom_update_sensors()
l_max_cent_temp = G_dcom_slv_outbox_rx[i].temp2mscent;
}
- if (G_dcom_slv_outbox_rx[i].temp2msdimm > l_max_dimm_temp)
+ if (G_dcom_slv_outbox_rx[i].temp16msdimm > l_max_dimm_temp)
{
- l_max_dimm_temp = G_dcom_slv_outbox_rx[i].temp2msdimm;
+ l_max_dimm_temp = G_dcom_slv_outbox_rx[i].temp16msdimm;
}
}
g_chom->sensorData[0].sensor[CHOMTEMPPROC].sample = l_max_core_temp;
OpenPOWER on IntegriCloud