From 5b69d47a980cc851b7eea9f45f7eb9bb0f567a04 Mon Sep 17 00:00:00 2001 From: Chris Cain Date: Wed, 3 Aug 2016 10:57:57 -0500 Subject: Add DIMM temperature validation, update fru flags, and rename dimm sensor Change-Id: Ie201160d92b0d00dd523c78eb1496a1b05e2647a Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/27836 Tested-by: FSP CI Jenkins Reviewed-by: Martha Broyles Reviewed-by: William A. Bryan Reviewed-by: Christopher J. Cain --- src/occ_405/amec/amec_analytics.c | 4 +- src/occ_405/amec/amec_controller.c | 4 +- src/occ_405/amec/amec_health.c | 9 ++- src/occ_405/amec/amec_sensors_centaur.c | 2 +- src/occ_405/amec/amec_sys.h | 2 +- src/occ_405/dcom/dcom.h | 2 +- src/occ_405/dimm/dimm.c | 120 +++++++++++++++++++++++++++----- src/occ_405/sensor/sensor_enum.h | 2 +- src/occ_405/sensor/sensor_info.c | 2 +- src/occ_405/sensor/sensor_table.c | 4 +- src/occ_405/thread/chom.c | 4 +- 11 files changed, 121 insertions(+), 34 deletions(-) (limited to 'src') diff --git a/src/occ_405/amec/amec_analytics.c b/src/occ_405/amec/amec_analytics.c index 29e8fe9..d7dd1f8 100755 --- a/src/occ_405/amec/amec_analytics.c +++ b/src/occ_405/amec/amec_analytics.c @@ -465,7 +465,7 @@ void amec_analytics_main(void) break; case 4: - tempreg = (g_amec->proc[0].temp2msdimm.sample) << 8; // upper byte + tempreg = (g_amec->proc[0].temp16msdimm.sample) << 8; // upper byte break; case 5: @@ -473,7 +473,7 @@ void amec_analytics_main(void) break; case 6: - // tempreg=(g_amec->proc[2].temp2msdimm.sample)<<8; // upper byte + // tempreg=(g_amec->proc[2].temp16msdimm.sample)<<8; // upper byte tempreg = 0; break; diff --git a/src/occ_405/amec/amec_controller.c b/src/occ_405/amec/amec_controller.c index fe27587..3a91087 100644 --- a/src/occ_405/amec/amec_controller.c +++ b/src/occ_405/amec/amec_controller.c @@ -176,8 +176,8 @@ void amec_controller_dimm_thermal() /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ - // Get TEMP2MSDIMM sensor value - l_sensor = getSensorByGsid(TEMP2MSDIMM); + // Get TEMP16MSDIMM sensor value + l_sensor = getSensorByGsid(TEMP16MSDIMM); if(G_dimm_temp_expired_bitmap.bigword) { diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index a559f04..7eaa84e 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -86,13 +86,12 @@ uint64_t amec_mem_get_huid(uint8_t i_cent, uint8_t i_dimm) { //we're being asked for a dimm huid l_huid = G_sysConfigData.dimm_huids[i_cent][i_dimm]; - if(!l_huid) + if((l_huid == 0) && (MEM_TYPE_CUMULUS == G_sysConfigData.mem_type)) { - //if we don't have a valid dimm huid, use the - //centaur huid. - //TODO: this will not work for ISDIMMS. + //if we don't have a valid dimm huid, use the centaur huid. l_huid = G_sysConfigData.centaur_huids[i_cent]; } + // else NIMBUS huid of 0 indicates not present (should never get called) } return l_huid; } @@ -142,7 +141,7 @@ void amec_health_check_dimm_temp() } l_ot_error = g_amec->thermaldimm.ot_error; - l_sensor = getSensorByGsid(TEMP2MSDIMM); + l_sensor = getSensorByGsid(TEMP16MSDIMM); l_cur_temp = l_sensor->sample; l_max_temp = l_sensor->sample_max; TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. cur_max[%d], hist_max[%d]", diff --git a/src/occ_405/amec/amec_sensors_centaur.c b/src/occ_405/amec/amec_sensors_centaur.c index 0eb7187..7be2ef8 100644 --- a/src/occ_405/amec/amec_sensors_centaur.c +++ b/src/occ_405/amec/amec_sensors_centaur.c @@ -414,7 +414,7 @@ void amec_update_centaur_temp_sensors(void) l_hot = g_amec->proc[0].memctl[k].centaur.tempdimmax.sample; } } - sensor_update(&g_amec->proc[0].temp2msdimm,l_hot); + sensor_update(&g_amec->proc[0].temp16msdimm,l_hot); AMEC_DBG("HotDimm=%d\n",l_hot); } diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index 9e920cb..f405fa7 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -463,7 +463,7 @@ typedef struct // Memory Summary Sensors sensor_t temp2mscent; - sensor_t temp2msdimm; + sensor_t temp16msdimm; sensor_t memsp2ms_tls; // Calculations & Interim Data diff --git a/src/occ_405/dcom/dcom.h b/src/occ_405/dcom/dcom.h index 1ff263d..954d03b 100755 --- a/src/occ_405/dcom/dcom.h +++ b/src/occ_405/dcom/dcom.h @@ -213,7 +213,7 @@ typedef struct __attribute__ ((packed)) uint16_t pwrpx250usp0cy[MAX_CORES]; // [260] uint16_t todclock[NUM_TOD_SENSORS]; // [308] uint16_t temp2mscent; // [314] - uint16_t temp2msdimm; // [316] + uint16_t temp16msdimm; // [316] uint16_t util2msp0; // [318] uint16_t ips2msp0; // [320] uint16_t nutil3sp0cy[MAX_CORES]; // [322] diff --git a/src/occ_405/dimm/dimm.c b/src/occ_405/dimm/dimm.c index fd308a9..9453514 100755 --- a/src/occ_405/dimm/dimm.c +++ b/src/occ_405/dimm/dimm.c @@ -42,6 +42,7 @@ #include "amec_sys.h" #include "lock.h" #include "common.h" +#include "centaur_data.h" extern bool G_mem_monitoring_allowed; extern task_t G_task_table[TASK_END]; @@ -53,12 +54,14 @@ bool G_dimm_i2c_reset_required = false; uint32_t G_dimm_i2c_reset_cause = 0; #define MAX_CONSECUTIVE_DIMM_RESETS 1 +// On Nimbus, we are using the centaur number as the I2C port (keep same structure) +// There can be 8 DIMMs under a Centaur and 8 DIMMs per I2C port (max of 2 ports) +#define NUM_DIMMS_PER_I2CPORT NUM_DIMMS_PER_CENTAUR typedef struct { bool disabled; uint8_t errorCount; - uint64_t lastReading; } dimmData_t; -dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_CENTAUR] = {{{false,0}}}; +dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{{false,0}}}; // If still no i2c interrupt after MAX_TICK_COUNT_WAIT, then try next operation anyway #define MAX_TICK_COUNT_WAIT 2 @@ -171,7 +174,7 @@ void memory_init() // Initialization was successful. Set task flags to allow memory // tasks to run and also prevent from doing initialization again. G_task_table[mem_task].flags = MEMORY_DATA_RTL_FLAGS; - //G_task_table[TASK_ID_CENTAUR_CONTROL].flags = MEMORY_CONTROL_RTL_FLAGS; + //G_task_table[mem_task].flags = MEMORY_CONTROL_RTL_FLAGS; } } } @@ -202,7 +205,7 @@ void update_hottest_dimm() int pIndex, dIndex; for (pIndex = 0; pIndex < G_maxDimmPorts; ++pIndex) { - for (dIndex = 0; dIndex < NUM_DIMMS_PER_CENTAUR; ++dIndex) + for (dIndex = 0; dIndex < NUM_DIMMS_PER_I2CPORT; ++dIndex) { if (g_amec->proc[0].memctl[pIndex].centaur.dimm_temps[dIndex].cur_temp > hottest) { @@ -215,20 +218,24 @@ void update_hottest_dimm() DIMM_DBG("update_hottest_dimm: hottest DIMM temp for this sample: %dC (loc=%d)", hottest, hottest_loc); if(hottest > g_amec->proc[0].memctl[0].centaur.tempdimmax.sample_max) { - // Save hottest DIMM location ever sampled + // Save hottest DIMM location ever sampled. There is no location for the temp16msdimm + // sensor, so just store it in memctl[0] location. DIMM_DBG("update_hottest_dimm: Hottest DIMM ever sampled was DIMM%d %dC (prior %dC)", hottest_loc, hottest, g_amec->proc[0].memctl[0].centaur.tempdimmax.sample_max); + // Store the hottest DIMM location in locdimmax sensor sensor_update(&g_amec->proc[0].memctl[0].centaur.locdimmax, hottest_loc); } - // Nimbus has no Centaurs, but store hottest temp in memctl[0] + // Store the hottest DIMM temp in tempdimmax sensor sensor_update(&g_amec->proc[0].memctl[0].centaur.tempdimmax, hottest); + // Store the hottest DIMM temp in temp16msdimm sensor + sensor_update(&g_amec->proc[0].temp16msdimm, hottest); } // Update current I2C port/DIMM index to next potential DIMM void use_next_dimm(uint8_t * i_port, uint8_t * i_dimm) { - if (++*i_dimm == NUM_DIMMS_PER_CENTAUR) + if (++*i_dimm == NUM_DIMMS_PER_I2CPORT) { // Finished all DIMMs for current port, switch to new port *i_port = 1 - *i_port; @@ -257,6 +264,8 @@ void mark_dimm_failed() WORD_LOW(G_dimm_sm_args.error.ffdc), G_dimm_sm_request.request.completion_state); + g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].flags |= FRU_SENSOR_STATUS_ERROR; + if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS) { // Disable collection on this DIMM, collect FFDC and log error @@ -595,6 +604,92 @@ void SIMULATE_HOST() +// Function Specification +// +// Name: process_dimm_temp +// +// Description: Validate and store DIMM temperature +// +// End Function Specification +void process_dimm_temp() +{ + const uint8_t port = G_dimm_sm_args.i2cPort; + const uint8_t dimm = G_dimm_sm_args.dimm; + uint8_t l_dimm_temp = G_dimm_sm_args.temp; + +#define MIN_VALID_DIMM_TEMP 1 +#define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek +#define MAX_MEM_TEMP_CHANGE 16 + + // Last DIMM read completed, update sensor and clear error count + DIMM_DBG("process_dimm_temp: Successfully read DIMM%04X temperature: %dC, tick %d", + DIMM_AND_PORT, l_dimm_temp, DIMM_TICK); + + fru_temp_t* l_fru = &g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm]; + + uint8_t l_prev_temp = l_fru->cur_temp; + if (l_prev_temp == 0) + { + l_prev_temp = l_dimm_temp; + } + + //make sure temperature is within a 'reasonable' range. + if (l_dimm_temp < MIN_VALID_DIMM_TEMP || + l_dimm_temp > MAX_VALID_DIMM_TEMP) + { + //set a flag so that if we end up logging an error we have something to debug why + l_fru->flags |= FRU_TEMP_OUT_OF_RANGE; + l_dimm_temp = l_prev_temp; + } + else + { + //don't allow temp to change more than is reasonable + if (l_dimm_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE)) + { + if (!l_fru->flags) + { + TRAC_INFO("dimm temp rose faster than reasonable: DIMM%04X prev[%d] cur[%d]", + DIMM_AND_PORT, l_prev_temp, l_dimm_temp); + l_fru->flags |= FRU_TEMP_FAST_CHANGE; + } + l_dimm_temp = l_prev_temp + MAX_MEM_TEMP_CHANGE; + } + else if (l_dimm_temp < (l_prev_temp - MAX_MEM_TEMP_CHANGE)) + { + if (!l_fru->flags) + { + TRAC_INFO("dimm temp fell faster than reasonable: DIMM%04X prev[%d] cur[%d]", + DIMM_AND_PORT, l_prev_temp, l_dimm_temp); + l_fru->flags |= FRU_TEMP_FAST_CHANGE; + } + l_dimm_temp = l_prev_temp - MAX_MEM_TEMP_CHANGE; + } + else //reasonable amount of change occurred + { + l_fru->flags &= ~FRU_TEMP_FAST_CHANGE; + } + + //Notify thermal thread that temperature has been updated + G_dimm_temp_updated_bitmap.bytes[port] |= DIMM_SENSOR0 >> dimm; + + //clear other error flags + l_fru->flags &= FRU_TEMP_FAST_CHANGE; + } + + //Check if at or above the error temperature + if (l_dimm_temp >= g_amec->thermaldimm.ot_error) + { + //Set a bit so that this dimm can be called out by the thermal thread + G_dimm_overtemp_bitmap.bytes[port] |= 1 << dimm; + } + + l_fru->cur_temp = l_dimm_temp; + G_dimm[port][dimm].errorCount = 0; + +} // end process_dimm_temp() + + + // Function Specification // // Name: task_dimm_sm @@ -722,15 +817,8 @@ void task_dimm_sm(struct task *i_self) case DIMM_STATE_READ_TEMP: if (L_readIssued) { - const uint8_t port = G_dimm_sm_args.i2cPort; - const uint8_t dimm = G_dimm_sm_args.dimm; - - // Last DIMM read completed, update sensor and clear error count - DIMM_DBG("task_dimm_sm: Successfully read DIMM%04X temperature: %dC, tick %d", - DIMM_AND_PORT, G_dimm_sm_args.temp, DIMM_TICK); - g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].cur_temp = G_dimm_sm_args.temp; - G_dimm[port][dimm].lastReading = ((ssx_timebase_get())/(SSX_TIMEBASE_FREQUENCY_HZ/1000000)); - G_dimm[port][dimm].errorCount = 0; + // Validate and store temperature + process_dimm_temp(); // Move on to next DIMM use_next_dimm(&L_dimmPort, &L_dimmIndex); diff --git a/src/occ_405/sensor/sensor_enum.h b/src/occ_405/sensor/sensor_enum.h index e9a394a..5a6f44a 100755 --- a/src/occ_405/sensor/sensor_enum.h +++ b/src/occ_405/sensor/sensor_enum.h @@ -721,7 +721,7 @@ enum e_gsid MLP2P0M7, TEMP2MSCENT, - TEMP2MSDIMM, + TEMP16MSDIMM, MEMSP2MS, // ------------------------------------------------------ diff --git a/src/occ_405/sensor/sensor_info.c b/src/occ_405/sensor/sensor_info.c index b94bd30..08e0e54 100755 --- a/src/occ_405/sensor/sensor_info.c +++ b/src/occ_405/sensor/sensor_info.c @@ -280,7 +280,7 @@ const sensor_info_t G_sensor_info[] = /* ==MemSummarySensors== NameString Units Type Location Number Freq ScaleFactor */ SENSOR_INFO_T_ENTRY( TEMP2MSCENT, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ), - SENSOR_INFO_T_ENTRY( TEMP2MSDIMM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ), + SENSOR_INFO_T_ENTRY( TEMP16MSDIMM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ), SENSOR_INFO_T_ENTRY( MEMSP2MS, "%\0", AMEC_SENSOR_TYPE_PERF, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_250US_IN_HZ, AMEFP( 1, 0) ), /* ==PartSummarySensors== NameString Units Type Location Number Freq ScaleFactor */ diff --git a/src/occ_405/sensor/sensor_table.c b/src/occ_405/sensor/sensor_table.c index c4978e1..409b7af 100755 --- a/src/occ_405/sensor/sensor_table.c +++ b/src/occ_405/sensor/sensor_table.c @@ -362,7 +362,7 @@ const sensor_ptr_t G_amec_sensor_list[] = SENSOR_PTR( TEMP2MSCENT, &g_amec_sys.proc[0].temp2mscent), - SENSOR_PTR( TEMP2MSDIMM, &g_amec_sys.proc[0].temp2msdimm), + SENSOR_PTR( TEMP16MSDIMM, &g_amec_sys.proc[0].temp16msdimm), SENSOR_PTR( MEMSP2MS, &g_amec_sys.proc[0].memsp2ms_tls), // ------------------------------------------------------ @@ -511,7 +511,7 @@ const minisensor_ptr_t G_amec_mini_sensor_list[] INIT_SECTION = PORTPAIR_MINI_SENSOR_PTRS_NULL(M4WR2MSP0M), MINI_SENSOR_PTR( TEMP2MSCENT, &G_dcom_slv_outbox_tx.temp2mscent), - MINI_SENSOR_PTR( TEMP2MSDIMM, &G_dcom_slv_outbox_tx.temp2msdimm), + MINI_SENSOR_PTR( TEMP16MSDIMM, &G_dcom_slv_outbox_tx.temp16msdimm), MINI_SENSOR_PTR( MEMSP2MS, NULL), // ------------------------------------------------------ diff --git a/src/occ_405/thread/chom.c b/src/occ_405/thread/chom.c index ffa652e..06065a0 100755 --- a/src/occ_405/thread/chom.c +++ b/src/occ_405/thread/chom.c @@ -248,9 +248,9 @@ void chom_update_sensors() l_max_cent_temp = G_dcom_slv_outbox_rx[i].temp2mscent; } - if (G_dcom_slv_outbox_rx[i].temp2msdimm > l_max_dimm_temp) + if (G_dcom_slv_outbox_rx[i].temp16msdimm > l_max_dimm_temp) { - l_max_dimm_temp = G_dcom_slv_outbox_rx[i].temp2msdimm; + l_max_dimm_temp = G_dcom_slv_outbox_rx[i].temp16msdimm; } } g_chom->sensorData[0].sensor[CHOMTEMPPROC].sample = l_max_core_temp; -- cgit v1.2.1