summary | refs | log | tree | commit | diff | stats
path: root/src/occ_405/amec/amec_sensors_ocmb.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/occ_405/amec/amec_sensors_ocmb.c')
-rw-r--r-- src/occ_405/amec/amec_sensors_ocmb.c 364
1 files changed, 215 insertions, 149 deletions
diff --git a/src/occ_405/amec/amec_sensors_ocmb.c b/src/occ_405/amec/amec_sensors_ocmb.c
index a2cd4a6..8ffbefa 100644
--- a/src/occ_405/amec/amec_sensors_ocmb.c
+++ b/src/occ_405/amec/amec_sensors_ocmb.c
@@ -110,21 +110,21 @@ void amec_update_ocmb_sensors(uint8_t i_membuf)
// End Function Specification
void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_membuf)
{
+// confirmed ok to use same values for all types (internal mc, dimm, external mc, pmic...)
#define MIN_VALID_DIMM_TEMP 1
#define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek 04/23/2019
-#define MAX_MEM_TEMP_CHANGE 2
+#define MAX_MEM_TEMP_CHANGE 4
- uint32_t k, l_hottest_dimm_temp;
+ uint32_t k;
uint16_t l_dts[NUM_DIMMS_PER_OCMB] = {0};
- uint32_t l_hottest_dimm_loc = NUM_DIMMS_PER_OCMB;
int32_t l_dimm_temp, l_prev_temp;
static uint8_t L_ran_once[MAX_NUM_OCMBS] = {FALSE};
- static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}};
- // Harvest thermal data for all dimms
+ // Harvest thermal data for memory thermal sensors that are enabled and being used
for(k=0; k < NUM_DIMMS_PER_OCMB; k++)
{
- if(!CENTAUR_SENSOR_ENABLED(i_membuf, k))
+ if( (!CENTAUR_SENSOR_ENABLED(i_membuf, k)) ||
+ (g_amec->proc[0].memctl[i_membuf].centaur.dimm_temps[k].temp_fru_type == DATA_FRU_NOT_USED) )
{
continue;
}
@@ -250,48 +250,17 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m
}
}
- //Check if at or above the error temperature
- if(l_dts[k] >= g_amec->thermaldimm.ot_error)
- {
- //Set a bit so that this dimm can be called out by the thermal thread
- G_dimm_overtemp_bitmap.bytes[i_membuf] |= (DIMM_SENSOR0 >> k);
- // trace first time OT per DIMM
- if( !L_ot_traced[i_membuf][k] )
- {
- TRAC_ERR("amec_update_ocmb_dimm_dts_sensors: Mem Buf[%d] DIMM[%d] reached error temp[%d]. current[%d]",
- i_membuf,
- k,
- g_amec->thermaldimm.ot_error,
- l_dts[k]);
- L_ot_traced[i_membuf][k] = true;
- }
- }
+ //Check for over temperature must be done by type and will be checked
+ // in amec_update_ocmb_temp_sensors() which happens after all OCMBs have been read
}
- // Find hottest temperature from all DIMMs for this centaur
- for(l_hottest_dimm_temp = 0, k = 0; k < NUM_DIMMS_PER_OCMB; k++)
+ // update the current temperatures
+ for(k = 0; k < NUM_DIMMS_PER_OCMB; k++)
{
- if(l_dts[k] > l_hottest_dimm_temp)
- {
- l_hottest_dimm_temp = l_dts[k];
- l_hottest_dimm_loc = k;
- }
g_amec->proc[0].memctl[i_membuf].centaur.dimm_temps[k].cur_temp = l_dts[k];
}
- amec_centaur_t* l_centaur_ptr = &g_amec->proc[0].memctl[i_membuf].centaur;
-
- //only update location if hottest dimm temp is greater than previous maximum
- if(l_hottest_dimm_temp > l_centaur_ptr->tempdimmax.sample_max)
- {
- sensor_update(&l_centaur_ptr->locdimmax, l_hottest_dimm_loc);
- }
-
- //update the max dimm temperature sensor for this centaur
- sensor_update(&l_centaur_ptr->tempdimmax, l_hottest_dimm_temp);
-
L_ran_once[i_membuf] = TRUE;
- AMEC_DBG("Centaur[%d]: HotDimm=%d\n",i_membuf,l_hottest_dimm_temp);
}
// Function Specification
@@ -317,129 +286,223 @@ void amec_update_ocmb_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_membuf
fru_temp_t* l_fru = &g_amec->proc[0].memctl[i_membuf].centaur.centaur_hottest;
- l_prev_temp = l_fru->cur_temp;
- if(!l_prev_temp)
+ // Internal DTS sensor is either for internal memctrl or not being used
+ // ignore the internal sensor if it isn't marked for internal memctrl
+ if(l_fru->temp_fru_type == DATA_FRU_CENTAUR)
{
- l_prev_temp = l_sens_temp;
- }
-
- //Check DTS status bits
- if(i_sensor_cache->status.fields.ubdts0_valid &&
- (!i_sensor_cache->status.fields.ubdts0_err))
- {
- //make sure temperature is within a 'reasonable' range.
- if(l_sens_temp < MIN_VALID_MEMBUF_TEMP ||
- l_sens_temp > MAX_VALID_MEMBUF_TEMP)
- {
- //set a flag so that if we end up logging an error we have something to debug why
- l_fru->flags |= FRU_TEMP_OUT_OF_RANGE;
- l_dts = l_prev_temp;
- }
- else
- {
- //don't allow temp to change more than is reasonable since last read
- if(l_sens_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE))
- {
- l_dts = l_prev_temp + MAX_MEM_TEMP_CHANGE;
- if(!l_fru->flags)
- {
- TRAC_INFO("membuf temp rose faster than reasonable: membuf[%d] prev[%d] cur[%d]",
- i_membuf, l_prev_temp, l_sens_temp);
- l_fru->flags |= FRU_TEMP_FAST_CHANGE;
- }
- }
- else if (l_sens_temp < (l_prev_temp - MAX_MEM_TEMP_CHANGE))
- {
- l_dts = l_prev_temp - MAX_MEM_TEMP_CHANGE;
- if(!l_fru->flags)
- {
- TRAC_INFO("membuf temp fell faster than reasonable: cent[%d] prev[%d] cur[%d]",
- i_membuf, l_prev_temp, l_sens_temp);
- l_fru->flags |= FRU_TEMP_FAST_CHANGE;
- }
- }
- else //reasonable amount of change occurred
- {
- l_dts = l_sens_temp;
- l_fru->flags &= ~FRU_TEMP_FAST_CHANGE;
- }
-
- //Notify thermal thread that temperature has been updated
- G_cent_temp_updated_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
-
- //clear error flags
- l_fru->flags &= FRU_TEMP_FAST_CHANGE;
- }
+ l_prev_temp = l_fru->cur_temp;
+ if(!l_prev_temp)
+ {
+ l_prev_temp = l_sens_temp;
+ }
+
+ //Check DTS status bits
+ if(i_sensor_cache->status.fields.ubdts0_valid &&
+ (!i_sensor_cache->status.fields.ubdts0_err))
+ {
+ //make sure temperature is within a 'reasonable' range.
+ if(l_sens_temp < MIN_VALID_MEMBUF_TEMP ||
+ l_sens_temp > MAX_VALID_MEMBUF_TEMP)
+ {
+ //set a flag so that if we end up logging an error we have something to debug why
+ l_fru->flags |= FRU_TEMP_OUT_OF_RANGE;
+ l_dts = l_prev_temp;
+ }
+ else
+ {
+ //don't allow temp to change more than is reasonable since last read
+ if(l_sens_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE))
+ {
+ l_dts = l_prev_temp + MAX_MEM_TEMP_CHANGE;
+ if(!l_fru->flags)
+ {
+ TRAC_INFO("membuf temp rose faster than reasonable: membuf[%d] prev[%d] cur[%d]",
+ i_membuf, l_prev_temp, l_sens_temp);
+ l_fru->flags |= FRU_TEMP_FAST_CHANGE;
+ }
+ }
+ else if (l_sens_temp < (l_prev_temp - MAX_MEM_TEMP_CHANGE))
+ {
+ l_dts = l_prev_temp - MAX_MEM_TEMP_CHANGE;
+ if(!l_fru->flags)
+ {
+ TRAC_INFO("membuf temp fell faster than reasonable: cent[%d] prev[%d] cur[%d]",
+ i_membuf, l_prev_temp, l_sens_temp);
+ l_fru->flags |= FRU_TEMP_FAST_CHANGE;
+ }
+ }
+ else //reasonable amount of change occurred
+ {
+ l_dts = l_sens_temp;
+ l_fru->flags &= ~FRU_TEMP_FAST_CHANGE;
+ }
+
+ //Notify thermal thread that temperature has been updated
+ G_cent_temp_updated_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
+
+ //clear error flags
+ l_fru->flags &= FRU_TEMP_FAST_CHANGE;
+ }
+ }
+ else //status was INVALID
+ {
+ if(L_ran_once[i_membuf])
+ {
+ //Trace the error if we haven't traced it already for this sensor
+ if(!(l_fru->flags & FRU_SENSOR_STATUS_INVALID) &&
+ i_sensor_cache->status.fields.ubdts0_err)
+ {
+ TRAC_ERR("Membuf %d temp sensor error.", i_membuf);
+ }
+
+ l_fru->flags |= FRU_SENSOR_STATUS_INVALID;
+ }
+
+ //use last temperature
+ l_dts = l_prev_temp;
+ }
+
+ L_ran_once[i_membuf] = TRUE;
+
+ //Check if at or above the error temperature
+ if(l_dts >= g_amec->thermalcent.ot_error)
+ {
+ //Set a bit so that this dimm can be called out by the thermal thread
+ G_cent_overtemp_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
+ }
+
+ // Update Interim Data - later this will get picked up to form centaur sensor
+ l_fru->cur_temp = l_dts;
+
+ AMEC_DBG("Membuf[%d]: HotMembuf=%d\n",i_membuf,l_dts);
}
- else //status was INVALID
+ else // internal sensor not being used
{
- if(L_ran_once[i_membuf])
- {
- //Trace the error if we haven't traced it already for this sensor
- if(!(l_fru->flags & FRU_SENSOR_STATUS_INVALID) &&
- i_sensor_cache->status.fields.ubdts0_err)
- {
- TRAC_ERR("Membuf %d temp sensor error.", i_membuf);
- }
+ // make sure temperature is 0 indicating not present
+ l_fru->cur_temp = 0;
- l_fru->flags |= FRU_SENSOR_STATUS_INVALID;
- }
+ //Notify thermal thread that temperature has been updated so no timeout error is logged
+ G_cent_temp_updated_bitmap |= CENTAUR0_PRESENT_MASK >> i_membuf;
- //use last temperature
- l_dts = l_prev_temp;
+ //clear error flags
+ l_fru->flags = 0;
}
-
- L_ran_once[i_membuf] = TRUE;
-
- //Check if at or above the error temperature
- if(l_dts >= g_amec->thermalcent.ot_error)
- {
- //Set a bit so that this dimm can be called out by the thermal thread
- G_cent_overtemp_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);
- }
-
- // Update Interim Data - later this will get picked up to form centaur sensor
- g_amec->proc[0].memctl[i_membuf].centaur.centaur_hottest.cur_temp = l_dts;
-
- AMEC_DBG("Membuf[%d]: HotMembuf=%d\n",i_membuf,l_dts);
}
// Function Specification
//
// Name: amec_update_ocmb_temp_sensors
//
-// Description: Updates thermal sensors that have data grabbed by the centaur.
+// Description: Updates thermal sensors to give summary (across all OCMBs) for each mem type
//
// Thread: RealTime Loop
//
// End Function Specification
void amec_update_ocmb_temp_sensors(void)
{
- uint32_t k;
+ uint32_t k, l_dimm;
uint32_t l_hot_dimm = 0;
uint32_t l_hot_mb = 0;
+ uint32_t l_hot_mb_dimm = 0;
+ uint32_t l_hot_pmic = 0;
+ uint32_t l_hot_ext_mb = 0;
+ uint8_t l_ot_error = 0;
+ uint8_t l_cur_temp = 0;
+ uint8_t l_fru_type = DATA_FRU_NOT_USED;
+ static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}};
- // -----------------------------------------------------------
- // Find hottest temperature from all membufs for this Proc chip
- // Find hottest temperature from all DIMMs for this Proc chip
- // -----------------------------------------------------------
for(k=0; k < MAX_NUM_OCMBS; k++)
{
- if(g_amec->proc[0].memctl[k].centaur.centaur_hottest.cur_temp > l_hot_mb)
+ // Find hottest temperature from all internal membufs for this Proc chip
+ // make sure the type is "CENTAUR" i.e. internal memory controller temp
+ if( (g_amec->proc[0].memctl[k].centaur.centaur_hottest.temp_fru_type == DATA_FRU_CENTAUR) &&
+ (g_amec->proc[0].memctl[k].centaur.centaur_hottest.cur_temp > l_hot_mb) )
{
l_hot_mb = g_amec->proc[0].memctl[k].centaur.centaur_hottest.cur_temp;
}
- if(g_amec->proc[0].memctl[k].centaur.tempdimmax.sample > l_hot_dimm)
+
+ // process each of the thermal sensors (stored as "dimm" temps)
+ // based on what type they are for and finding the hottest for each type
+ for(l_dimm=0; l_dimm < NUM_DIMMS_PER_OCMB; l_dimm++)
{
- l_hot_dimm = g_amec->proc[0].memctl[k].centaur.tempdimmax.sample;
- }
- }
+ l_fru_type = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].temp_fru_type;
+ l_cur_temp = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+
+ switch(l_fru_type)
+ {
+ case DATA_FRU_DIMM:
+ l_ot_error = g_amec->thermaldimm.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_dimm)
+ {
+ l_hot_dimm = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_MEMCTRL_DRAM:
+ l_ot_error = g_amec->thermalmcdimm.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_mb_dimm)
+ {
+ l_hot_mb_dimm = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_PMIC:
+ l_ot_error = g_amec->thermalpmic.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_pmic)
+ {
+ l_hot_pmic = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_MEMCTRL_EXT:
+ l_ot_error = g_amec->thermalmcext.ot_error;
+ if(g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp > l_hot_ext_mb)
+ {
+ l_hot_ext_mb = g_amec->proc[0].memctl[k].centaur.dimm_temps[l_dimm].cur_temp;
+ }
+ break;
+
+ case DATA_FRU_NOT_USED:
+ default:
+ // ignore reading
+ l_ot_error = 0;
+ break;
+ } // end switch fru type
+
+ // check if this "DIMM" sensor is over its error temperature
+ if( l_ot_error && (l_cur_temp >= l_ot_error) )
+ {
+ //Set a bit so that this sensor can be called out by the thermal thread
+ G_dimm_overtemp_bitmap.bytes[k] |= (DIMM_SENSOR0 >> l_dimm);
+ // trace first time OT per DIMM DTS sensor
+ if( !L_ot_traced[k][l_dimm] )
+ {
+ TRAC_ERR("amec_update_ocmb_temp_sensors: OCMB[%d] DTS[%d] type[0x%02X] reached error temp[%d]. current[%d]",
+ k,
+ l_dimm,
+ l_fru_type,
+ l_ot_error,
+ l_cur_temp);
+ L_ot_traced[k][l_dimm] = true;
+ }
+ }
+ } // end for each "dimm" thermal sensor
+ } // end for each OCMB
+
sensor_update(&g_amec->proc[0].tempcent,l_hot_mb);
AMEC_DBG("HotMembuf=%d\n",l_hot_mb);
sensor_update(&g_amec->proc[0].tempdimmthrm,l_hot_dimm);
AMEC_DBG("HotDimm=%d\n",l_hot_dimm);
+ sensor_update(&g_amec->proc[0].tempmcdimmthrm,l_hot_mb_dimm);
+ AMEC_DBG("HotMCDimm=%d\n",l_hot_mb_dimm);
+
+ sensor_update(&g_amec->proc[0].temppmicthrm,l_hot_pmic);
+ AMEC_DBG("HotPmic=%d\n",l_hot_pmic);
+
+ sensor_update(&g_amec->proc[0].tempmcextthrm,l_hot_ext_mb);
+ AMEC_DBG("HotExternalMembuf=%d\n",l_hot_ext_mb);
}
@@ -510,24 +573,27 @@ void amec_perfcount_ocmb_getmc( OcmbMemData * i_sensor_cache,
g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memread2ms = tempreg;
- // Go after second MC performance counter (power ups and activations)
- tempu = l_sensor_cache->mba_act;
- templ = l_sensor_cache->mba_powerups;
-
- // ------------------------------------------------------------
- // Sensor: MRDMx (0.01 Mrps) Memory read requests per sec
- // ------------------------------------------------------------
- tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memread2ms;
- tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memread2ms;
- sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mrd)), tempreg);
-
- // -------------------------------------------------------------
- // Sensor: MWRMx (0.01 Mrps) Memory write requests per sec
- // -------------------------------------------------------------
- tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memwrite2ms;
- tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memwrite2ms;
- sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mwr)), tempreg);
-
+ // Due to limited SRAM only have sensor support for first 12 mem buffs
+ if(i_membuf < 12)
+ {
+ // Go after second MC performance counter (power ups and activations)
+ tempu = l_sensor_cache->mba_act;
+ templ = l_sensor_cache->mba_powerups;
+
+ // ------------------------------------------------------------
+ // Sensor: MRDMx (0.01 Mrps) Memory read requests per sec
+ // ------------------------------------------------------------
+ tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memread2ms;
+ tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memread2ms;
+ sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mrd)), tempreg);
+
+ // -------------------------------------------------------------
+ // Sensor: MWRMx (0.01 Mrps) Memory write requests per sec
+ // -------------------------------------------------------------
+ tempreg = g_amec->proc[0].memctl[i_membuf].centaur.portpair[0].perf.memwrite2ms;
+ tempreg += g_amec->proc[0].memctl[i_membuf].centaur.portpair[1].perf.memwrite2ms;
+ sensor_update( (&(g_amec->proc[0].memctl[i_membuf].mwr)), tempreg);
+ }
return;
}
OpenPOWER on IntegriCloud