author     mbroyles <mbroyles@us.ibm.com>             2017-11-01 08:44:03 -0500
committer  Christopher J. Cain <cjcain@us.ibm.com>    2017-11-03 15:16:58 -0400
commit     bb703e413c363255a578ffdb35e954a5b71de4c9 (patch)
tree       7c804d5d9b1b6853b832526cefdfa391e1bf3e48
parent     d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102 (diff)
download   talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.tar.gz
           talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.zip
Prevent logging 2A11 when quad is offline
Change-Id: Ic8cda9a6b8b311057ba2c4f0d9dc7e228e700c27
CQ: SW404469
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/49093
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
-rwxr-xr-x  src/occ_405/amec/amec_dps.c            4
-rwxr-xr-x  src/occ_405/amec/amec_freq.c          79
-rwxr-xr-x  src/occ_405/amec/amec_health.c         2
-rwxr-xr-x  src/occ_405/amec/amec_sensors_core.c  96
-rwxr-xr-x  src/occ_405/proc/proc_data.c          37
-rwxr-xr-x  src/occ_405/proc/proc_data.h          21
6 files changed, 181 insertions, 58 deletions
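The diff below gates AMEC control loops and sensor updates on a new per-core offline mask (G_core_offline_mask) and the CORE_OFFLINE / QUAD_ONLINE macros added to proc_data.h. The following is a minimal standalone sketch, not OCC code, illustrating the bit convention (core 0 occupies the most significant bit) and how the two checks evaluate; the present-cores value is an assumption for illustration, while the constants mirror the header changes at the end of the diff.

/* Minimal sketch (not OCC code) of the offline-mask convention used here. */
#include <stdint.h>
#include <stdio.h>

#define CORE0_PRESENT_MASK        0x80000000u   /* core 0 = most significant bit */
#define QUAD0_CORES_PRESENT_MASK  0xf0000000u   /* cores 0-3 (quad 0)            */

static uint32_t G_present_cores     = 0xffffff00u; /* assumed: 24 cores present  */
static uint32_t G_core_offline_mask = 0x00000000u; /* no cores offline initially */

#define CORE_OFFLINE(core)  ((CORE0_PRESENT_MASK >> (core)) & G_core_offline_mask)
#define QUAD_ONLINE(quad)   ((QUAD0_CORES_PRESENT_MASK >> ((quad) * 4)) & \
                             ((~G_core_offline_mask) & G_present_cores))

int main(void)
{
    /* Mark all four cores of quad 0 offline (e.g. chiplet in stop 2 or greater) */
    G_core_offline_mask |= QUAD0_CORES_PRESENT_MASK;

    printf("core 0 offline? %s\n", CORE_OFFLINE(0) ? "yes" : "no");  /* yes */
    printf("quad 0 online?  %s\n", QUAD_ONLINE(0)  ? "yes" : "no");  /* no  */
    printf("quad 1 online?  %s\n", QUAD_ONLINE(1)  ? "yes" : "no");  /* yes */
    return 0;
}

With a whole quad offline, control loops skip its cores and the 2A11 health-monitor log is suppressed, which is the behavior the commit title describes.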
diff --git a/src/occ_405/amec/amec_dps.c b/src/occ_405/amec/amec_dps.c
index 6f5ff15..e41783a 100755
--- a/src/occ_405/amec/amec_dps.c
+++ b/src/occ_405/amec/amec_dps.c
@@ -84,7 +84,7 @@ void amec_dps_update_core_util(void)
// Update moving average of util_slack and util_active for all cores
for(l_idx=0; l_idx<MAX_NUM_CORES; l_idx++)
{
- if (!CORE_PRESENT(l_idx))
+ if (!CORE_PRESENT(l_idx) || CORE_OFFLINE(l_idx))
{
continue; //nothing to do if the core's disabled
}
@@ -351,7 +351,7 @@ void amec_dps_main(void)
for (l_idx=0; l_idx<MAX_NUM_CORES; l_idx++)
{
// Find the first valid core and send its frequency
- if (CORE_PRESENT(l_idx))
+ if (CORE_PRESENT(l_idx) && !CORE_OFFLINE(l_idx))
{
G_dcom_slv_outbox_tx.factual =
AMECSENSOR_ARRAY_PTR(FREQREQC0,l_idx)->sample;
diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c
index 9150328..7b02ba7 100755
--- a/src/occ_405/amec/amec_freq.c
+++ b/src/occ_405/amec/amec_freq.c
@@ -387,7 +387,7 @@ void amec_slv_proc_voting_box(void)
for (k=0; k<MAX_NUM_CORES; k++)
{
- if(CORE_PRESENT(k))
+ if( CORE_PRESENT(k) && !CORE_OFFLINE(k) )
{
l_core_freq = l_chip_fmax;
l_core_reason = l_chip_reason;
@@ -556,11 +556,12 @@ void amec_slv_proc_voting_box(void)
{
g_amec->proc[0].core_max_freq = l_core_freq;
}
- }
+ } // if core present and not offline
else
{
- l_core_freq = 0;
- l_core_reason = 0;
+ //Set f_request to 0 so this core is ignored in amec_slv_freq_smh()
+ g_amec->proc[0].core[k].f_request = 0;
+ g_amec->proc[0].core[k].f_reason = 0;
}
}//End of for loop
@@ -600,7 +601,8 @@ void amec_slv_freq_smh(void)
uint8_t core_idx = 0; // loop through cores within each quad
Pstate pmax[MAXIMUM_QUADS] = {0}; // max pstate (min frequency) within each quad
Pstate pmax_chip = 0; // highest Pstate (lowest frequency) across all quads
- bool l_atLeast1Core[MAXIMUM_QUADS] = {FALSE}; // at least 1 core present in quad
+ bool l_atLeast1Core[MAXIMUM_QUADS] = {FALSE}; // at least 1 core present and online in quad
+ bool l_atLeast1Quad = FALSE; // at least 1 quad online
static bool L_mfg_set_trace[MAXIMUM_QUADS] = {FALSE};
static bool L_mfg_clear_trace[MAXIMUM_QUADS] = {FALSE};
@@ -619,6 +621,7 @@ void amec_slv_freq_smh(void)
if(g_amec->proc[0].core[core_num].f_request != 0)
{
l_atLeast1Core[quad] = TRUE;
+ l_atLeast1Quad = TRUE;
// The higher the pstate number, the lower the frequency
if(pmax[quad] < proc_freq2pstate(g_amec->proc[0].core[core_num].f_request))
{
@@ -630,45 +633,49 @@ void amec_slv_freq_smh(void)
}
}
- // check for mfg quad Pstate request and set Pstate for each quad
- for (quad = 0; quad < MAXIMUM_QUADS; quad++)
+ // Skip determining new frequency if all cores in all quads are offline
+ if(l_atLeast1Quad)
{
- // set quad with no cores present to lowest frequency for the chip
- if(l_atLeast1Core[quad] == FALSE)
- pmax[quad] = pmax_chip;
-
- // check if there is a mnfg Pstate request for this quad
- if(g_amec->mnfg_parms.quad_pstate[quad] != 0xFF)
+ // check for mfg quad Pstate request and set Pstate for each quad
+ for (quad = 0; quad < MAXIMUM_QUADS; quad++)
{
- // use mnfg request if it is a lower frequency (higher pState)
- if(g_amec->mnfg_parms.quad_pstate[quad] > pmax[quad])
- pmax[quad] = g_amec->mnfg_parms.quad_pstate[quad];
+ // set quad with no cores present to lowest frequency for the chip
+ if(l_atLeast1Core[quad] == FALSE)
+ pmax[quad] = pmax_chip;
- if(L_mfg_clear_trace[quad] == FALSE)
- L_mfg_set_trace[quad] = TRUE;
- }
- else if(L_mfg_clear_trace[quad] == TRUE)
- {
- TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request cleared. New Pstate = 0x%02x", quad, pmax[quad]);
- L_mfg_clear_trace[quad] = FALSE;
- }
+ // check if there is a mnfg Pstate request for this quad
+ if(g_amec->mnfg_parms.quad_pstate[quad] != 0xFF)
+ {
+ // use mnfg request if it is a lower frequency (higher pState)
+ if(g_amec->mnfg_parms.quad_pstate[quad] > pmax[quad])
+ pmax[quad] = g_amec->mnfg_parms.quad_pstate[quad];
+
+ if(L_mfg_clear_trace[quad] == FALSE)
+ L_mfg_set_trace[quad] = TRUE;
+ }
+ else if(L_mfg_clear_trace[quad] == TRUE)
+ {
+ TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request cleared. New Pstate = 0x%02x", quad, pmax[quad]);
+ L_mfg_clear_trace[quad] = FALSE;
+ }
#ifdef PROC_DEBUG
- if (G_desired_pstate[quad] != pmax[quad])
- {
- TRAC_IMP("Updating Quad %d's Pstate to %d", quad, pmax[quad]);
- }
+ if (G_desired_pstate[quad] != pmax[quad])
+ {
+ TRAC_IMP("Updating Quad %d's Pstate to %d", quad, pmax[quad]);
+ }
#endif
- // update quad pstate request
- G_desired_pstate[quad] = pmax[quad];
+ // update quad pstate request
+ G_desired_pstate[quad] = pmax[quad];
- if(L_mfg_set_trace[quad] == TRUE)
- {
- TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request set = 0x%02x", quad, pmax[quad]);
- L_mfg_set_trace[quad] = FALSE;
- L_mfg_clear_trace[quad] = TRUE;
+ if(L_mfg_set_trace[quad] == TRUE)
+ {
+ TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request set = 0x%02x", quad, pmax[quad]);
+ L_mfg_set_trace[quad] = FALSE;
+ L_mfg_clear_trace[quad] = TRUE;
+ }
}
- }
+ } // if at least 1 core online
}
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index f0b2609..60d5a81 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -920,7 +920,7 @@ void amec_health_check_proc_timeout()
{
for(i=0; i<MAX_NUM_CORES; i++)
{
- if(!CORE_PRESENT(i))
+ if(!CORE_PRESENT(i) || CORE_OFFLINE(i))
{
// If this core is not present, move on
continue;
diff --git a/src/occ_405/amec/amec_sensors_core.c b/src/occ_405/amec/amec_sensors_core.c
index d2eddec..b723366 100755
--- a/src/occ_405/amec/amec_sensors_core.c
+++ b/src/occ_405/amec/amec_sensors_core.c
@@ -74,9 +74,14 @@ void amec_calc_droop_sensors(CoreData * i_core_data_ptr, uint8_t i_core);
void amec_update_proc_core_sensors(uint8_t i_core)
{
CoreData *l_core_data_ptr;
- uint16_t l_temp16 = 0;
uint32_t l_temp32 = 0;
+ uint16_t l_core_temp = 0;
+ uint16_t l_temp16 = 0;
+ uint16_t l_core_util = 0;
+ uint16_t l_core_freq = 0;
+ uint16_t l_time_interval = 0;
uint8_t i = 0;
+ uint8_t l_quad = i_core / 4; // Quad this core resides in
// Make sure the core is present, and that it has updated data.
if(CORE_PRESENT(i_core) && CORE_UPDATED(i_core))
@@ -95,8 +100,8 @@ void amec_update_proc_core_sensors(uint8_t i_core)
//-------------------------------------------------------
// Util / Freq
//-------------------------------------------------------
- // Skip this update if there was an empath collection error
- if (!CORE_EMPATH_ERROR(i_core))
+ // Skip this update if there was an empath collection error or if previously offline
+ if (!CORE_EMPATH_ERROR(i_core) && !CORE_OFFLINE(i_core))
{
amec_calc_freq_and_util_sensors(l_core_data_ptr,i_core);
}
@@ -105,13 +110,16 @@ void amec_update_proc_core_sensors(uint8_t i_core)
// Performance counter - This function should be called
// after amec_calc_freq_and_util_sensors().
//-------------------------------------------------------
- amec_calc_dps_util_counters(i_core);
+ if(!CORE_OFFLINE(i_core))
+ {
+ amec_calc_dps_util_counters(i_core);
+ }
//-------------------------------------------------------
// IPS
//-------------------------------------------------------
// Skip this update if there was an empath collection error
- if (!CORE_EMPATH_ERROR(i_core))
+ if (!CORE_EMPATH_ERROR(i_core) && !CORE_OFFLINE(i_core))
{
amec_calc_ips_sensors(l_core_data_ptr,i_core);
}
@@ -162,7 +170,83 @@ void amec_update_proc_core_sensors(uint8_t i_core)
l_temp16 = (uint16_t)(G_dcom_slv_inbox_doorbell_rx.tod>>45);
// hi 3 bits in 0.796 day resolution with 512MHz TOD clock
sensor_update( AMECSENSOR_PTR(TODclock2), l_temp16);
- }
+
+ // The core must be online since its data was just updated, so now that the sensors have
+ // been updated, clear the core offline bit for this core. Clearing it before the temperature
+ // sensors are updated could cause a false processor timeout error in the health monitor
+ CLEAR_CORE_OFFLINE(i_core);
+ } // if core present and updated
+
+ else if(CORE_OFFLINE(i_core))
+ {
+ // core wasn't updated due to being offline, update sensors accordingly
+
+ // Determine the "core" temperature that will be returned in the poll for fan control:
+ // if at least 1 core is online within the same quad, use the quad temp; else use the nest temp
+ if(QUAD_ONLINE(l_quad))
+ {
+ l_core_temp = AMECSENSOR_ARRAY_PTR(TEMPQ0, l_quad)->sample;
+ }
+ else
+ {
+ l_core_temp = getSensorByGsid(TEMPNEST)->sample;
+ }
+ if(l_core_temp)
+ {
+ sensor_update(AMECSENSOR_ARRAY_PTR(TEMPPROCTHRMC0,i_core), l_core_temp);
+ }
+
+ // Update utilization and frequency sensors to 0
+ sensor_update(AMECSENSOR_ARRAY_PTR(NUTILC0, i_core), 0);
+ sensor_update(AMECSENSOR_ARRAY_PTR(UTILC0, i_core), 0);
+ sensor_update(AMECSENSOR_ARRAY_PTR(IPSC0, i_core), 0);
+ sensor_update(AMECSENSOR_ARRAY_PTR(NOTBZEC0, i_core), 0);
+ sensor_update(AMECSENSOR_ARRAY_PTR(NOTFINC0, i_core), 0);
+ sensor_update(AMECSENSOR_ARRAY_PTR(FREQAC0, i_core), 0);
+ for(i=0; i<MAX_THREADS_PER_CORE; i++)
+ {
+ g_amec->proc[0].core[i_core].thread[i].util4ms_thread = 0;
+ }
+
+ // Make updates for rolling average
+ // Determine the time interval for the rolling average calculation
+ l_time_interval = AMEC_DPS_SAMPLING_RATE * AMEC_IPS_AVRG_INTERVAL;
+
+ // Increment sample count
+ if(g_amec->proc[0].core[i_core].sample_count < UINT16_MAX)
+ {
+ g_amec->proc[0].core[i_core].sample_count++;
+ }
+
+ if(g_amec->proc[0].core[i_core].sample_count == l_time_interval)
+ {
+ // Increase resolution of the UTIL accumulator by two decimal places
+ l_temp32 = (uint32_t)AMECSENSOR_ARRAY_PTR(UTILC0,i_core)->accumulator * 100;
+ // Calculate average utilization of this core
+ l_temp32 = l_temp32 / g_amec->proc[0].core[i_core].sample_count;
+ g_amec->proc[0].core[i_core].avg_util = l_temp32;
+
+ // Increase resolution of the FREQA accumulator by two decimal places
+ l_temp32 = (uint32_t)AMECSENSOR_ARRAY_PTR(FREQAC0,i_core)->accumulator * 100;
+ // Calculate average frequency of this core
+ l_temp32 = l_temp32 / g_amec->proc[0].core[i_core].sample_count;
+ g_amec->proc[0].core[i_core].avg_freq = l_temp32;
+ }
+ else if(g_amec->proc[0].core[i_core].sample_count > l_time_interval)
+ {
+ // Calculate average utilization for this core
+ l_temp32 = (uint32_t) g_amec->proc[0].core[i_core].avg_util;
+ l_temp32 = l_temp32 * (l_time_interval-1);
+ l_temp32 = l_temp32 + l_core_util*100;
+ g_amec->proc[0].core[i_core].avg_util = l_temp32 / l_time_interval;
+
+ // Calculate average frequency for this core
+ l_temp32 = (uint32_t) g_amec->proc[0].core[i_core].avg_freq;
+ l_temp32 = l_temp32 * (l_time_interval-1);
+ l_temp32 = l_temp32 + l_core_freq*100;
+ g_amec->proc[0].core[i_core].avg_freq = l_temp32 / l_time_interval;
+ }
+ } // else if core offline
}
// Function Specification
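In the offline branch above, the new utilization/frequency sample is forced to 0, so once the sample count exceeds the averaging interval the stored avg_util and avg_freq decay toward zero via avg = (avg*(N-1) + sample*100) / N, where the *100 keeps two extra decimal places of resolution. A minimal sketch of that arithmetic, not OCC code, with an assumed interval value standing in for AMEC_DPS_SAMPLING_RATE * AMEC_IPS_AVRG_INTERVAL:

/* Minimal sketch (not OCC code) of the rolling-average decay for an offline core. */
#include <stdint.h>
#include <stdio.h>

#define AVG_INTERVAL 16u   /* assumed stand-in for the real averaging interval */

/* avg carries two extra decimal places (value * 100), like avg_util/avg_freq */
static uint32_t rolling_avg_update(uint32_t avg, uint16_t sample)
{
    uint32_t acc = avg * (AVG_INTERVAL - 1);   /* weight the previous average   */
    acc += (uint32_t)sample * 100;             /* add the new, scaled sample    */
    return acc / AVG_INTERVAL;                 /* renormalize                   */
}

int main(void)
{
    uint32_t avg_util = 7500;  /* 75.00% utilization before the core went offline */
    for (int tick = 0; tick < 5; tick++)
    {
        avg_util = rolling_avg_update(avg_util, 0);  /* offline core reports 0 */
        printf("tick %d: avg_util = %u.%02u%%\n", tick,
               (unsigned)(avg_util / 100), (unsigned)(avg_util % 100));
    }
    return 0;
}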
diff --git a/src/occ_405/proc/proc_data.c b/src/occ_405/proc/proc_data.c
index 6f25118..e8e9eba 100755
--- a/src/occ_405/proc/proc_data.c
+++ b/src/occ_405/proc/proc_data.c
@@ -86,6 +86,9 @@ uint32_t G_updated_core_mask = 0;
// without error.
uint32_t G_empath_error_core_mask = 0;
+//AMEC needs to know which cores are offline
+uint32_t G_core_offline_mask = 0;
+
//Global G_present_cores is bitmask of all cores
//(1 = present, 0 = not present. Core 0 has the most significant bit)
uint32_t G_present_cores = 0;
@@ -128,6 +131,7 @@ void task_core_data( task_t * i_task )
CoreData * l_temp = NULL; // Used for pointer swapping
bulk_core_data_task_t * l_bulk_core_data_ptr = (bulk_core_data_task_t *)i_task->data_ptr;
ipc_core_data_parms_t * l_parms = (ipc_core_data_parms_t*)(l_bulk_core_data_ptr->gpe_req.cmd_data);
+ static uint32_t L_trace_core_failure = 0;
do
{
@@ -160,8 +164,8 @@ void task_core_data( task_t * i_task )
//A request is not considered complete until both the engine job
//has finished without error and any callback has run to completion.
- if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request)
- &&
+ if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request) &&
+ (l_parms->error.rc == 0) &&
CORE_PRESENT(l_bulk_core_data_ptr->current_core) )
{
//If the previous GPE request succeeded then swap core_data_ptr
@@ -184,7 +188,7 @@ void task_core_data( task_t * i_task )
//Core data has been collected so set the bit in global mask.
//AMEC code will know which cores to update sensors for. AMEC is
//responsible for clearing the bit later on.
- G_updated_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core);
+ G_updated_core_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));
// Presumptively clear the empath error mask
G_empath_error_core_mask &=
@@ -197,6 +201,24 @@ void task_core_data( task_t * i_task )
{
G_core_data_ptrs[l_bulk_core_data_ptr->current_core] = &G_core_data[MAX_NUM_FW_CORES+NUM_CORE_DATA_DOUBLE_BUF+NUM_CORE_DATA_EMPTY_BUF-1];
}
+ else if(l_parms->error.rc != 0)
+ {
+ // Check if failure is due to being offline (in stop 2 or greater)
+ if(l_parms->error.ffdc == PCB_ERROR_CHIPLET_OFFLINE)
+ {
+ // Mark the core offline so it is ignored in control loops and the health monitor does not log an error
+ G_core_offline_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));
+ }
+ else if( !(L_trace_core_failure & (1 << l_bulk_core_data_ptr->current_core)) )
+ {
+ // Trace the error; if it persists the health monitor will detect and log it
+ INTR_TRAC_ERR("task_core_data: core %d data collection failed RC[0x%08X] FFDC[0x%08X%08X]",
+ l_bulk_core_data_ptr->current_core, l_parms->error.rc,
+ (uint32_t)(l_parms->error.ffdc >> 32),
+ (uint32_t)l_parms->error.ffdc);
+ L_trace_core_failure |= (1 << l_bulk_core_data_ptr->current_core);
+ }
+ }
//Update current core
if ( l_bulk_core_data_ptr->current_core >= l_bulk_core_data_ptr->end_core )
@@ -217,6 +239,8 @@ void task_core_data( task_t * i_task )
//1. Setup the get core data parms
l_parms->core_num = l_bulk_core_data_ptr->current_core;
l_parms->data = (CoreData*) l_bulk_core_data_ptr->core_data_ptr;
+ l_parms->error.error = 0; // default no error
+ l_parms->error.ffdc = 0;
// Static array to record the last timestamp a get_per_core_data task was
// scheduled for a core.
@@ -243,13 +267,6 @@ void task_core_data( task_t * i_task )
{
G_get_per_core_data_max_schedule_intervals[l_current_core] = l_elapsed_us;
}
- // Also sniff if the request has actually completed, it is checked above but
- // the schedule proceeds regardless which could be dangerous...
- if (!async_request_completed(&l_bulk_core_data_ptr->gpe_req.request))
- {
- INTR_TRAC_ERR("Async get_per_core_data task for core=%d not complete!",
- l_current_core);
- }
}
//2. Schedule the GPE Request to get the core data
diff --git a/src/occ_405/proc/proc_data.h b/src/occ_405/proc/proc_data.h
index ebef6d0..f8b7870 100755
--- a/src/occ_405/proc/proc_data.h
+++ b/src/occ_405/proc/proc_data.h
@@ -50,9 +50,10 @@
#define NUM_CORE_DATA_DOUBLE_BUF 2
#define NUM_CORE_DATA_EMPTY_BUF 1
-#define LO_CORES_MASK 0xfff00000
-#define HI_CORES_MASK 0x000fff00
-#define HW_CORES_MASK 0xffffff00
+#define LO_CORES_MASK 0xfff00000
+#define HI_CORES_MASK 0x000fff00
+#define HW_CORES_MASK 0xffffff00
+#define QUAD0_CORES_PRESENT_MASK 0xf0000000
enum eOccProcCores
{
@@ -103,6 +104,9 @@ extern uint32_t G_present_cores;
//AMEC needs to know when data for a core has been collected.
extern uint32_t G_updated_core_mask;
+//AMEC needs to know when a core is offline
+extern uint32_t G_core_offline_mask;
+
// External reference to empath error mask
extern uint32_t G_empath_error_core_mask;
@@ -120,6 +124,17 @@ extern bool G_nest_dts_data_valid;
#define CORE_EMPATH_ERROR(occ_core_id) \
((CORE0_PRESENT_MASK >> occ_core_id) & G_empath_error_core_mask)
+// Evaluates to true if the specified core is offline
+#define CORE_OFFLINE(occ_core_id) \
+ ((CORE0_PRESENT_MASK >> occ_core_id) & G_core_offline_mask)
+
+#define CLEAR_CORE_OFFLINE(occ_core_id) \
+ G_core_offline_mask &= ~(CORE0_PRESENT_MASK >> occ_core_id)
+
+// Evaluates to true if the specified quad has at least 1 core that is present and not offline
+#define QUAD_ONLINE(occ_quad_id) \
+ ( (QUAD0_CORES_PRESENT_MASK >> (occ_quad_id*4)) & ((~G_core_offline_mask) & G_present_cores) )
+
//Collect bulk core data for all cores in specified range
void task_core_data( task_t * i_task );