diff options
author | mbroyles <mbroyles@us.ibm.com> | 2017-11-01 08:44:03 -0500 |
---|---|---|
committer | Christopher J. Cain <cjcain@us.ibm.com> | 2017-11-03 15:16:58 -0400 |
commit | bb703e413c363255a578ffdb35e954a5b71de4c9 (patch) | |
tree | 7c804d5d9b1b6853b832526cefdfa391e1bf3e48 | |
parent | d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102 (diff) | |
download | talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.tar.gz talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.zip |
Prevent logging 2A11 when quad is offline
Change-Id: Ic8cda9a6b8b311057ba2c4f0d9dc7e228e700c27
CQ: SW404469
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/49093
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
-rwxr-xr-x | src/occ_405/amec/amec_dps.c | 4 |
-rwxr-xr-x | src/occ_405/amec/amec_freq.c | 79 |
-rwxr-xr-x | src/occ_405/amec/amec_health.c | 2 |
-rwxr-xr-x | src/occ_405/amec/amec_sensors_core.c | 96 |
-rwxr-xr-x | src/occ_405/proc/proc_data.c | 37 |
-rwxr-xr-x | src/occ_405/proc/proc_data.h | 21 |
6 files changed, 181 insertions, 58 deletions
diff --git a/src/occ_405/amec/amec_dps.c b/src/occ_405/amec/amec_dps.c index 6f5ff15..e41783a 100755 --- a/src/occ_405/amec/amec_dps.c +++ b/src/occ_405/amec/amec_dps.c @@ -84,7 +84,7 @@ void amec_dps_update_core_util(void) // Update moving average of util_slack and util_active for all cores for(l_idx=0; l_idx<MAX_NUM_CORES; l_idx++) { - if (!CORE_PRESENT(l_idx)) + if (!CORE_PRESENT(l_idx) || CORE_OFFLINE(l_idx)) { continue; //nothing to do if the core's disabled } @@ -351,7 +351,7 @@ void amec_dps_main(void) for (l_idx=0; l_idx<MAX_NUM_CORES; l_idx++) { // Find the first valid core and send its frequency - if (CORE_PRESENT(l_idx)) + if (CORE_PRESENT(l_idx) && !CORE_OFFLINE(l_idx)) { G_dcom_slv_outbox_tx.factual = AMECSENSOR_ARRAY_PTR(FREQREQC0,l_idx)->sample; diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c index 9150328..7b02ba7 100755 --- a/src/occ_405/amec/amec_freq.c +++ b/src/occ_405/amec/amec_freq.c @@ -387,7 +387,7 @@ void amec_slv_proc_voting_box(void) for (k=0; k<MAX_NUM_CORES; k++) { - if(CORE_PRESENT(k)) + if( CORE_PRESENT(k) && !CORE_OFFLINE(k) ) { l_core_freq = l_chip_fmax; l_core_reason = l_chip_reason; @@ -556,11 +556,12 @@ void amec_slv_proc_voting_box(void) { g_amec->proc[0].core_max_freq = l_core_freq; } - } + } // if core present and not offline else { - l_core_freq = 0; - l_core_reason = 0; + //Set f_request to 0 so this core is ignored in amec_slv_freq_smh() + g_amec->proc[0].core[k].f_request = 0; + g_amec->proc[0].core[k].f_reason = 0; } }//End of for loop @@ -600,7 +601,8 @@ void amec_slv_freq_smh(void) uint8_t core_idx = 0; // loop through cores within each quad Pstate pmax[MAXIMUM_QUADS] = {0}; // max pstate (min frequency) within each quad Pstate pmax_chip = 0; // highest Pstate (lowest frequency) across all quads - bool l_atLeast1Core[MAXIMUM_QUADS] = {FALSE}; // at least 1 core present in quad + bool l_atLeast1Core[MAXIMUM_QUADS] = {FALSE}; // at least 1 core present and online in quad + bool l_atLeast1Quad = 
FALSE; // at least 1 quad online static bool L_mfg_set_trace[MAXIMUM_QUADS] = {FALSE}; static bool L_mfg_clear_trace[MAXIMUM_QUADS] = {FALSE}; @@ -619,6 +621,7 @@ void amec_slv_freq_smh(void) if(g_amec->proc[0].core[core_num].f_request != 0) { l_atLeast1Core[quad] = TRUE; + l_atLeast1Quad = TRUE; // The higher the pstate number, the lower the frequency if(pmax[quad] < proc_freq2pstate(g_amec->proc[0].core[core_num].f_request)) { @@ -630,45 +633,49 @@ void amec_slv_freq_smh(void) } } - // check for mfg quad Pstate request and set Pstate for each quad - for (quad = 0; quad < MAXIMUM_QUADS; quad++) + // Skip determining new frequency if all cores in all quads are offline + if(l_atLeast1Quad) { - // set quad with no cores present to lowest frequency for the chip - if(l_atLeast1Core[quad] == FALSE) - pmax[quad] = pmax_chip; - - // check if there is a mnfg Pstate request for this quad - if(g_amec->mnfg_parms.quad_pstate[quad] != 0xFF) + // check for mfg quad Pstate request and set Pstate for each quad + for (quad = 0; quad < MAXIMUM_QUADS; quad++) { - // use mnfg request if it is a lower frequency (higher pState) - if(g_amec->mnfg_parms.quad_pstate[quad] > pmax[quad]) - pmax[quad] = g_amec->mnfg_parms.quad_pstate[quad]; + // set quad with no cores present to lowest frequency for the chip + if(l_atLeast1Core[quad] == FALSE) + pmax[quad] = pmax_chip; - if(L_mfg_clear_trace[quad] == FALSE) - L_mfg_set_trace[quad] = TRUE; - } - else if(L_mfg_clear_trace[quad] == TRUE) - { - TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request cleared. 
New Pstate = 0x%02x", quad, pmax[quad]); - L_mfg_clear_trace[quad] = FALSE; - } + // check if there is a mnfg Pstate request for this quad + if(g_amec->mnfg_parms.quad_pstate[quad] != 0xFF) + { + // use mnfg request if it is a lower frequency (higher pState) + if(g_amec->mnfg_parms.quad_pstate[quad] > pmax[quad]) + pmax[quad] = g_amec->mnfg_parms.quad_pstate[quad]; + + if(L_mfg_clear_trace[quad] == FALSE) + L_mfg_set_trace[quad] = TRUE; + } + else if(L_mfg_clear_trace[quad] == TRUE) + { + TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request cleared. New Pstate = 0x%02x", quad, pmax[quad]); + L_mfg_clear_trace[quad] = FALSE; + } #ifdef PROC_DEBUG - if (G_desired_pstate[quad] != pmax[quad]) - { - TRAC_IMP("Updating Quad %d's Pstate to %d", quad, pmax[quad]); - } + if (G_desired_pstate[quad] != pmax[quad]) + { + TRAC_IMP("Updating Quad %d's Pstate to %d", quad, pmax[quad]); + } #endif - // update quad pstate request - G_desired_pstate[quad] = pmax[quad]; + // update quad pstate request + G_desired_pstate[quad] = pmax[quad]; - if(L_mfg_set_trace[quad] == TRUE) - { - TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request set = 0x%02x", quad, pmax[quad]); - L_mfg_set_trace[quad] = FALSE; - L_mfg_clear_trace[quad] = TRUE; + if(L_mfg_set_trace[quad] == TRUE) + { + TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request set = 0x%02x", quad, pmax[quad]); + L_mfg_set_trace[quad] = FALSE; + L_mfg_clear_trace[quad] = TRUE; + } } - } + } // if at least 1 core online } diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index f0b2609..60d5a81 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -920,7 +920,7 @@ void amec_health_check_proc_timeout() { for(i=0; i<MAX_NUM_CORES; i++) { - if(!CORE_PRESENT(i)) + if(!CORE_PRESENT(i) || CORE_OFFLINE(i)) { // If this core is not present, move on continue; diff --git a/src/occ_405/amec/amec_sensors_core.c b/src/occ_405/amec/amec_sensors_core.c index d2eddec..b723366 100755 --- 
a/src/occ_405/amec/amec_sensors_core.c +++ b/src/occ_405/amec/amec_sensors_core.c @@ -74,9 +74,14 @@ void amec_calc_droop_sensors(CoreData * i_core_data_ptr, uint8_t i_core); void amec_update_proc_core_sensors(uint8_t i_core) { CoreData *l_core_data_ptr; - uint16_t l_temp16 = 0; uint32_t l_temp32 = 0; + uint16_t l_core_temp = 0; + uint16_t l_temp16 = 0; + uint16_t l_core_util = 0; + uint16_t l_core_freq = 0; + uint16_t l_time_interval = 0; uint8_t i = 0; + uint8_t l_quad = i_core / 4; // Quad this core resides in // Make sure the core is present, and that it has updated data. if(CORE_PRESENT(i_core) && CORE_UPDATED(i_core)) @@ -95,8 +100,8 @@ void amec_update_proc_core_sensors(uint8_t i_core) //------------------------------------------------------- // Util / Freq //------------------------------------------------------- - // Skip this update if there was an empath collection error - if (!CORE_EMPATH_ERROR(i_core)) + // Skip this update if there was an empath collection error or if previously offline + if (!CORE_EMPATH_ERROR(i_core) && !CORE_OFFLINE(i_core)) { amec_calc_freq_and_util_sensors(l_core_data_ptr,i_core); } @@ -105,13 +110,16 @@ void amec_update_proc_core_sensors(uint8_t i_core) // Performance counter - This function should be called // after amec_calc_freq_and_util_sensors(). 
//------------------------------------------------------- - amec_calc_dps_util_counters(i_core); + if(!CORE_OFFLINE(i_core)) + { + amec_calc_dps_util_counters(i_core); + } //------------------------------------------------------- // IPS //------------------------------------------------------- // Skip this update if there was an empath collection error - if (!CORE_EMPATH_ERROR(i_core)) + if (!CORE_EMPATH_ERROR(i_core) && !CORE_OFFLINE(i_core)) { amec_calc_ips_sensors(l_core_data_ptr,i_core); } @@ -162,7 +170,83 @@ void amec_update_proc_core_sensors(uint8_t i_core) l_temp16 = (uint16_t)(G_dcom_slv_inbox_doorbell_rx.tod>>45); // hi 3 bits in 0.796 day resolution with 512MHz TOD clock sensor_update( AMECSENSOR_PTR(TODclock2), l_temp16); - } + + // Core must be online that it was updated and now that the sensors have been updated make sure + // the core offline bit is off for this core. Clearing this prior to updating the temperature + // sensors may result in a false processor timeout error in health monitor + CLEAR_CORE_OFFLINE(i_core); + } // if core present and updated + + else if(CORE_OFFLINE(i_core)) + { + // core wasn't updated due to being offline, update sensors accordingly + + // Determine "core" temperature that will be returned in the poll for fan control + // If there is at least 1 core online within the same quad use the quad temp else use the nest + if(QUAD_ONLINE(l_quad)) + { + l_core_temp = AMECSENSOR_ARRAY_PTR(TEMPQ0, l_quad)->sample; + } + else + { + l_core_temp = getSensorByGsid(TEMPNEST)->sample; + } + if(l_core_temp) + { + sensor_update(AMECSENSOR_ARRAY_PTR(TEMPPROCTHRMC0,i_core), l_core_temp); + } + + // Update utilization and frequency sensors to 0 + sensor_update(AMECSENSOR_ARRAY_PTR(NUTILC0, i_core), 0); + sensor_update(AMECSENSOR_ARRAY_PTR(UTILC0, i_core), 0); + sensor_update(AMECSENSOR_ARRAY_PTR(IPSC0, i_core), 0); + sensor_update(AMECSENSOR_ARRAY_PTR(NOTBZEC0, i_core), 0); + sensor_update(AMECSENSOR_ARRAY_PTR(NOTFINC0, i_core), 0); + 
sensor_update(AMECSENSOR_ARRAY_PTR(FREQAC0, i_core), 0); + for(i=0; i<MAX_THREADS_PER_CORE; i++) + { + g_amec->proc[0].core[i_core].thread[i].util4ms_thread = 0; + } + + // Make updates for rolling average + // Determine the time interval for the rolling average calculation + l_time_interval = AMEC_DPS_SAMPLING_RATE * AMEC_IPS_AVRG_INTERVAL; + + // Increment sample count + if(g_amec->proc[0].core[i_core].sample_count < UINT16_MAX) + { + g_amec->proc[0].core[i_core].sample_count++; + } + + if(g_amec->proc[0].core[i_core].sample_count == l_time_interval) + { + // Increase resolution of the UTIL accumulator by two decimal places + l_temp32 = (uint32_t)AMECSENSOR_ARRAY_PTR(UTILC0,i_core)->accumulator * 100; + // Calculate average utilization of this core + l_temp32 = l_temp32 / g_amec->proc[0].core[i_core].sample_count; + g_amec->proc[0].core[i_core].avg_util = l_temp32; + + // Increase resolution of the FREQA accumulator by two decimal places + l_temp32 = (uint32_t)AMECSENSOR_ARRAY_PTR(FREQAC0,i_core)->accumulator * 100; + // Calculate average frequency of this core + l_temp32 = l_temp32 / g_amec->proc[0].core[i_core].sample_count; + g_amec->proc[0].core[i_core].avg_freq = l_temp32; + } + else if(g_amec->proc[0].core[i_core].sample_count > l_time_interval) + { + // Calculate average utilization for this core + l_temp32 = (uint32_t) g_amec->proc[0].core[i_core].avg_util; + l_temp32 = l_temp32 * (l_time_interval-1); + l_temp32 = l_temp32 + l_core_util*100; + g_amec->proc[0].core[i_core].avg_util = l_temp32 / l_time_interval; + + // Calculate average frequency for this core + l_temp32 = (uint32_t) g_amec->proc[0].core[i_core].avg_freq; + l_temp32 = l_temp32 * (l_time_interval-1); + l_temp32 = l_temp32 + l_core_freq*100; + g_amec->proc[0].core[i_core].avg_freq = l_temp32 / l_time_interval; + } + } // else if core offline } // Function Specification diff --git a/src/occ_405/proc/proc_data.c b/src/occ_405/proc/proc_data.c index 6f25118..e8e9eba 100755 --- 
a/src/occ_405/proc/proc_data.c +++ b/src/occ_405/proc/proc_data.c @@ -86,6 +86,9 @@ uint32_t G_updated_core_mask = 0; // without error. uint32_t G_empath_error_core_mask = 0; +//AMEC needs to know cores that are offline +uint32_t G_core_offline_mask = 0; + //Global G_present_cores is bitmask of all cores //(1 = present, 0 = not present. Core 0 has the most significant bit) uint32_t G_present_cores = 0; @@ -128,6 +131,7 @@ void task_core_data( task_t * i_task ) CoreData * l_temp = NULL; // Used for pointer swapping bulk_core_data_task_t * l_bulk_core_data_ptr = (bulk_core_data_task_t *)i_task->data_ptr; ipc_core_data_parms_t * l_parms = (ipc_core_data_parms_t*)(l_bulk_core_data_ptr->gpe_req.cmd_data); + static uint32_t L_trace_core_failure = 0; do { @@ -160,8 +164,8 @@ void task_core_data( task_t * i_task ) //A request is not considered complete until both the engine job //has finished without error and any callback has run to completion. - if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request) - && + if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request) && + (l_parms->error.rc == 0) && CORE_PRESENT(l_bulk_core_data_ptr->current_core) ) { //If the previous GPE request succeeded then swap core_data_ptr @@ -184,7 +188,7 @@ void task_core_data( task_t * i_task ) //Core data has been collected so set the bit in global mask. //AMEC code will know which cores to update sensors for. AMEC is //responsible for clearing the bit later on. 
- G_updated_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core); + G_updated_core_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core)); // Presumptively clear the empath error mask G_empath_error_core_mask &= @@ -197,6 +201,24 @@ void task_core_data( task_t * i_task ) { G_core_data_ptrs[l_bulk_core_data_ptr->current_core] = &G_core_data[MAX_NUM_FW_CORES+NUM_CORE_DATA_DOUBLE_BUF+NUM_CORE_DATA_EMPTY_BUF-1]; } + else if(l_parms->error.rc != 0) + { + // Check if failure is due to being offline (in stop 2 or greater) + if(l_parms->error.ffdc == PCB_ERROR_CHIPLET_OFFLINE) + { + // Mark core offline so it is ignored in control loops and to avoid health monitor logging error + G_core_offline_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core)); + } + else if( !(L_trace_core_failure & (1 << l_bulk_core_data_ptr->current_core)) ) + { + // trace error, if it continues health monitor will see and log error + INTR_TRAC_ERR("task_core_data: core %d data collection failed RC[0x%08X] FFDC[0x%08X%08X]", + l_bulk_core_data_ptr->current_core, l_parms->error.rc, + (uint32_t)(l_parms->error.ffdc >> 32), + (uint32_t)l_parms->error.ffdc); + L_trace_core_failure |= (1 << l_bulk_core_data_ptr->current_core); + } + } //Update current core if ( l_bulk_core_data_ptr->current_core >= l_bulk_core_data_ptr->end_core ) @@ -217,6 +239,8 @@ void task_core_data( task_t * i_task ) //1. Setup the get core data parms l_parms->core_num = l_bulk_core_data_ptr->current_core; l_parms->data = (CoreData*) l_bulk_core_data_ptr->core_data_ptr; + l_parms->error.error = 0; // default no error + l_parms->error.ffdc = 0; // Static array to record the last timestamp a get_per_core_data task was // scheduled for a core. 
@@ -243,13 +267,6 @@ void task_core_data( task_t * i_task ) { G_get_per_core_data_max_schedule_intervals[l_current_core] = l_elapsed_us; } - // Also sniff if the request has actually completed, it is checked above but - // the schedule proceeds regardless which could be dangerous... - if (!async_request_completed(&l_bulk_core_data_ptr->gpe_req.request)) - { - INTR_TRAC_ERR("Async get_per_core_data task for core=%d not complete!", - l_current_core); - } } //2. Schedule the GPE Request to get the core data diff --git a/src/occ_405/proc/proc_data.h b/src/occ_405/proc/proc_data.h index ebef6d0..f8b7870 100755 --- a/src/occ_405/proc/proc_data.h +++ b/src/occ_405/proc/proc_data.h @@ -50,9 +50,10 @@ #define NUM_CORE_DATA_DOUBLE_BUF 2 #define NUM_CORE_DATA_EMPTY_BUF 1 -#define LO_CORES_MASK 0xfff00000 -#define HI_CORES_MASK 0x000fff00 -#define HW_CORES_MASK 0xffffff00 +#define LO_CORES_MASK 0xfff00000 +#define HI_CORES_MASK 0x000fff00 +#define HW_CORES_MASK 0xffffff00 +#define QUAD0_CORES_PRESENT_MASK 0xf0000000 enum eOccProcCores { @@ -103,6 +104,9 @@ extern uint32_t G_present_cores; //AMEC needs to know when data for a core has been collected. 
extern uint32_t G_updated_core_mask; +//AMEC needs to know when a core is offline +extern uint32_t G_core_offline_mask; + // External reference to empath error mask extern uint32_t G_empath_error_core_mask; @@ -120,6 +124,17 @@ extern bool G_nest_dts_data_valid; #define CORE_EMPATH_ERROR(occ_core_id) \ ((CORE0_PRESENT_MASK >> occ_core_id) & G_empath_error_core_mask) +// Evaluates to true if the specified core is offline +#define CORE_OFFLINE(occ_core_id) \ + ((CORE0_PRESENT_MASK >> occ_core_id) & G_core_offline_mask) + +#define CLEAR_CORE_OFFLINE(occ_core_id) \ + G_core_offline_mask &= ~(CORE0_PRESENT_MASK >> occ_core_id) + +// Evaluates to true if the specified quad has at least 1 active present core +#define QUAD_ONLINE(occ_quad_id) \ + ( (QUAD0_CORES_PRESENT_MASK >> (occ_quad_id*4)) & ((~G_core_offline_mask) & G_present_cores) ) + //Collect bulk core data for all cores in specified range void task_core_data( task_t * i_task ); |