From 2397cb606cda005ae0fdd8455a827450fb4d8b4f Mon Sep 17 00:00:00 2001 From: mbroyles Date: Wed, 10 Jan 2018 16:18:59 -0600 Subject: Handle PGPE timeouts as workaround for prolonged droop events Add "CLIP" information to poll response Fix incorrectly throttling due to power when all cores are in stop 2 or greater Change-Id: I502cc65ad8c4cffd7f9a1442fd4de185f3cac6e2 RTC: 183700 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/51741 Tested-by: FSP CI Jenkins Reviewed-by: Christopher J. Cain Reviewed-by: Andres A. Lugo-Reyes Reviewed-by: Martha Broyles --- src/include/registers/ocb_firmware_registers.h | 6 +- src/occ_405/amec/amec_controller.c | 29 ++- src/occ_405/amec/amec_freq.c | 56 +++++- src/occ_405/amec/amec_master_smh.c | 202 ++++++++++--------- src/occ_405/amec/amec_sys.h | 4 + src/occ_405/cmdh/cmdh_fsp_cmds.c | 22 ++ src/occ_405/cmdh/cmdh_fsp_cmds.h | 1 + src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c | 31 +-- src/occ_405/cmdh/cmdh_snapshot.c | 8 - src/occ_405/common.c | 28 +++ src/occ_405/common.h | 3 + src/occ_405/errl/errl.h | 4 + src/occ_405/occ_service_codes.h | 2 +- src/occ_405/occbuildname.c | 2 +- src/occ_405/pgpe/pgpe_interface.c | 268 +++++++++++++------------ src/occ_405/proc/proc_data_control.c | 128 ++++++++---- src/occ_405/state.c | 206 +++++++++++-------- src/occ_405/wof/wof.c | 76 ++++++- 18 files changed, 682 insertions(+), 394 deletions(-) (limited to 'src') diff --git a/src/include/registers/ocb_firmware_registers.h b/src/include/registers/ocb_firmware_registers.h index 010ad02..82e94d5 100644 --- a/src/include/registers/ocb_firmware_registers.h +++ b/src/include/registers/ocb_firmware_registers.h @@ -1414,7 +1414,8 @@ typedef union ocb_occflg { uint32_t gpu0_reset_status : 1; uint32_t gpu1_reset_status : 1; uint32_t gpu2_reset_status : 1; - uint32_t reserved_occ : 3; + uint32_t reserved_occ : 2; + uint32_t pm_reset_suppress : 1; uint32_t wof_hcode_mode : 2; uint32_t active_quad_update : 1; uint32_t request_occ_safe : 1; @@ -1422,7 +1423,8 
@@ typedef union ocb_occflg { uint32_t request_occ_safe : 1; uint32_t active_quad_update : 1; uint32_t wof_hcode_mode : 2; - uint32_t reserved_occ : 3; + uint32_t pm_reset_suppress : 1; + uint32_t reserved_occ : 2; uint32_t gpu2_reset_status : 1; uint32_t gpu1_reset_status : 1; uint32_t gpu0_reset_status : 1; diff --git a/src/occ_405/amec/amec_controller.c b/src/occ_405/amec/amec_controller.c index f310208..530f53f 100644 --- a/src/occ_405/amec/amec_controller.c +++ b/src/occ_405/amec/amec_controller.c @@ -457,16 +457,25 @@ uint16_t amec_controller_speed2freq (const uint16_t i_speed, const uint16_t i_fm /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ - l_temp16 = i_fmax; - l_tempreg = (uint16_t)i_speed; - l_temp32 = ((uint32_t)l_tempreg)*((uint32_t)l_temp16); - l_temp16 = (uint16_t)1000; - l_divide32[1] = (uint32_t)l_temp16; - l_divide32[0] = (uint32_t)l_temp32; - l_divide32[0] /= l_divide32[1]; - l_temp32 = l_divide32[0]; - l_freq = (uint16_t)l_temp32; /* freq will always fit in 16 bits */ - + // to handle max freq changing (i.e. 
mode change) between now and running amec_slv_proc_voting_box + // if speed is unconstrained set freq to unconstrained so voting box will use + // the most recent maximum frequency + if(i_speed >= g_amec->sys.max_speed) + { + l_freq = 0xFFFF; + } + else + { + l_temp16 = i_fmax; + l_tempreg = (uint16_t)i_speed; + l_temp32 = ((uint32_t)l_tempreg)*((uint32_t)l_temp16); + l_temp16 = (uint16_t)1000; + l_divide32[1] = (uint32_t)l_temp16; + l_divide32[0] = (uint32_t)l_temp32; + l_divide32[0] /= l_divide32[1]; + l_temp32 = l_divide32[0]; + l_freq = (uint16_t)l_temp32; /* freq will always fit in 16 bits */ + } return l_freq; } diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c index d1e8aad..51b04dc 100755 --- a/src/occ_405/amec/amec_freq.c +++ b/src/occ_405/amec/amec_freq.c @@ -282,6 +282,10 @@ void amec_slv_proc_voting_box(void) uint16_t k = 0; uint16_t l_chip_fmax = g_amec->sys.fmax; uint16_t l_core_freq = 0; + uint16_t l_core_freq_max = 0; // max freq across all cores + uint16_t l_core_freq_min = g_amec->sys.fmax; // min freq across all cores + uint32_t l_current_reason = 0; // used for debug purposes + static uint32_t L_last_reason = 0; // used for debug purposes uint32_t l_chip_reason = 0; uint32_t l_core_reason = 0; amec_proc_voting_reason_t l_kvm_throt_reason = NO_THROTTLE; @@ -306,9 +310,6 @@ void amec_slv_proc_voting_box(void) // This function implements the voting box to decide which input gets the right // to actuate the system. - //Reset the maximum core frequency requested prior to recalculation. 
- g_amec->proc[0].core_max_freq = 0; - // PPB_FMAX if(g_amec->proc[0].pwr_votes.ppb_fmax < l_chip_fmax) { @@ -516,6 +517,12 @@ void amec_slv_proc_voting_box(void) //STORE core frequency and reason g_amec->proc[0].core[k].f_request = l_core_freq; g_amec->proc[0].core[k].f_reason = l_core_reason; + if(l_core_freq < l_core_freq_min) + { + // store the new lowest frequency and reason to be used after all cores checked + l_core_freq_min = l_core_freq; + l_current_reason = l_core_reason; + } // Update the Amester parameter telling us the reason. Needed for // parameter array. @@ -557,9 +564,9 @@ void amec_slv_proc_voting_box(void) } #endif - if(l_core_freq > g_amec->proc[0].core_max_freq) + if(l_core_freq > l_core_freq_max) { - g_amec->proc[0].core_max_freq = l_core_freq; + l_core_freq_max = l_core_freq; } } // if core present and not offline else @@ -570,6 +577,15 @@ void amec_slv_proc_voting_box(void) } }//End of for loop + // update max core frequency if not 0 i.e. all cores offline (stop 2 or greater) + // this is used by power capping alg, updating to 0 will cause power throttling when not needed + if(l_core_freq_max) + { + g_amec->proc[0].core_max_freq = l_core_freq_max; + // update the overall reason driving frequency across all cores + g_amec->proc[0].f_reason = l_current_reason; + } + //check if there was a throttle reason change if(l_kvm_throt_reason != G_amec_opal_proc_throt_reason) { @@ -582,6 +598,36 @@ void amec_slv_proc_voting_box(void) ssx_semaphore_post(&G_dcomThreadWakeupSem); } } + // For debug... 
if lower than max update vars returned in poll response to give clipping reason + g_amec->proc[0].core_min_freq = l_core_freq_min; + if(l_core_freq_min < g_amec->sys.fmax) + { + if(l_current_reason == L_last_reason) + { + // same reason INC counter + if(g_amec->proc[0].current_clip_count != 0xFF) + { + g_amec->proc[0].current_clip_count++; + } + } + else + { + // new reason update history and set counter to 1 + L_last_reason = l_current_reason; + g_amec->proc[0].current_clip_count = 1; + if( (g_amec->proc[0].chip_f_reason_history & l_current_reason) == 0) + { + g_amec->proc[0].chip_f_reason_history |= l_current_reason; + TRAC_IMP("First time throttling for reason[0x%08X] History[0x%08X] freq = %d", + l_current_reason, g_amec->proc[0].chip_f_reason_history, l_core_freq_min); + } + } + } + else // no active clipping + { + L_last_reason = 0; + g_amec->proc[0].current_clip_count = 0; + } } // Function Specification diff --git a/src/occ_405/amec/amec_master_smh.c b/src/occ_405/amec/amec_master_smh.c index 586a465..33d6c13 100755 --- a/src/occ_405/amec/amec_master_smh.c +++ b/src/occ_405/amec/amec_master_smh.c @@ -40,6 +40,7 @@ #include "dcom.h" #include #include // For G_apss_ch_to_function +#include "common.h" // For ignore_pgpe_error() //*************************************************************************/ // Externs @@ -399,110 +400,121 @@ void amec_mst_check_under_pcap(void) /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ - - // Check if done everything possible to shed power and power still above a hard power cap - // ppb_fmax = Fmin and PWRSYS > Node power cap and - // Node power cap >= hard_min_pcap AND memory is throttled - if((g_amec->proc[0].pwr_votes.ppb_fmax == g_amec->sys.fmin) && - (AMECSENSOR_PTR(PWRSYS)->sample > g_amec->pcap.active_node_pcap) && - (g_amec->pcap.active_node_pcap >= G_sysConfigData.pcap.hard_min_pcap) && - 
(g_amec->pcap.active_mem_level != 0) ) + do { - - G_over_cap_count++; - - // GPUs take longer for power limit to take effect if GPUs are present need to use - // a longer wait time before logging an error and resetting - if( ( (!G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_FAILURE_THRESHOLD) ) || - ( (G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_GPU_FAILURE_THRESHOLD) ) ) + // Check if done everything possible to shed power and power still above a hard power cap + // ppb_fmax = Fmin and PWRSYS > Node power cap and + // Node power cap >= hard_min_pcap AND memory is throttled + if((g_amec->proc[0].pwr_votes.ppb_fmax == g_amec->sys.fmin) && + (AMECSENSOR_PTR(PWRSYS)->sample > g_amec->pcap.active_node_pcap) && + (g_amec->pcap.active_node_pcap >= G_sysConfigData.pcap.hard_min_pcap) && + (g_amec->pcap.active_mem_level != 0) ) { - TRAC_ERR("Failure to maintain power cap: Power Cap = %d ," - "PWRSYS = %d",g_amec->pcap.active_node_pcap, - AMECSENSOR_PTR(PWRSYS)->sample); + // Check if we are to ignore pgpe errors meaning the PGPE cannot set frequency which could + // cause this over power event. 
This will not cover if a different OCC is not able to shed + // power due to PGPE which would require to add this status to occ-occ communication + if(ignore_pgpe_error()) + { + // make sure count is cleared to give time for frequency to be set once PGPE can set it + G_over_cap_count = 0; + INCREMENT_ERR_HISTORY(ERRH_OVER_PCAP_IGNORED); + break; + } - // Trace power per APSS channel to have the best breakdown for debug - // compress traces to 4 max to save space on OP systems - for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++) + G_over_cap_count++; + + // GPUs take longer for power limit to take effect if GPUs are present need to use + // a longer wait time before logging an error and resetting + if( ( (!G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_FAILURE_THRESHOLD) ) || + ( (G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_GPU_FAILURE_THRESHOLD) ) ) { - l_apss_func_id = G_apss_ch_to_function[i]; + TRAC_ERR("Failure to maintain power cap: Power Cap = %d ," + "PWRSYS = %d",g_amec->pcap.active_node_pcap, + AMECSENSOR_PTR(PWRSYS)->sample); - if((l_apss_func_id != ADC_RESERVED) && - (l_apss_func_id != ADC_12V_SENSE) && - (l_apss_func_id != ADC_GND_REMOTE_SENSE) && - (l_apss_func_id != ADC_12V_STANDBY_CURRENT) ) + // Trace power per APSS channel to have the best breakdown for debug + // compress traces to 4 max to save space on OP systems + for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++) { - l_trace[l_trace_idx] = (i << 24) | (l_apss_func_id << 16) | (AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample); - l_trace_idx++; + l_apss_func_id = G_apss_ch_to_function[i]; + + if((l_apss_func_id != ADC_RESERVED) && + (l_apss_func_id != ADC_12V_SENSE) && + (l_apss_func_id != ADC_GND_REMOTE_SENSE) && + (l_apss_func_id != ADC_12V_STANDBY_CURRENT) ) + { + l_trace[l_trace_idx] = (i << 24) | (l_apss_func_id << 16) | (AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample); + l_trace_idx++; + } + } + while(l_trace_idx != 0) + { + if(l_trace_idx >=4) + { + TRAC_ERR("APSS channel/FuncID/Power: [%08X], [%08X], 
[%08X], [%08X]", + l_trace[l_trace_idx-1], l_trace[l_trace_idx-2], l_trace[l_trace_idx-3], l_trace[l_trace_idx-4]); + l_trace_idx -= 4; + } + else if(l_trace_idx == 3) + { + TRAC_ERR("APSS channel/FuncID/Power: [%08X], [%08X], [%08X]", + l_trace[l_trace_idx-1], l_trace[l_trace_idx-2], l_trace[l_trace_idx-3]); + l_trace_idx = 0; + } + else if(l_trace_idx == 2) + { + TRAC_ERR("APSS channel/FuncID/Power: [%08X], [%08X]", + l_trace[l_trace_idx-1], l_trace[l_trace_idx-2]); + l_trace_idx = 0; + } + else // l_trace_idx == 1 + { + TRAC_ERR("APSS channel/FuncID/Power: [%08X]", + l_trace[l_trace_idx-1]); + l_trace_idx = 0; + } } - } - while(l_trace_idx != 0) - { - if(l_trace_idx >=4) - { - TRAC_ERR("APSS channel/FuncID/Power: [%08X], [%08X], [%08X], [%08X]", - l_trace[l_trace_idx-1], l_trace[l_trace_idx-2], l_trace[l_trace_idx-3], l_trace[l_trace_idx-4]); - l_trace_idx -= 4; - } - else if(l_trace_idx == 3) - { - TRAC_ERR("APSS channel/FuncID/Power: [%08X], [%08X], [%08X]", - l_trace[l_trace_idx-1], l_trace[l_trace_idx-2], l_trace[l_trace_idx-3]); - l_trace_idx = 0; - } - else if(l_trace_idx == 2) - { - TRAC_ERR("APSS channel/FuncID/Power: [%08X], [%08X]", - l_trace[l_trace_idx-1], l_trace[l_trace_idx-2]); - l_trace_idx = 0; - } - else // l_trace_idx == 1 - { - TRAC_ERR("APSS channel/FuncID/Power: [%08X]", - l_trace[l_trace_idx-1]); - l_trace_idx = 0; - } - } - /* @ - * @errortype - * @moduleid AMEC_MST_CHECK_UNDER_PCAP - * @reasoncode POWER_CAP_FAILURE - * @userdata1 Power Cap - * @userdata2 PWRSYS (Node Power) - * @devdesc Failure to maintain max power limits - * - */ - l_err = createErrl( AMEC_MST_CHECK_UNDER_PCAP, - POWER_CAP_FAILURE, - ERC_AMEC_UNDER_PCAP_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - g_amec->pcap.active_node_pcap, - AMECSENSOR_PTR(PWRSYS)->sample); - - //Callout to firmware - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); - - //Callout to APSS - 
addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.apss_huid, - ERRL_CALLOUT_PRIORITY_HIGH); - - //Reset OCC - REQUEST_RESET(l_err); + /* @ + * @errortype + * @moduleid AMEC_MST_CHECK_UNDER_PCAP + * @reasoncode POWER_CAP_FAILURE + * @userdata1 Power Cap + * @userdata2 PWRSYS (Node Power) + * @devdesc Failure to maintain max power limits + * + */ + l_err = createErrl( AMEC_MST_CHECK_UNDER_PCAP, + POWER_CAP_FAILURE, + ERC_AMEC_UNDER_PCAP_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + g_amec->pcap.active_node_pcap, + AMECSENSOR_PTR(PWRSYS)->sample); + + //Callout to firmware + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + + //Callout to APSS + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.apss_huid, + ERRL_CALLOUT_PRIORITY_HIGH); + + //Reset OCC + REQUEST_RESET(l_err); + } } - } - else - { - // Clear counter - G_over_cap_count = 0; - } - + else + { + // Clear counter + G_over_cap_count = 0; + } + }while(0); return; } diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index e86a000..d253889 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -535,6 +535,10 @@ typedef struct // Calculations & Interim Data uint16_t core_max_freq; // Maximum requested freq for all cores on chip. + uint16_t core_min_freq; // for debug. Minimum requested freq for all cores on chip. + uint8_t current_clip_count; // for debug. #consecutive ticks core_max_freq is below max possible for same reason + uint32_t chip_f_reason_history; // for debug. bit mask history of all frequency reason(s) for the chip + uint32_t f_reason; // for debug. 
current reason across all cores driving the lowest f request // Parameters used through Amester interface // Note: keep core arrays here, not in per-cores structure so one parameter diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 90c6d3b..b1205f2 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -682,6 +682,28 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) } l_sensorHeader.count++; + l_extnSensorList[l_sensorHeader.count].name = EXTN_NAME_CLIP; + // get Pstate for the current minimum maximum frequency OCC is allowing + // actual frequency is driven down by the lowest max frequency across all cores + freq = g_amec->proc[0].core_min_freq; + if (freq > 0) + { + l_extnSensorList[l_sensorHeader.count].data[0] = proc_freq2pstate(freq); + } + else + { + l_extnSensorList[l_sensorHeader.count].data[0] = 0xFF; + } + + // current counter will be 0 if not currently clipping + l_extnSensorList[l_sensorHeader.count].data[1] = g_amec->proc[0].current_clip_count; + // clip history reason + l_extnSensorList[l_sensorHeader.count].data[2] = CONVERT_UINT32_UINT8_UPPER_HIGH(g_amec->proc[0].chip_f_reason_history); + l_extnSensorList[l_sensorHeader.count].data[3] = CONVERT_UINT32_UINT8_UPPER_LOW(g_amec->proc[0].chip_f_reason_history); + l_extnSensorList[l_sensorHeader.count].data[4] = CONVERT_UINT32_UINT8_LOWER_HIGH(g_amec->proc[0].chip_f_reason_history); + l_extnSensorList[l_sensorHeader.count].data[5] = CONVERT_UINT32_UINT8_LOWER_LOW(g_amec->proc[0].chip_f_reason_history); + l_sensorHeader.count++; + // add any non-0 error history counts for(l_err_hist_idx=0; l_err_hist_idx < ERR_HISTORY_SIZE; l_err_hist_idx++) { diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h index feb424a..ae85339 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.h +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h @@ -69,6 +69,7 @@ typedef enum #define EXTN_NAME_FNOM 0x464E4F4D // "FNOM" #define EXTN_NAME_FTURBO 
0x46540000 // "FT" #define EXTN_NAME_FUTURBO 0x46555400 // "FUT" +#define EXTN_NAME_CLIP 0x434C4950 // "CLIP" #define EXTN_NAME_ERRHIST 0x45525248 // "ERRH" #define MAX_EXTN_SENSORS 32 diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c index e60f3c7..950466e 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c @@ -260,6 +260,7 @@ errlHndl_t data_store_freq_data(const cmdh_fsp_cmd_t * i_cmd_ptr, uint32_t l_mode_data_sz; uint16_t l_freq = 0; uint16_t l_table[OCC_MODE_COUNT] = {0}; + uint16_t l_pgpe_max_freq_mhz = (G_oppb.frequency_max_khz / 1000); do { @@ -303,13 +304,13 @@ errlHndl_t data_store_freq_data(const cmdh_fsp_cmd_t * i_cmd_ptr, break; } - // This should never happen but verify that nominal frequency is <= G_proc_fmax_mhz - if(l_freq > G_proc_fmax_mhz) + // This should never happen but verify that nominal frequency is <= OPPB max + if(l_freq > l_pgpe_max_freq_mhz) { CMDH_TRAC_ERR("Nominal Frequency[%d] (MHz)) is higher than " - "G_proc_fmax_mhz[%d], clipping Nominal Frequency", - l_freq, G_proc_fmax_mhz); - l_freq = G_proc_fmax_mhz; + "OPPB max[%d], clipping Nominal Frequency", + l_freq, l_pgpe_max_freq_mhz); + l_freq = l_pgpe_max_freq_mhz; } l_table[OCC_MODE_NOMINAL] = l_freq; @@ -325,13 +326,13 @@ errlHndl_t data_store_freq_data(const cmdh_fsp_cmd_t * i_cmd_ptr, l_table[OCC_MODE_NOMINAL]); l_freq = l_table[OCC_MODE_NOMINAL]; } - // Verify that turbo frequency is <= G_proc_fmax_mhz - else if(l_freq > G_proc_fmax_mhz) + // Verify that turbo frequency is <= OPPB max + else if(l_freq > l_pgpe_max_freq_mhz) { CMDH_TRAC_ERR("Turbo Frequency[%d] (MHz)) is higher than " - "G_proc_fmax_mhz[%d], clip Turbo Frequency", - l_freq, G_proc_fmax_mhz); - l_freq = G_proc_fmax_mhz; + "OPPB max[%d], clip Turbo Frequency", + l_freq, l_pgpe_max_freq_mhz); + l_freq = l_pgpe_max_freq_mhz; } l_table[OCC_MODE_TURBO] = l_freq; CMDH_TRAC_INFO("Turbo frequency = %d MHz", l_freq); @@ 
-351,13 +352,13 @@ errlHndl_t data_store_freq_data(const cmdh_fsp_cmd_t * i_cmd_ptr, // Bytes 9-10 Ultr Turbo Frequency Point l_freq = (l_buf[6] << 8 | l_buf[7]); - // Verify that ultra turbo frequency is <= G_proc_fmax_mhz - if(l_freq > G_proc_fmax_mhz) + // Verify that ultra turbo frequency is <= OPPB max + if(l_freq > l_pgpe_max_freq_mhz) { CMDH_TRAC_ERR("Ultra Turbo Frequency[%d] (MHz) is higher than PGPE's " - "Max freq (G_proc_fmax_mhz[%d]) clip Ultra Turbo Frequency", - l_freq, G_proc_fmax_mhz); - l_freq = G_proc_fmax_mhz; + "Max freq (OPPB max[%d]) clip Ultra Turbo Frequency", + l_freq, l_pgpe_max_freq_mhz); + l_freq = l_pgpe_max_freq_mhz; } // Check if (H)TMGT will let WOF run, else clear flags diff --git a/src/occ_405/cmdh/cmdh_snapshot.c b/src/occ_405/cmdh/cmdh_snapshot.c index ecb9afc..b52858f 100755 --- a/src/occ_405/cmdh/cmdh_snapshot.c +++ b/src/occ_405/cmdh/cmdh_snapshot.c @@ -74,7 +74,6 @@ VOID cmdh_snapshot_find_oldest_newest(uint8_t *o_oldest, (g_cmdh_snapshot_array[0].current_id == 0) && (g_cmdh_snapshot_array[CMDH_SNAPSHOT_MAX_INDEX].current_id == 0)) { - TRAC_INFO("cmdh_snapshot_find_oldest_newest: Entry 0 is the oldest and newest"); *o_oldest = 0; *o_newest = 0; break; @@ -141,7 +140,6 @@ ERRL_RC cmdh_snapshot_buffer_nonite(const cmdh_fsp_cmd_t *i_cmd_ptr, // Check case where there are no snapshot buffers available. 
if (g_cmdh_snapshot_cur_index == CMDH_SNAPSHOT_DEFAULT_CUR_INDEX) { - TRAC_INFO("cmdh_snapshot_buffer_nonite: No snapshot buffer available."); break; } @@ -170,8 +168,6 @@ ERRL_RC cmdh_snapshot_buffer_nonite(const cmdh_fsp_cmd_t *i_cmd_ptr, if (i == CMDH_SNAPSHOT_MAX) { - TRAC_INFO("cmdh_snapshot_buffer_nonite: Requested buffer:%u not found so sending back %u", - l_cmd_ptr->requested_id, l_rsp_ptr->newest_id); l_req_idx = l_newest; } } @@ -319,8 +315,6 @@ errlHndl_t cmdh_snapshot_sync(const cmdh_fsp_cmd_t * i_cmd_ptr, break; } - TRAC_INFO("cmdh_snapshot_sync: Snapshot buffer has been reset!"); - l_resp_ptr->data_length[0] = 0; l_resp_ptr->data_length[1] = 0; G_rsp_status = 0; @@ -370,8 +364,6 @@ void cmdh_snapshot_callback(void * arg) if (g_cmdh_snapshot_reset) { - TRAC_INFO("cmdh_snapshot_callback: Initializing snapshot buffer and data."); - memset(g_cmdh_snapshot_array, 0, sizeof(g_cmdh_snapshot_array)); g_cmdh_snapshot_cur_id = 0; memset(L_cim_buf,0,sizeof(cmdh_snapshot_buffer_t)); diff --git a/src/occ_405/common.c b/src/occ_405/common.c index fd031e2..e7eb6d7 100755 --- a/src/occ_405/common.c +++ b/src/occ_405/common.c @@ -246,4 +246,32 @@ bool notify_host(const ext_intr_reason_t i_reason) return notify_success; } +// Called prior to logging any error related to the PGPE or Pstate control +// i.e. PGPE communication, maintaining power cap... +// During prolonged droop events the PGPE can be non-responsive and don't have frequency control so doing a pm reset will +// not help. 
The PGPE will set a bit in the OCC FLAGS register to indicate when in this condition for the OCC to ignore errors +// Returns true if the error should be ignored +bool ignore_pgpe_error(void) +{ + static bool L_last_ignore_error = false; + bool l_ignore_error = false; + ocb_occflg_t occ_flags = {0}; + + // Check if the bit to ignore errors is set in the OCC Flags register + occ_flags.value = in32(OCB_OCCFLG); + + if (occ_flags.fields.pm_reset_suppress == 1) + { + l_ignore_error = true; + } + + // Trace if this is a change from the last time this was called + if (L_last_ignore_error != l_ignore_error) + { + TRAC_ERR("ignore_pgpe_error: OCCFLG pm_reset_suppress was %d and is now %d", L_last_ignore_error, l_ignore_error); + L_last_ignore_error = l_ignore_error; + } + + return l_ignore_error; +} diff --git a/src/occ_405/common.h b/src/occ_405/common.h index 787af35..06ab5df 100644 --- a/src/occ_405/common.h +++ b/src/occ_405/common.h @@ -49,4 +49,7 @@ void task_misc_405_checks(task_t *i_self); // Returns true if notification was sent, false if interrupt already outstanding bool notify_host(const ext_intr_reason_t i_reason); +// Returns true if PGPE error should be ignored +bool ignore_pgpe_error(void); + #endif // _common_h diff --git a/src/occ_405/errl/errl.h b/src/occ_405/errl/errl.h index 1218b92..042841e 100755 --- a/src/occ_405/errl/errl.h +++ b/src/occ_405/errl/errl.h @@ -295,6 +295,10 @@ typedef enum { ERRH_24X7_DISABLED = 0x18, ERRH_CEFF_RATIO_VDD_EXCURSION = 0x19, ERRH_AVSBUS_VDD_TEMPERATURE = 0x1A, + ERRH_OVER_PCAP_IGNORED = 0x1B, + ERRH_VFRT_TIMEOUT_IGNORED = 0x1C, + ERRH_WOF_CONTROL_TIMEOUT_IGNORED = 0x1D, + ERRH_PSTATE_CHANGE_IGNORED = 0x1E, ERR_HISTORY_SIZE = 0x20 } ERR_HISTORY_INDEX; diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h index 1c00381..f2d3417 100644 --- a/src/occ_405/occ_service_codes.h +++ b/src/occ_405/occ_service_codes.h @@ -287,7 +287,7 @@ enum occExtReasonCode ERC_GPU_READ_PWR_LIMIT_FAILURE = 0x0101, 
ERC_GPU_SET_PWR_LIMIT_FAILURE = 0x0102, - ERC_STATE_FROM_ALL_TO_STB_FAILURE = 0x0123, + ERC_STATE_FROM_OBS_TO_ACT_FAILURE = 0x0123, ERC_STATE_FROM_ACT_TO_CHR_FAILURE = 0x0124, ERC_STATE_FROM_CHR_TO_ACT_FAILURE = 0x0125, ERC_STATE_FROM_CHR_TO_OBS_FAILURE = 0x0126, diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c index a6e6ec5..9248044 100755 --- a/src/occ_405/occbuildname.c +++ b/src/occ_405/occbuildname.c @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = #else -volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /**/ "op_occ_171215a\0" /**/ ; +volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /**/ "op_occ_180115a\0" /**/ ; #endif diff --git a/src/occ_405/pgpe/pgpe_interface.c b/src/occ_405/pgpe/pgpe_interface.c index eaaa340..13e1957 100644 --- a/src/occ_405/pgpe/pgpe_interface.c +++ b/src/occ_405/pgpe/pgpe_interface.c @@ -37,6 +37,7 @@ #include "ssx.h" #include "wof.h" #include "amec_sys.h" +#include "common.h" // For ignore_pgpe_error() // Maximum waiting time (usec) for clip update IPC task #define CLIP_UPDATE_TIMEOUT 100 // maximum waiting time (usec) for clip update IPC task @@ -420,26 +421,29 @@ int pgpe_set_clip_blocking(Pstate i_pstate) if(wait_time > CLIP_UPDATE_TIMEOUT) { // an earlier clip update IPC call has not completed, trace and log an error - TRAC_ERR("pgpe_set_clip_blocking: clip update IPC task is not Idle"); - - /* - * @errortype - * @moduleid PGPE_SET_CLIP_BLOCKING_MOD - * @reasoncode PGPE_FAILURE - * @userdata4 ERC_PGPE_CLIP_NOT_IDLE - * @devdesc pgpe clip update not idle - */ - err = createErrl( - PGPE_SET_CLIP_BLOCKING_MOD, //ModId - PGPE_FAILURE, //Reasoncode - ERC_PGPE_CLIP_NOT_IDLE, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - 0, //Userdata1 - 0 //Userdata2 - ); - + // only trace and log an error if we are not to ignore + 
if(!ignore_pgpe_error()) + { + TRAC_ERR("pgpe_set_clip_blocking: clip update IPC task is not Idle"); + + /* + * @errortype + * @moduleid PGPE_SET_CLIP_BLOCKING_MOD + * @reasoncode PGPE_FAILURE + * @userdata4 ERC_PGPE_CLIP_NOT_IDLE + * @devdesc pgpe clip update not idle + */ + err = createErrl( + PGPE_SET_CLIP_BLOCKING_MOD, //ModId + PGPE_FAILURE, //Reasoncode + ERC_PGPE_CLIP_NOT_IDLE, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + 0, //Userdata1 + 0 //Userdata2 + ); + } rc = PGPE_FAILURE; break; } @@ -475,26 +479,29 @@ int pgpe_set_clip_blocking(Pstate i_pstate) { if(wait_time > CLIP_UPDATE_TIMEOUT) { - TRAC_ERR("pgpe_set_clip_blocking: clip update IPC task timeout!"); - - /* - * @errortype - * @moduleid PGPE_SET_CLIP_BLOCKING_MOD - * @reasoncode GPE_REQUEST_TASK_TIMEOUT - * @userdata4 OCC_NO_EXTENDED_RC - * @devdesc pgpe clip update timeout - */ - err = createErrl( - PGPE_SET_CLIP_BLOCKING_MOD, //ModId - GPE_REQUEST_TASK_TIMEOUT, //Reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - 0, //Userdata1 - 0 //Userdata2 - ); - + // only trace and log an error if we are not to ignore + if(!ignore_pgpe_error()) + { + TRAC_ERR("pgpe_set_clip_blocking: clip update IPC task timeout!"); + + /* + * @errortype + * @moduleid PGPE_SET_CLIP_BLOCKING_MOD + * @reasoncode GPE_REQUEST_TASK_TIMEOUT + * @userdata4 OCC_NO_EXTENDED_RC + * @devdesc pgpe clip update timeout + */ + err = createErrl( + PGPE_SET_CLIP_BLOCKING_MOD, //ModId + GPE_REQUEST_TASK_TIMEOUT, //Reasoncode + OCC_NO_EXTENDED_RC, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + 0, //Userdata1 + 0 //Userdata2 + ); + } rc = GPE_REQUEST_TASK_TIMEOUT; break; } @@ -514,7 +521,7 @@ int pgpe_set_clip_blocking(Pstate i_pstate) // IPC task completed. 
check for errors if ( G_clip_update_parms.msg_cb.rc != PGPE_RC_SUCCESS ) { - // clip update IPC call has not completed, trace and log an error + // clip update IPC call failed, trace and log an error TRAC_ERR("pgpe_set_clip_blocking: clip update IPC task " "returned an error [0x%08X]", G_clip_update_parms.msg_cb.rc); @@ -542,10 +549,11 @@ int pgpe_set_clip_blocking(Pstate i_pstate) } } while (0); + // request reset if(err) { REQUEST_RESET(err); - } + } return(rc); } @@ -577,35 +585,39 @@ int pgpe_clip_update(void) { if(l_wait_time > CLIP_UPDATE_TIMEOUT) { - // an earlier clip update IPC call has not completed, trace and log an error - TRAC_ERR("pgpe_clip_update: clip update IPC task is not Idle"); - - /* - * @errortype - * @moduleid PGPE_CLIP_UPDATE_MOD - * @reasoncode PGPE_FAILURE - * @userdata1 0 - * @userdata4 ERC_PGPE_CLIP_NOT_IDLE - * @devdesc pgpe clip update not idle - */ - err = createErrl( - PGPE_CLIP_UPDATE_MOD, //ModId - PGPE_FAILURE, //Reasoncode - ERC_PGPE_CLIP_NOT_IDLE, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - 0, //Userdata1 - 0 //Userdata2 - ); + // only trace and log an error if we are not to ignore + if(!ignore_pgpe_error()) + { + // an earlier clip update IPC call has not completed, trace and log an error + TRAC_ERR("pgpe_clip_update: clip update IPC task is not Idle"); + + /* + * @errortype + * @moduleid PGPE_CLIP_UPDATE_MOD + * @reasoncode PGPE_FAILURE + * @userdata1 0 + * @userdata4 ERC_PGPE_CLIP_NOT_IDLE + * @devdesc pgpe clip update not idle + */ + err = createErrl( + PGPE_CLIP_UPDATE_MOD, //ModId + PGPE_FAILURE, //Reasoncode + ERC_PGPE_CLIP_NOT_IDLE, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + 0, //Userdata1 + 0 //Userdata2 + ); - // Callout firmware - addCalloutToErrl(err, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); + // Callout firmware + 
addCalloutToErrl(err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); - commitErrl(&err); + commitErrl(&err); + } ext_rc = ERC_PGPE_CLIP_NOT_IDLE; break; @@ -765,35 +777,38 @@ int pgpe_start_suspend(uint8_t action, PMCR_OWNER owner) // be idle when called. if(!async_request_is_idle(&G_start_suspend_req.request)) { - TRAC_ERR("pgpe_start_suspend: Start suspend task NOT Idle"); - - /* - * @errortype - * @moduleid PGPE_START_SUSPEND_MOD - * @reasoncode PGPE_FAILURE - * @userdata1 0 - * @userdata4 ERC_PGPE_START_SUSPEND_NOT_IDLE - * @devdesc pgpe start suspend task not idle - */ - err = createErrl( - PGPE_START_SUSPEND_MOD, //ModId - PGPE_FAILURE, //Reasoncode - ERC_PGPE_START_SUSPEND_NOT_IDLE, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - 0, //Userdata1 - 0 //Userdata2 - ); + // only trace and log an error if we are not to ignore + if(!ignore_pgpe_error()) + { + TRAC_ERR("pgpe_start_suspend: Start suspend task NOT Idle"); - // Callout firmware - addCalloutToErrl(err, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); + /* + * @errortype + * @moduleid PGPE_START_SUSPEND_MOD + * @reasoncode PGPE_FAILURE + * @userdata1 0 + * @userdata4 ERC_PGPE_START_SUSPEND_NOT_IDLE + * @devdesc pgpe start suspend task not idle + */ + err = createErrl( + PGPE_START_SUSPEND_MOD, //ModId + PGPE_FAILURE, //Reasoncode + ERC_PGPE_START_SUSPEND_NOT_IDLE, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + 0, //Userdata1 + 0 //Userdata2 + ); - commitErrl(&err); + // Callout firmware + addCalloutToErrl(err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + commitErrl(&err); + } ext_rc = ERC_PGPE_START_SUSPEND_NOT_IDLE; } @@ -895,36 +910,39 @@ int pgpe_pmcr_set(void) // This check is a safety feature in case caller didn't check IPC 
is idle. if(!async_request_is_idle(&G_pmcr_set_req.request)) { - // an earlier PMCR update IPC call has not completed, trace and log an error - TRAC_ERR("pgpe_pmcr_set: PMCR update IPC task is not Idle"); - - /* - * @errortype - * @moduleid PGPE_PMCR_SET_MOD - * @reasoncode PGPE_FAILURE - * @userdata1 0 - * @userdata4 ERC_PGPE_SET_PMCR_NOT_IDLE - * @devdesc pgpe pmcr set not idle - */ - err = createErrl( - PGPE_PMCR_SET_MOD, //ModId - PGPE_FAILURE, //Reasoncode - ERC_PGPE_SET_PMCR_NOT_IDLE, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - 0, //Userdata1 - 0 //Userdata2 - ); + // an earlier PMCR update IPC call has not completed + // only trace and log an error if we are not to ignore + if(!ignore_pgpe_error()) + { + TRAC_ERR("pgpe_pmcr_set: PMCR update IPC task is not Idle"); - // Callout firmware - addCalloutToErrl(err, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); + /* + * @errortype + * @moduleid PGPE_PMCR_SET_MOD + * @reasoncode PGPE_FAILURE + * @userdata1 0 + * @userdata4 ERC_PGPE_SET_PMCR_NOT_IDLE + * @devdesc pgpe pmcr set not idle + */ + err = createErrl( + PGPE_PMCR_SET_MOD, //ModId + PGPE_FAILURE, //Reasoncode + ERC_PGPE_SET_PMCR_NOT_IDLE, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + 0, //Userdata1 + 0 //Userdata2 + ); - commitErrl(&err); + // Callout firmware + addCalloutToErrl(err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + commitErrl(&err); + } ext_rc = ERC_PGPE_SET_PMCR_NOT_IDLE; break; } @@ -1007,7 +1025,7 @@ int set_nominal_pstate(void) // Make sure the set PMCR task is idle. if(!async_request_is_idle(&G_pmcr_set_req.request)) { - TRAC_ERR("set_nominal_pstate: Set PMCR task not idle!"); + TRAC_ERR("set_nominal_pstate: Set PMCR task not idle! 
OCCFLG[0x%08X]", in32(OCB_OCCFLG)); l_rc = ERC_PGPE_SET_PMCR_NOT_IDLE; break; } @@ -1019,7 +1037,7 @@ int set_nominal_pstate(void) // This should not be called if Pstate protocol is in transition if(G_proc_pstate_status == PSTATES_IN_TRANSITION) { - TRAC_ERR("set_nominal_pstate: Pstate protocol in transtion!"); + TRAC_ERR("set_nominal_pstate: Pstate protocol in transtion! OCCFLG[0x%08X]", in32(OCB_OCCFLG)); l_rc = ERC_PGPE_START_SUSPEND_NOT_IDLE; break; } @@ -1037,7 +1055,7 @@ int set_nominal_pstate(void) if((ssx_timebase_get() - l_start) > l_timeout) { l_rc = ERC_PGPE_TASK_TIMEOUT; - TRAC_ERR("set_nominal_pstate: Timeout waiting for Pstates to be enabled"); + TRAC_ERR("set_nominal_pstate: Timeout waiting for Pstates to be enabled! OCCFLG[0x%08X]", in32(OCB_OCCFLG)); break; } ssx_sleep(SSX_MICROSECONDS(10)); diff --git a/src/occ_405/proc/proc_data_control.c b/src/occ_405/proc/proc_data_control.c index 891f80c..ebd907d 100755 --- a/src/occ_405/proc/proc_data_control.c +++ b/src/occ_405/proc/proc_data_control.c @@ -40,14 +40,19 @@ #include "rtls_service_codes.h" #include "proc_pstate.h" #include "occ_util.h" +#include "common.h" // For ignore_pgpe_error() -// The the GPE parameter fields for PGPE IPC calls. +// The GPE parameter fields for PGPE IPC calls. extern GPE_BUFFER(ipcmsg_clip_update_t G_clip_update_parms); extern GPE_BUFFER(ipcmsg_set_pmcr_t G_pmcr_set_parms); extern GpeRequest G_clip_update_req; extern GpeRequest G_pmcr_set_req; +// number of ticks to wait on clip/pmcr request to complete before checking to log an error +// this must give the PGPE at least 1ms, doubling that time to 2ms to be safe +#define SUPPRESS_PGPE_ERR_WAIT_TICKS 4 // 2ms + extern bool G_state_transition_occuring; // A state transition is currently going on? 
// a global flag used by task_core_data_control() to indicate @@ -68,9 +73,15 @@ bool G_allowPstates = FALSE; void task_core_data_control( task_t * i_task ) { errlHndl_t err = NULL; //Error handler - static bool L_trace_logged = false; // trace logging to avoid unnecessarily repeatig logs + static bool L_trace_logged = false; // trace logging to avoid unnecessarily repeating logs + static bool L_current_timeout_recorded = FALSE; Pstate l_pstate; static uint64_t L_last = 0xFFFFFFFFFFFFFFFF; + static uint64_t L_ignore_wait_count = 0; // number of consecutive ticks IPC task failed + bool l_check_failure = false; + int l_request_is_idle = 0; + uint8_t l_request_rc = 0; + enum occExtReasonCode l_ext_rc = OCC_NO_EXTENDED_RC; // Once a state transition process starts, task data control // stops updating the PMCR/CLIPS updates, this way, the state @@ -123,7 +134,7 @@ void task_core_data_control( task_t * i_task ) G_active_to_observation_ready = true; } } - } + } // if in state transition else { L_trace_logged = false; @@ -131,24 +142,23 @@ void task_core_data_control( task_t * i_task ) if (G_allowPstates) { // perform Pstate/clip control if previous IPC call completed successfully - // if not idle, ignore cycle - // if an error was returned, log an error, and request reset - if(G_sysConfigData.system_type.kvm) // OPAL system + if(G_sysConfigData.system_type.kvm) // OPAL system uses clip update request { + l_request_is_idle = async_request_is_idle(&G_clip_update_req.request); + l_request_rc = G_clip_update_parms.msg_cb.rc; + // confirm that the clip update IPC from last cycle // has successfully completed on PGPE (with no errors) - if( async_request_is_idle(&G_clip_update_req.request) && //clip_update/set_clip_ranges completed - (G_clip_update_parms.msg_cb.rc == PGPE_RC_SUCCESS) ) // with no errors + if( (l_request_is_idle) && //clip_update/set_clip_ranges completed + (l_request_rc == PGPE_RC_SUCCESS) ) // with no errors { //call PGPE IPC function to update the clips 
pgpe_clip_update(); } - else if(G_clip_update_parms.msg_cb.rc != PGPE_RC_SUCCESS) + else { - // an earlier clip update IPC call has not completed, trace and log an error - TRAC_ERR("task_core_data_control: clip update IPC task returned an error, %d", - G_clip_update_parms.msg_cb.rc); - + l_check_failure = true; + l_ext_rc = ERC_PGPE_CLIP_FAILURE; /* * @errortype * @moduleid RTLS_TASK_CORE_DATA_CONTROL_MOD @@ -158,23 +168,15 @@ void task_core_data_control( task_t * i_task ) * @userdata4 ERC_PGPE_CLIP_FAILURE * @devdesc pgpe clip update returned an error */ - err = createErrl( - RTLS_TASK_CORE_DATA_CONTROL_MOD, //ModId - PGPE_FAILURE, //Reasoncode - ERC_PGPE_CLIP_FAILURE, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - G_clip_update_parms.msg_cb.rc, //Userdata1 - async_request_is_idle(&G_clip_update_req.request) //Userdata2 - ); } } else { - // NON OPAL System, OCC owns PMCR: - if( async_request_is_idle(&G_pmcr_set_req.request) && // PMCR IPC from last TICK completed - (G_pmcr_set_parms.msg_cb.rc == PGPE_RC_SUCCESS) ) // with no errors + // NON OPAL System, OCC owns PMCR and uses PMCR set request + l_request_is_idle = async_request_is_idle(&G_pmcr_set_req.request); + l_request_rc = G_pmcr_set_parms.msg_cb.rc; + if( (l_request_is_idle) && // PMCR IPC from last TICK completed + (l_request_rc == PGPE_RC_SUCCESS) ) // with no errors { //The previous Non-OPAL PGPE request succeeded uint64_t pstateList = 0; @@ -190,19 +192,17 @@ void task_core_data_control( task_t * i_task ) if (L_last != pstateList) { L_last = pstateList; - TRAC_IMP("task_core_data_control: calling pmcr_set() w/pstates: 0x%08X%04X", - WORD_HIGH(pstateList), WORD_LOW(pstateList)>>16); + TRAC_INFO("task_core_data_control: calling pmcr_set() w/pstates: 0x%08X%04X", + WORD_HIGH(pstateList), WORD_LOW(pstateList)>>16); //call PGPE IPC function to update Pstates pgpe_pmcr_set(); } } - else if(G_pmcr_set_parms.msg_cb.rc != PGPE_RC_SUCCESS) + else { 
- // an earlier clip update IPC call has not completed, trace and log an error - TRAC_ERR("task_core_data_control: pstate update IPC task returned an error, %d", - G_pmcr_set_parms.msg_cb.rc); - + l_check_failure = true; + l_ext_rc = ERC_PGPE_SET_PMCR_FAILURE; /* * @errortype * @moduleid RTLS_TASK_CORE_DATA_CONTROL_MOD @@ -212,27 +212,69 @@ void task_core_data_control( task_t * i_task ) * @userdata4 ERC_PGPE_SET_PMCR_FAILURE * @devdesc pgpe PMCR set returned an error */ + } + } + } // if pstates allowed + + // Common error handling for all systems + if(l_check_failure) + { + // an earlier clip update IPC call has not completed + L_ignore_wait_count++; + + // Only log the error if we are not to ignore PGPE errors and have + // waited enough time for the PGPE to give this indication + if(L_ignore_wait_count >= SUPPRESS_PGPE_ERR_WAIT_TICKS) + { + if(!ignore_pgpe_error()) + { + TRAC_ERR("task_core_data_control: pstate update IPC task did not complete successfully, idle?[%d] rc[%08X]", + l_request_is_idle, l_request_rc); + err = createErrl( RTLS_TASK_CORE_DATA_CONTROL_MOD, //ModId PGPE_FAILURE, //Reasoncode - ERC_PGPE_SET_PMCR_FAILURE, //Extended reason code + l_ext_rc, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size - G_pmcr_set_parms.msg_cb.rc, //Userdata1 - async_request_is_idle(&G_pmcr_set_req.request) //Userdata2 + l_request_rc, //Userdata1 + l_request_is_idle //Userdata2 ); + + //Add firmware callout + addCalloutToErrl(err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + + //Add processor callout + addCalloutToErrl(err, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.proc_huid, + ERRL_CALLOUT_PRIORITY_MED); + + // commit error log + REQUEST_RESET(err); + } + else + { + // Wait forever for PGPE to respond + // Put a mark on the wall so we know we hit this state + if(!L_current_timeout_recorded) + { + INCREMENT_ERR_HISTORY(ERRH_PSTATE_CHANGE_IGNORED); + 
L_current_timeout_recorded = TRUE; + } } } } - // else pstates not allowed yet - - if(err) + else { - // commit error log - REQUEST_RESET(err); + // no error, clear the error wait count + L_ignore_wait_count = 0; + L_current_timeout_recorded = FALSE; } - } - + } // else not in a state transition return; } diff --git a/src/occ_405/state.c b/src/occ_405/state.c index 4c0e169..ea8219a 100755 --- a/src/occ_405/state.c +++ b/src/occ_405/state.c @@ -224,7 +224,8 @@ errlHndl_t SMGR_standby_to_characterization() if(rc) { - TRAC_ERR("SMGR: failed to set pstate clips."); + TRAC_ERR("SMGR: failed to set pstate clips. rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // successfully set clips; enable pstates, then start transition @@ -235,7 +236,8 @@ errlHndl_t SMGR_standby_to_characterization() if(rc) { - TRAC_ERR("SMGR: failed to start the pstate protocol for char owner on PGPE."); + TRAC_ERR("SMGR: failed to start the pstate protocol for char owner on PGPE. rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // Clips set and pstates started successfully, start transition @@ -318,7 +320,8 @@ errlHndl_t SMGR_all_to_standby() // check for timeout while waiting for pgpe_start_suspend() IPC completion if(wait_time >= WAIT_PGPE_TASK_TIMEOUT) { - TRAC_ERR("SMGR_all_to_standby: Timeout waiting for Pstates start/suspend IPC task"); + TRAC_ERR("SMGR_all_to_standby: Timeout waiting for Pstates start/suspend IPC task. OCCFLG[0x%08X]", + in32(OCB_OCCFLG)); } // Stop Pstates if enabled else if(G_proc_pstate_status == PSTATES_ENABLED) @@ -326,7 +329,8 @@ errlHndl_t SMGR_all_to_standby() rc = pgpe_start_suspend(PGPE_ACTION_PSTATE_STOP, G_proc_pmcr_owner); if(rc) { - TRAC_ERR("SMGR_all_to_standby: Failed to stop the pstate protocol on PGPE. rc[%08X]", rc); + TRAC_ERR("SMGR_all_to_standby: Failed to stop the pstate protocol on PGPE. 
rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); } } @@ -372,7 +376,8 @@ errlHndl_t SMGR_characterization_to_observation() rc = pgpe_set_clip_blocking(l_pstate); if(rc) { - TRAC_ERR("SMGR_char_to_obs: failed to set pstate clip to legacy turbo rc[%08X]", rc); + TRAC_ERR("SMGR_char_to_obs: failed to set pstate clip to legacy turbo rc[%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // clips set to legacy turbo; stop pstate protocol @@ -380,7 +385,8 @@ errlHndl_t SMGR_characterization_to_observation() rc = pgpe_start_suspend(PGPE_ACTION_PSTATE_STOP, G_proc_pmcr_owner); if(rc) { - TRAC_ERR("SMGR_char_to_obs: Failed to stop pstate protocol rc[%08X]", rc); + TRAC_ERR("SMGR_char_to_obs: Failed to stop pstate protocol rc[%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // Clips tightened successfully, and pstates disabled: perform transition @@ -452,13 +458,15 @@ errlHndl_t SMGR_observation_to_characterization() "since OCC is not active ready."); break; } + // set pstate clips l_pstate = proc_freq2pstate(G_proc_fmax_mhz); rc = pgpe_set_clip_blocking(l_pstate); if(rc) { - TRAC_ERR("SMGR_obs_to_char: failed to set pstate clips rc[%08X]", rc); + TRAC_ERR("SMGR_obs_to_char: failed to set pstate clips rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // successfully set clips; enable pstates, then start transition @@ -468,7 +476,8 @@ errlHndl_t SMGR_observation_to_characterization() if(rc) { - TRAC_ERR("SMGR_obs_to_char: failed to start pstate protocol rc[%08X]", rc); + TRAC_ERR("SMGR_obs_to_char: failed to start pstate protocol rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // Clips set successfully and pstates enabled; complete transition @@ -527,6 +536,7 @@ errlHndl_t SMGR_observation_to_active() static bool L_error_logged = FALSE; // To prevent trace and error log happened over and over int l_extRc = OCC_NO_EXTENDED_RC; int l_rc = 0; + uint32_t l_user_data = 0; Pstate l_pstate; // clear mnfg quad pstate 
request to default OCC to control all quads @@ -559,7 +569,20 @@ errlHndl_t SMGR_observation_to_active() if(l_rc) { - TRAC_ERR("SMGR_obs_to_active: Set Pstate clips failed rc[%08X]", l_rc); + /* @ + * @errortype + * @moduleid MAIN_STATE_TRANSITION_MID + * @reasoncode INTERNAL_FAILURE + * @userdata1 OCB_OCCFLG + * @userdata2 l_rc + * @userdata4 ERC_PGPE_CLIP_FAILURE + * @devdesc Failure setting Pstate clips on observation to active transition + */ + + l_extRc = ERC_PGPE_CLIP_FAILURE; + l_user_data = in32(OCB_OCCFLG); + TRAC_ERR("SMGR_obs_to_active: Set Pstate clips failed rc[0x%08X] OCCFLG[0x%08X]", + l_rc, l_user_data); break; } else // Clips set with no errors, enable Pstates on PGPE @@ -609,7 +632,20 @@ errlHndl_t SMGR_observation_to_active() if(l_rc) { - TRAC_ERR("SMGR_obs_to_active: Failed to start pstate protocol rc[%08X]", l_rc); + /* @ + * @errortype + * @moduleid MAIN_STATE_TRANSITION_MID + * @reasoncode INTERNAL_FAILURE + * @userdata1 OCB_OCCFLG + * @userdata2 l_rc + * @userdata4 ERC_PGPE_START_SUSPEND_FAILURE + * @devdesc Failure enabling pstates on observation to active transition + */ + + l_extRc = ERC_PGPE_START_SUSPEND_FAILURE; + l_user_data = in32(OCB_OCCFLG); + TRAC_ERR("SMGR_obs_to_active: Failed to start pstate protocol rc[0x%08X] OCCFLG[0x%08X]", + l_rc, l_user_data); break; } } @@ -621,15 +657,24 @@ errlHndl_t SMGR_observation_to_active() { if ((ssx_timebase_get() - start) > timeout) { + /* @ + * @errortype + * @moduleid MAIN_STATE_TRANSITION_MID + * @reasoncode INTERNAL_FAILURE + * @userdata1 OCB_OCCFLG + * @userdata2 l_rc + * @userdata4 ERC_GENERIC_TIMEOUT + * @devdesc Timeout waiting for pstate enable on observation to active transition + */ + l_rc = 1; + l_extRc = ERC_GENERIC_TIMEOUT; + l_user_data = in32(OCB_OCCFLG); if(FALSE == L_error_logged) { TRAC_ERR("SMGR_obs_to_active: Timeout waiting for Pstates to be enabled, " - "chips_present[%02x], Cores Present [%08x]", - G_sysConfigData.is_occ_present, - (uint32_t) ((in64(OCB_CCSR)) >> 32)); + 
"OCCFLG[0x%08X]", l_user_data); } - l_extRc = ERC_GENERIC_TIMEOUT; break; } ssx_sleep(SSX_MICROSECONDS(10)); @@ -660,35 +705,6 @@ errlHndl_t SMGR_observation_to_active() { TRAC_ERR("SMGR: Observation to Active Transition Failed, because pstates are not enabled"); } - - if(l_rc && FALSE == L_error_logged) - { - L_error_logged = TRUE; - /* @ - * @errortype - * @moduleid MAIN_STATE_TRANSITION_MID - * @reasoncode INTERNAL_FAILURE - * @userdata1 SMGR_MASK_ACTIVE_READY - * @userdata2 valid states - * @userdata4 ERC_GENERIC_TIMEOUT - * @devdesc Failed changing from observation to active - */ - l_errlHndl = createErrl(MAIN_STATE_TRANSITION_MID, //modId - INTERNAL_FAILURE, //reasoncode - l_extRc, //Extended reason code - ERRL_SEV_UNRECOVERABLE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - SMGR_MASK_ACTIVE_READY, //userdata1 - SMGR_validate_get_valid_states());//userdata2 - - // Callout firmware - addCalloutToErrl(l_errlHndl, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); - } - } else // We have no cores configured { @@ -705,6 +721,19 @@ errlHndl_t SMGR_observation_to_active() } // Active Ready else { + /* @ + * @errortype + * @moduleid MAIN_STATE_TRANSITION_MID + * @reasoncode INTERNAL_FAILURE + * @userdata1 SMGR_MASK_ACTIVE_READY + * @userdata2 l_rc + * @userdata4 ERC_STATE_FROM_OBS_TO_ACT_FAILURE + * @devdesc Failed changing from observation to active due to OCC not ready + */ + + l_rc = 2; + l_extRc = ERC_STATE_FROM_OBS_TO_ACT_FAILURE; + l_user_data = SMGR_MASK_ACTIVE_READY; TRAC_ERR("SMGR: Observation to Active Transition Failed, " "OCC is not Active Ready cnfgdata=0x%08x, reqd=0x%08x", DATA_get_present_cnfgdata(), @@ -712,6 +741,24 @@ errlHndl_t SMGR_observation_to_active() } } while (0); + if(l_rc && (FALSE == L_error_logged)) + { + L_error_logged = TRUE; + l_errlHndl = createErrl(MAIN_STATE_TRANSITION_MID, //modId + INTERNAL_FAILURE, //reasoncode + l_extRc, //Extended reason code + 
ERRL_SEV_UNRECOVERABLE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + l_user_data, //userdata1 + l_rc); //userdata2 + + // Callout firmware + addCalloutToErrl(l_errlHndl, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + } return l_errlHndl; } @@ -746,7 +793,8 @@ errlHndl_t SMGR_characterization_to_active() } if(rc) { - TRAC_ERR("SMGR_char_to_active: Failed to change PMCR ownership rc[%08X]", rc); + TRAC_ERR("SMGR_char_to_active: Failed to change PMCR ownership rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } @@ -762,7 +810,8 @@ errlHndl_t SMGR_characterization_to_active() if ((ssx_timebase_get() - start) > timeout) { rc = 1; - TRAC_ERR("SMGR_char_to_active: Timeout waiting for PMCR ownership change"); + TRAC_ERR("SMGR_char_to_active: Timeout waiting for PMCR ownership change. OCCFLG[0x%08X]", + in32(OCB_OCCFLG)); break; } ssx_sleep(SSX_MICROSECONDS(10)); @@ -838,6 +887,7 @@ errlHndl_t SMGR_characterization_to_active() errlHndl_t SMGR_active_to_observation() { int rc = 0; + enum occExtReasonCode ext_rc = OCC_NO_EXTENDED_RC; errlHndl_t l_errlHndl = NULL; uint8_t wait_time = 0; @@ -864,7 +914,8 @@ errlHndl_t SMGR_active_to_observation() // check for timeout while waiting for pgpe_start_suspend() IPC completion if(wait_time > WAIT_PGPE_TASK_TIMEOUT) { - TRAC_ERR("SMGR_act_to_obs: Timeout waiting for G_active_to_observation_ready flag."); + TRAC_ERR("SMGR_act_to_obs: Timeout waiting for G_active_to_observation_ready flag. 
OCCFLG[0x%08X]", + in32(OCB_OCCFLG)); /* @ * @errortype @@ -874,21 +925,7 @@ errlHndl_t SMGR_active_to_observation() * @userdata4 ERC_PGPE_ACTIVE_TO_OBSERVATION_TIMEOUT * @devdesc timeout waiting for pstates start/suspend task */ - l_errlHndl = createErrl(MAIN_STATE_TRANSITION_MID, //modId - PGPE_FAILURE, //reasoncode - ERC_PGPE_ACTIVE_TO_OBSERVATION_TIMEOUT, //Extended reason code - ERRL_SEV_UNRECOVERABLE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - wait_time, //userdata1 - 0); //userdata2 - - // Callout firmware - addCalloutToErrl(l_errlHndl, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); - + ext_rc = ERC_PGPE_ACTIVE_TO_OBSERVATION_TIMEOUT; rc = PGPE_FAILURE; break; } @@ -912,7 +949,8 @@ errlHndl_t SMGR_active_to_observation() // check for timeout while waiting for Pstate clips IPC completion if(wait_time > WAIT_PGPE_TASK_TIMEOUT) { - TRAC_ERR("SMGR_act_to_obs: Timeout waiting for clip update IPC task"); + TRAC_ERR("SMGR_act_to_obs: Timeout waiting for clip update IPC task OCCFLG[0x%08X]", + in32(OCB_OCCFLG)); /* @ * @errortype @@ -922,21 +960,7 @@ errlHndl_t SMGR_active_to_observation() * @userdata4 ERC_PGPE_TASK_TIMEOUT * @devdesc timeout waiting for pstates start/suspend task */ - l_errlHndl = createErrl(MAIN_STATE_TRANSITION_MID, //modId - PGPE_FAILURE, //reasoncode - ERC_PGPE_TASK_TIMEOUT, //Extended reason code - ERRL_SEV_UNRECOVERABLE, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - wait_time, //userdata1 - 0); //userdata2 - - // Callout firmware - addCalloutToErrl(l_errlHndl, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_HIGH); - + ext_rc = ERC_PGPE_TASK_TIMEOUT; rc = PGPE_FAILURE; break; } @@ -944,7 +968,10 @@ errlHndl_t SMGR_active_to_observation() rc = pgpe_start_suspend(PGPE_ACTION_PSTATE_STOP, G_proc_pmcr_owner); if(rc) { - TRAC_ERR("SMGR_act_to_obs: failed to stop the pstate protocol on PGPE."); + 
TRAC_ERR("SMGR_act_to_obs: failed to stop the pstate protocol on PGPE rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); + ext_rc = ERC_PGPE_TASK_TIMEOUT; + rc = PGPE_FAILURE; break; } else // Pstates Disabled and clips set successfully, perform state transition @@ -968,6 +995,20 @@ errlHndl_t SMGR_active_to_observation() if(rc) { TRAC_ERR("SMGR: Failed with rc = %d to switch to Observation state", rc); + l_errlHndl = createErrl(MAIN_STATE_TRANSITION_MID, //modId + rc, //reasoncode + ext_rc, //Extended reason code + ERRL_SEV_UNRECOVERABLE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + wait_time, //userdata1 + 0); //userdata2 + + // Callout firmware + addCalloutToErrl(l_errlHndl, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); } else { @@ -1003,7 +1044,8 @@ errlHndl_t SMGR_active_to_characterization() if(rc) { - TRAC_ERR("SMGR_act_to_char: failed to set pstate clips."); + TRAC_ERR("SMGR_act_to_char: failed to set pstate clips. rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // clips set successfully, keep pstates enabled, but change ownership @@ -1012,7 +1054,8 @@ errlHndl_t SMGR_active_to_characterization() if(rc) { - TRAC_ERR("SMGR: failed to change PMCR ownership."); + TRAC_ERR("SMGR: failed to change PMCR ownership. rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } else // Request successfully scheduled on PGPE now verify it completed @@ -1025,7 +1068,8 @@ errlHndl_t SMGR_active_to_characterization() if ((ssx_timebase_get() - start) > timeout) { rc = 1; - TRAC_ERR("SMGR_active_to_char: Timeout waiting for PMCR ownership change"); + TRAC_ERR("SMGR_active_to_char: Timeout waiting for PMCR ownership change. 
rc[0x%08X] OCCFLG[0x%08X]", + rc, in32(OCB_OCCFLG)); break; } ssx_sleep(SSX_MICROSECONDS(10)); diff --git a/src/occ_405/wof/wof.c b/src/occ_405/wof/wof.c index f6d1050..9d58239 100644 --- a/src/occ_405/wof/wof.c +++ b/src/occ_405/wof/wof.c @@ -36,6 +36,7 @@ #include #include #include +#include "common.h" // For ignore_pgpe_error() //****************************************************************************** // External Globals //****************************************************************************** @@ -132,6 +133,11 @@ void call_wof_main( void ) // Variable to ensure we do not keep trying to send the wof control static bool L_wof_control_last_chance = false; + // Variable to keep track of logging timeouts being ignored + // Since WOF runs every 4ms we have already waited more than the required 1ms for PGPE + // to set the bit to give ignore indication so no additional timer needed before checking + static bool L_current_timeout_recorded = false; + // Variable to keep track of PState enablement to prevent setting/clearing // wof_disabled bit every iteration. static uint8_t L_pstate_protocol_off = 0; @@ -222,19 +228,29 @@ void call_wof_main( void ) if( (!async_request_is_idle(&G_wof_vfrt_req.request)) || (g_wof->vfrt_state != STANDBY) ) { - if( L_vfrt_last_chance == 0 ) + if( (L_vfrt_last_chance == 0) && (!ignore_pgpe_error()) ) { INTR_TRAC_ERR("WOF Disabled!" " Init VFRT request timeout"); set_clear_wof_disabled( SET, WOF_RC_VFRT_REQ_TIMEOUT); } - else + else if(L_vfrt_last_chance != 0) { INTR_TRAC_INFO("initial VFRT NOT idle." 
" %d more chance(s)", L_vfrt_last_chance ); L_vfrt_last_chance--; } + else + { + // Wait forever for PGPE to respond + // Put a mark on the wall so we know we hit this state + if(!L_current_timeout_recorded) + { + INCREMENT_ERR_HISTORY(ERRH_VFRT_TIMEOUT_IGNORED); + L_current_timeout_recorded = TRUE; + } + } } break; @@ -247,23 +263,36 @@ void call_wof_main( void ) enable_success = enable_wof(); if( !enable_success ) { - if( L_wof_control_last_chance ) + // Treat as an error only if not currently ignoring PGPE failures + if( L_wof_control_last_chance && (!ignore_pgpe_error()) ) { INTR_TRAC_ERR("WOF Disabled! Control req timeout(1)"); set_clear_wof_disabled(SET, WOF_RC_CONTROL_REQ_TIMEOUT); } - else + else if(!L_wof_control_last_chance) { INTR_TRAC_ERR("One more chance for WOF " "control request(1)"); L_wof_control_last_chance = true; } + else + { + // Wait forever for PGPE to respond + // Put a mark on the wall so we know we hit this state + if(!L_current_timeout_recorded) + { + INCREMENT_ERR_HISTORY(ERRH_WOF_CONTROL_TIMEOUT_IGNORED); + L_current_timeout_recorded = TRUE; + } + } } else { // Reset the last chance variable // Init state updated in enable_wof L_wof_control_last_chance = false; + + L_current_timeout_recorded = FALSE; } break; @@ -271,17 +300,32 @@ void call_wof_main( void ) // check if request is still processing. if( !async_request_is_idle(&G_wof_control_req.request) ) { - if( L_wof_control_last_chance ) + // Treat as an error only if not currently ignoring PGPE failures + if( L_wof_control_last_chance && (!ignore_pgpe_error()) ) { INTR_TRAC_ERR("WOF Disabled! 
Control req timeout(2)"); set_clear_wof_disabled(SET, WOF_RC_CONTROL_REQ_TIMEOUT); } - else + else if(!L_wof_control_last_chance) { INTR_TRAC_ERR("One more chance for WOF " "control request(2)"); L_wof_control_last_chance = true; } + else + { + // Wait forever for PGPE to respond + // Put a mark on the wall so we know we hit this state + if(!L_current_timeout_recorded) + { + INCREMENT_ERR_HISTORY(ERRH_WOF_CONTROL_TIMEOUT_IGNORED); + L_current_timeout_recorded = TRUE; + } + } + } + else + { + L_current_timeout_recorded = FALSE; } // Init state updated in wof_control_callback break; @@ -303,8 +347,22 @@ void call_wof_main( void ) { if( L_vfrt_last_chance == 0 ) { - INTR_TRAC_ERR("WOF Disabled! VFRT req timeout"); - set_clear_wof_disabled(SET,WOF_RC_VFRT_REQ_TIMEOUT); + // Treat as an error only if not currently ignoring PGPE failures + if(!ignore_pgpe_error()) + { + INTR_TRAC_ERR("WOF Disabled! VFRT req timeout"); + set_clear_wof_disabled(SET,WOF_RC_VFRT_REQ_TIMEOUT); + } + else + { + // Wait forever for PGPE to respond + // Put a mark on the wall so we know we hit this state + if(!L_current_timeout_recorded) + { + INCREMENT_ERR_HISTORY(ERRH_VFRT_TIMEOUT_IGNORED); + L_current_timeout_recorded = TRUE; + } + } } else { @@ -315,6 +373,8 @@ void call_wof_main( void ) } else { + L_current_timeout_recorded = FALSE; + // Request is idle. Run wof algorithm wof_main(); -- cgit v1.2.1