From 43b9907f1a84be57f678de6f496c3f7d05c0c40b Mon Sep 17 00:00:00 2001 From: Chris Cain Date: Thu, 27 Apr 2017 19:12:54 -0500 Subject: Add PBAX queue error handling and throttle of APSS failure traces Change-Id: I3ee189b1088ff48ab9743233c1a05072730699b9 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39790 Reviewed-by: Martha Broyles Reviewed-by: William A. Bryan Tested-by: FSP CI Jenkins Reviewed-by: Christopher J. Cain --- src/occ_405/dcom/dcom.c | 74 +++++++++++++++++++++ src/occ_405/dcom/dcom.h | 3 + src/occ_405/dcom/dcomMasterRx.c | 16 ++--- src/occ_405/dcom/dcomSlaveRx.c | 117 +++++++++++++++++----------------- src/occ_405/dcom/dcomSlaveTx.c | 17 +++-- src/occ_405/dcom/dcom_service_codes.h | 2 + 6 files changed, 154 insertions(+), 75 deletions(-) (limited to 'src/occ_405/dcom') diff --git a/src/occ_405/dcom/dcom.c b/src/occ_405/dcom/dcom.c index 32f9976..40d02a1 100755 --- a/src/occ_405/dcom/dcom.c +++ b/src/occ_405/dcom/dcom.c @@ -596,5 +596,79 @@ void task_dcom_parse_occfwmsg(task_t *i_self) G_slave_event_flags = (G_slave_event_flags & (~(G_dcom_slv_inbox_rx.occ_fw_mailbox[3]))); } + +// Function Specification +// +// Name: dcom_pbax_error_handler +// +// Description: Handle an error from a pbax_read call +// +// End Function Specification +void dcom_pbax_error_handler(const uint8_t i_queue) +{ + pba_xshcsn_t l_pba_shcs; + pba_xcfg_t l_pbax_cfg; + + SsxAddress l_pba_shcs_addr = PBA_XSHCS0; + errlHndl_t l_err = NULL; + static bool L_pba_reset_logged[2] = {FALSE}; + + // Skip if waiting for a reset, no sense in trying to recover when going to be reset anyway + if((TRUE == isSafeStateRequested()) || (CURRENT_STATE() == OCC_STATE_SAFE)) + return; + + if(i_queue == 1) + l_pba_shcs_addr = PBA_XSHCS1; + + l_pba_shcs.words.high_order = in32(l_pba_shcs_addr); + + TRAC_ERR("dcom_pbax_error_handler: Start error handler for queue %d PBA_XSHCS[0x%08x]", + i_queue, l_pba_shcs.words.high_order); + + do + { + // reset queue and clear the error condition to allow future pbax reads + + // 1. Disable the pushQ and reset the read & write pointer by writing 0 to push_enable (bit 31) + l_pba_shcs.fields.push_enable = 0; + out32(l_pba_shcs_addr, l_pba_shcs.words.high_order); + + // 2. Clear the error status by setting rcv_reset (bit 3) in the PBAX CFG register + l_pbax_cfg.value = in64(PBA_XCFG); + l_pbax_cfg.fields.rcv_reset = 1; + out64(PBA_XCFG, l_pbax_cfg.value); + + // 3. Reenable the pushQ (set push_enable bit 31) + l_pba_shcs.fields.push_enable = 1; + out32(l_pba_shcs_addr, l_pba_shcs.words.high_order); + + TRAC_INFO("dcom_pbax_error_handler: Success resetting queue %d PBA_XSHCS[0x%08x] PBA_XCFG[0x%08x]", + i_queue, in32(l_pba_shcs_addr), in32(PBA_XCFG)); + + if(L_pba_reset_logged[i_queue] == FALSE) + { + // log error to indicate queue was reset + /* @ + * @errortype + * @moduleid DCOM_MID_PBAX_ERROR_HANDLER + * @reasoncode PBAX_QUEUE_RESET + * @userdata1 PBA queue + * @userdata4 OCC_NO_EXTENDED_RC + * @devdesc PBAX queue reset + */ + l_err = createErrl( DCOM_MID_PBAX_ERROR_HANDLER, //modId + PBAX_QUEUE_RESET, //reasoncode + OCC_NO_EXTENDED_RC, //Extended reason code + ERRL_SEV_INFORMATIONAL, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + i_queue, //userdata1 + 0); //userdata2 + commitErrl(&l_err); + L_pba_reset_logged[i_queue] = TRUE; + } + }while(0); +} + #endif //_DCOM_C diff --git a/src/occ_405/dcom/dcom.h b/src/occ_405/dcom/dcom.h index fe0c574..9eace60 100755 --- a/src/occ_405/dcom/dcom.h +++ b/src/occ_405/dcom/dcom.h @@ -511,4 +511,7 @@ void task_dcom_parse_occfwmsg(task_t *i_self); // Copy occ fw msg void dcom_build_occfw_msg( dcom_error_type_t i_which_msg ); +// Handle PBAX Error +void dcom_pbax_error_handler(const uint8_t i_queue); + #endif //_DCOM_H diff --git a/src/occ_405/dcom/dcomMasterRx.c b/src/occ_405/dcom/dcomMasterRx.c index 37a0a31..f9565f1 100644 --- a/src/occ_405/dcom/dcomMasterRx.c +++ b/src/occ_405/dcom/dcomMasterRx.c @@ -353,11 +353,10 @@ void task_dcom_rx_slv_outboxes( task_t *i_self) // End Function Specification uint32_t dcom_rx_slv_outbox_doorbell( void ) { - static bool l_error = FALSE; int l_pbarc = 0; uint32_t l_read = 0; - // Grab doorbells from slave + // Grab doorbells from slave, read out the whole queue to prevent overflow l_pbarc = pbax_read( &G_pbax_read_queue[1], &G_dcom_slv_outbox_doorbell_rx[0], @@ -365,19 +364,16 @@ uint32_t dcom_rx_slv_outbox_doorbell( void ) &l_read ); - if ( l_pbarc != 0 && l_error == FALSE ) + if (l_pbarc != 0) { // Failure occurred but only trace it once - TRAC_ERR("PBAX Read Failure in receiving unicast doorbell - RC[%08X]", l_pbarc); + TRAC_ERR("Master PBAX Read Failure in receiving unicast slave doorbells - RC[%08X]", l_pbarc); - l_error = TRUE; - } - else - { - l_error = FALSE; + // Handle pbax read failure on queue 1 + dcom_pbax_error_handler(1); } - // Return the number of doorbells read by dividing the bytes read by the number of occs slaves + // Return the number of doorbells read by dividing the bytes read by the doorbell size return (l_read/sizeof(dcom_slv_outbox_doorbell_t)); } diff --git a/src/occ_405/dcom/dcomSlaveRx.c b/src/occ_405/dcom/dcomSlaveRx.c index 748a2f8..a9e5ce8 100644 --- a/src/occ_405/dcom/dcomSlaveRx.c +++ b/src/occ_405/dcom/dcomSlaveRx.c @@ -334,7 +334,6 @@ void task_dcom_rx_slv_inbox( task_t *i_self) // End Function Specification uint32_t dcom_rx_slv_inbox_doorbell( void ) { - static bool l_trace_once = FALSE; int l_pbarc = 0; uint32_t l_read = 0; uint32_t l_bytes_so_far = 0; @@ -358,18 +357,18 @@ uint32_t dcom_rx_slv_inbox_doorbell( void ) if ( l_pbarc != 0 ) { G_dcomTime.slave.doorbellErrorFlags.hwError = 1; - if ( FALSE == l_trace_once ) - { - // Failure occurred but only trace it once - TRAC_ERR("PBAX Read Failure in receiving multicast doorbell - RC[%08X]", l_pbarc); - l_trace_once = TRUE; - } + // Failure occurred + TRAC_ERR("Slave PBAX Read Failure in receiving multicast doorbell from master - RC[%08X]", l_pbarc); + + // Handle pbax read failure on queue 0 + dcom_pbax_error_handler(0); break; } // Didn't read any bytes from pbax. We are either done, or we // simply don't have any data to read - if(0 == l_read){ + if(0 == l_read) + { if ((ssx_timebase_get() - l_start) > SSX_MICROSECONDS(3)) { if(l_bytes_so_far){ @@ -464,7 +463,7 @@ void task_dcom_wait_for_master( task_t *i_self) static bool L_queue_enabled = FALSE; static uint32_t L_pobid_retries_left = POBID_RETRIES; static uint16_t L_no_master_doorbell_cnt = 0; - static bool L_log_first_fail = FALSE; + static uint16_t L_trace_every_count = 1; DCOM_DBG("0. Wait for Master\n"); @@ -490,63 +489,63 @@ void task_dcom_wait_for_master( task_t *i_self) // counter L_no_master_doorbell_cnt++; - if (L_no_master_doorbell_cnt >= APSS_DATA_FAIL_PMAX_RAIL) + if (L_no_master_doorbell_cnt % L_trace_every_count == 0) { - // If we fail to receive the Master doorbell for this long, take - // action TRAC_INFO("task_dcom_wait_for_master: experiencing data collection problems! fail_count=%i", L_no_master_doorbell_cnt); + } + + if (L_no_master_doorbell_cnt == APSS_DATA_FAIL_PMAX_RAIL) + { + // Now only trace every 50th occurrence + L_trace_every_count = 50; // Inform AMEC that Pmax_rail needs to change G_apss_lower_pmax_rail = TRUE; - if (!L_log_first_fail) - { - // Create and commit this error only once. - L_log_first_fail = TRUE; - TRAC_ERR("Detected a problem with slave data collection: soft time-out[%d]. Lowering Pmax_rail!", - APSS_DATA_FAIL_PMAX_RAIL); + // Create and commit this error only once + TRAC_ERR("Detected a problem with slave data collection: soft time-out[%d]. Lowering Pmax_rail!", + APSS_DATA_FAIL_PMAX_RAIL); - /* @ - * @errortype - * @moduleid DCOM_MID_TASK_WAIT_FOR_MASTER - * @reasoncode APSS_SLV_SHORT_TIMEOUT - * @userdata1 Time-out value - * @userdata4 OCC_NO_EXTENDED_RC - * @devdesc Detected a problem with APSS data collection (short time-out) - */ - errlHndl_t l_errl = createErrl( - DCOM_MID_TASK_WAIT_FOR_MASTER, //modId - APSS_SLV_SHORT_TIMEOUT, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code - ERRL_SEV_INFORMATIONAL, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - APSS_DATA_FAIL_PMAX_RAIL, //userdata1 - 0 //userdata2 - ); + /* @ + * @errortype + * @moduleid DCOM_MID_TASK_WAIT_FOR_MASTER + * @reasoncode APSS_SLV_SHORT_TIMEOUT + * @userdata1 Time-out value + * @userdata4 OCC_NO_EXTENDED_RC + * @devdesc Detected a problem with APSS data collection (short time-out) + */ + errlHndl_t l_errl = createErrl( + DCOM_MID_TASK_WAIT_FOR_MASTER, //modId + APSS_SLV_SHORT_TIMEOUT, //reasoncode + OCC_NO_EXTENDED_RC, //Extended reason code + ERRL_SEV_INFORMATIONAL, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + APSS_DATA_FAIL_PMAX_RAIL, //userdata1 + 0 //userdata2 + ); - // Callout to firmware - addCalloutToErrl(l_errl, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_MED); - - // Callout to processor - addCalloutToErrl(l_errl, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.proc_huid, - ERRL_CALLOUT_PRIORITY_LOW); - - // Callout to APSS - addCalloutToErrl(l_errl, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.apss_huid, - ERRL_CALLOUT_PRIORITY_LOW); - - setErrlActions(l_errl, ERRL_ACTIONS_MANUFACTURING_ERROR); - commitErrl(&l_errl); - } + // Callout to firmware + addCalloutToErrl(l_errl, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_MED); + + // Callout to processor + addCalloutToErrl(l_errl, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.proc_huid, + ERRL_CALLOUT_PRIORITY_LOW); + + // Callout to APSS + addCalloutToErrl(l_errl, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.apss_huid, + ERRL_CALLOUT_PRIORITY_LOW); + + setErrlActions(l_errl, ERRL_ACTIONS_MANUFACTURING_ERROR); + commitErrl(&l_errl); } if (L_no_master_doorbell_cnt == APSS_DATA_FAIL_MAX) @@ -698,12 +697,14 @@ void task_dcom_wait_for_master( task_t *i_self) } else { - TRAC_INFO("[%d] Restablished contact via doorbell from Master",(int) G_pbax_id.chip_id); + TRAC_INFO("[%d] Restablished contact via doorbell from Master (after %d ticks)", + (int) G_pbax_id.chip_id, L_no_master_doorbell_cnt); // Inform AMEC that Pmax_rail doesn't need to be lowered and reset // the no_master_doorbell counter G_apss_lower_pmax_rail = FALSE; L_no_master_doorbell_cnt = 0; + L_trace_every_count = 1; } // Got a multicast doorbell diff --git a/src/occ_405/dcom/dcomSlaveTx.c b/src/occ_405/dcom/dcomSlaveTx.c index 9f6f254..a92bb69 100755 --- a/src/occ_405/dcom/dcomSlaveTx.c +++ b/src/occ_405/dcom/dcomSlaveTx.c @@ -49,6 +49,7 @@ dcom_slv_outbox_doorbell_t G_dcom_slv_outbox_doorbell_tx; // Make sure that the Slave Outbox TX Buffer is 1kB, otherwise cause // error on the compile. STATIC_ASSERT( (NUM_BYTES_IN_SLAVE_OUTBOX != (sizeof(G_dcom_slv_outbox_tx))) ); +uint32_t G_slave_pbax_rc = 0; // Function Specification // @@ -156,6 +157,13 @@ void task_dcom_tx_slv_outbox( task_t *i_self) do { + // If there was a pbax_send failure, trace it here since we can't do it in the critical + // interrupt context. + if(G_slave_pbax_rc) + { + TRAC_ERR("task_dcom_tx_slv_outbox: PBAX Send Failure in transimitting doorbell - RC[%08X]", G_slave_pbax_rc); + } + // Build/setup outbox uint32_t l_addr_in_mem = dcom_build_slv_outbox(); uint32_t l_ssxrc = 0; @@ -324,7 +332,6 @@ void task_dcom_tx_slv_outbox( task_t *i_self) // End Function Specification void dcom_tx_slv_outbox_doorbell( void ) { - static bool l_error = FALSE; int l_pbarc = 0; uint64_t l_tmp =0; @@ -337,12 +344,8 @@ void dcom_tx_slv_outbox_doorbell( void ) l_tmp, SSX_MICROSECONDS(15)); - if ( l_pbarc != 0 && l_error == FALSE ) - { - // Failure occurred - // This is running in a critical interrupt context. Tracing not allowed! - l_error = TRUE; - } + // Set this global so we know to trace this in the non-critical interrupt context + G_slave_pbax_rc = l_pbarc; } #endif //_DCOMSLAVETOMASTER_C diff --git a/src/occ_405/dcom/dcom_service_codes.h b/src/occ_405/dcom/dcom_service_codes.h index 4c210de..2a28b04 100755 --- a/src/occ_405/dcom/dcom_service_codes.h +++ b/src/occ_405/dcom/dcom_service_codes.h @@ -40,6 +40,8 @@ enum dcomModuleId DCOM_MID_TASK_WAIT_FOR_MASTER = DCOM_COMP_ID | 0x07, DCOM_MID_ERROR_CHECK = DCOM_COMP_ID | 0x08, DCOM_MID_WAIT_FOR_MASTER = DCOM_COMP_ID | 0x09, + DCOM_MID_PBAX_ERROR_HANDLER = DCOM_COMP_ID | 0x0A, + DCOM_MID_BUILD_SLV_INBOX = DCOM_COMP_ID | 0x0B, }; #endif /* #ifndef _DCOM_SERVICE_CODES_H_ */ -- cgit v1.2.1