diff options
author | Chris Cain <cjcain@us.ibm.com> | 2017-04-27 19:12:54 -0500 |
---|---|---|
committer | Christopher J. Cain <cjcain@us.ibm.com> | 2017-04-28 15:49:55 -0400 |
commit | 43b9907f1a84be57f678de6f496c3f7d05c0c40b (patch) | |
tree | 61156b671ec757981331bd7f96cdd329b8336d4f /src/occ_405 | |
parent | eb9cdf57261295ee7c4c745acce6e15653bfa857 (diff) | |
download | talos-occ-43b9907f1a84be57f678de6f496c3f7d05c0c40b.tar.gz talos-occ-43b9907f1a84be57f678de6f496c3f7d05c0c40b.zip |
Add PBAX queue error handling and throttle of APSS failure traces
Change-Id: I3ee189b1088ff48ab9743233c1a05072730699b9
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39790
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Diffstat (limited to 'src/occ_405')
-rwxr-xr-x | src/occ_405/dcom/dcom.c | 74 | ||||
-rwxr-xr-x | src/occ_405/dcom/dcom.h | 3 | ||||
-rw-r--r-- | src/occ_405/dcom/dcomMasterRx.c | 16 | ||||
-rw-r--r-- | src/occ_405/dcom/dcomSlaveRx.c | 117 | ||||
-rwxr-xr-x | src/occ_405/dcom/dcomSlaveTx.c | 17 | ||||
-rwxr-xr-x | src/occ_405/dcom/dcom_service_codes.h | 2 | ||||
-rwxr-xr-x | src/occ_405/main.c | 13 | ||||
-rw-r--r-- | src/occ_405/occ_service_codes.h | 4 | ||||
-rwxr-xr-x | src/occ_405/occbuildname.c | 2 | ||||
-rwxr-xr-x | src/occ_405/pss/apss.c | 33 |
10 files changed, 168 insertions, 113 deletions
diff --git a/src/occ_405/dcom/dcom.c b/src/occ_405/dcom/dcom.c index 32f9976..40d02a1 100755 --- a/src/occ_405/dcom/dcom.c +++ b/src/occ_405/dcom/dcom.c @@ -596,5 +596,79 @@ void task_dcom_parse_occfwmsg(task_t *i_self) G_slave_event_flags = (G_slave_event_flags & (~(G_dcom_slv_inbox_rx.occ_fw_mailbox[3]))); } + +// Function Specification +// +// Name: dcom_pbax_error_handler +// +// Description: Handle an error from a pbax_read call +// +// End Function Specification +void dcom_pbax_error_handler(const uint8_t i_queue) +{ + pba_xshcsn_t l_pba_shcs; + pba_xcfg_t l_pbax_cfg; + + SsxAddress l_pba_shcs_addr = PBA_XSHCS0; + errlHndl_t l_err = NULL; + static bool L_pba_reset_logged[2] = {FALSE}; + + // Skip if waiting for a reset, no sense in trying to recover when going to be reset anyway + if((TRUE == isSafeStateRequested()) || (CURRENT_STATE() == OCC_STATE_SAFE)) + return; + + if(i_queue == 1) + l_pba_shcs_addr = PBA_XSHCS1; + + l_pba_shcs.words.high_order = in32(l_pba_shcs_addr); + + TRAC_ERR("dcom_pbax_error_handler: Start error handler for queue %d PBA_XSHCS[0x%08x]", + i_queue, l_pba_shcs.words.high_order); + + do + { + // reset queue and clear the error condition to allow future pbax reads + + // 1. Disable the pushQ and reset the read & write pointer by writing 0 to push_enable (bit 31) + l_pba_shcs.fields.push_enable = 0; + out32(l_pba_shcs_addr, l_pba_shcs.words.high_order); + + // 2. Clear the error status by setting rcv_reset (bit 3) in the PBAX CFG register + l_pbax_cfg.value = in64(PBA_XCFG); + l_pbax_cfg.fields.rcv_reset = 1; + out64(PBA_XCFG, l_pbax_cfg.value); + + // 3. Reenable the pushQ (set push_enable bit 31) + l_pba_shcs.fields.push_enable = 1; + out32(l_pba_shcs_addr, l_pba_shcs.words.high_order); + + TRAC_INFO("dcom_pbax_error_handler: Success resetting queue %d PBA_XSHCS[0x%08x] PBA_XCFG[0x%08x]", + i_queue, in32(l_pba_shcs_addr), in32(PBA_XCFG)); + + if(L_pba_reset_logged[i_queue] == FALSE) + { + // log error to indicate queue was reset + /* @ + * @errortype + * @moduleid DCOM_MID_PBAX_ERROR_HANDLER + * @reasoncode PBAX_QUEUE_RESET + * @userdata1 PBA queue + * @userdata4 OCC_NO_EXTENDED_RC + * @devdesc PBAX queue reset + */ + l_err = createErrl( DCOM_MID_PBAX_ERROR_HANDLER, //modId + PBAX_QUEUE_RESET, //reasoncode + OCC_NO_EXTENDED_RC, //Extended reason code + ERRL_SEV_INFORMATIONAL, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + i_queue, //userdata1 + 0); //userdata2 + commitErrl(&l_err); + L_pba_reset_logged[i_queue] = TRUE; + } + }while(0); +} + #endif //_DCOM_C diff --git a/src/occ_405/dcom/dcom.h b/src/occ_405/dcom/dcom.h index fe0c574..9eace60 100755 --- a/src/occ_405/dcom/dcom.h +++ b/src/occ_405/dcom/dcom.h @@ -511,4 +511,7 @@ void task_dcom_parse_occfwmsg(task_t *i_self); // Copy occ fw msg void dcom_build_occfw_msg( dcom_error_type_t i_which_msg ); +// Handle PBAX Error +void dcom_pbax_error_handler(const uint8_t i_queue); + #endif //_DCOM_H diff --git a/src/occ_405/dcom/dcomMasterRx.c b/src/occ_405/dcom/dcomMasterRx.c index 37a0a31..f9565f1 100644 --- a/src/occ_405/dcom/dcomMasterRx.c +++ b/src/occ_405/dcom/dcomMasterRx.c @@ -353,11 +353,10 @@ void task_dcom_rx_slv_outboxes( task_t *i_self) // End Function Specification uint32_t dcom_rx_slv_outbox_doorbell( void ) { - static bool l_error = FALSE; int l_pbarc = 0; uint32_t l_read = 0; - // Grab doorbells from slave + // Grab doorbells from slave, read out the whole queue to prevent overflow l_pbarc = pbax_read( &G_pbax_read_queue[1], &G_dcom_slv_outbox_doorbell_rx[0], @@ -365,19 +364,16 @@ uint32_t dcom_rx_slv_outbox_doorbell( void ) &l_read ); - if ( l_pbarc != 0 && l_error == FALSE ) + if (l_pbarc != 0) { // Failure occurred but only trace it once - TRAC_ERR("PBAX Read Failure in receiving unicast doorbell - RC[%08X]", l_pbarc); + TRAC_ERR("Master PBAX Read Failure in receiving unicast slave doorbells - RC[%08X]", l_pbarc); - l_error = TRUE; - } - else - { - l_error = FALSE; + // Handle pbax read failure on queue 1 + dcom_pbax_error_handler(1); } - // Return the number of doorbells read by dividing the bytes read by the number of occs slaves + // Return the number of doorbells read by dividing the bytes read by the doorbell size return (l_read/sizeof(dcom_slv_outbox_doorbell_t)); } diff --git a/src/occ_405/dcom/dcomSlaveRx.c b/src/occ_405/dcom/dcomSlaveRx.c index 748a2f8..a9e5ce8 100644 --- a/src/occ_405/dcom/dcomSlaveRx.c +++ b/src/occ_405/dcom/dcomSlaveRx.c @@ -334,7 +334,6 @@ void task_dcom_rx_slv_inbox( task_t *i_self) // End Function Specification uint32_t dcom_rx_slv_inbox_doorbell( void ) { - static bool l_trace_once = FALSE; int l_pbarc = 0; uint32_t l_read = 0; uint32_t l_bytes_so_far = 0; @@ -358,18 +357,18 @@ uint32_t dcom_rx_slv_inbox_doorbell( void ) if ( l_pbarc != 0 ) { G_dcomTime.slave.doorbellErrorFlags.hwError = 1; - if ( FALSE == l_trace_once ) - { - // Failure occurred but only trace it once - TRAC_ERR("PBAX Read Failure in receiving multicast doorbell - RC[%08X]", l_pbarc); - l_trace_once = TRUE; - } + // Failure occurred + TRAC_ERR("Slave PBAX Read Failure in receiving multicast doorbell from master - RC[%08X]", l_pbarc); + + // Handle pbax read failure on queue 0 + dcom_pbax_error_handler(0); break; } // Didn't read any bytes from pbax. We are either done, or we // simply don't have any data to read - if(0 == l_read){ + if(0 == l_read) + { if ((ssx_timebase_get() - l_start) > SSX_MICROSECONDS(3)) { if(l_bytes_so_far){ @@ -464,7 +463,7 @@ void task_dcom_wait_for_master( task_t *i_self) static bool L_queue_enabled = FALSE; static uint32_t L_pobid_retries_left = POBID_RETRIES; static uint16_t L_no_master_doorbell_cnt = 0; - static bool L_log_first_fail = FALSE; + static uint16_t L_trace_every_count = 1; DCOM_DBG("0. Wait for Master\n"); @@ -490,63 +489,63 @@ void task_dcom_wait_for_master( task_t *i_self) // counter L_no_master_doorbell_cnt++; - if (L_no_master_doorbell_cnt >= APSS_DATA_FAIL_PMAX_RAIL) + if (L_no_master_doorbell_cnt % L_trace_every_count == 0) { - // If we fail to receive the Master doorbell for this long, take - // action TRAC_INFO("task_dcom_wait_for_master: experiencing data collection problems! fail_count=%i", L_no_master_doorbell_cnt); + } + + if (L_no_master_doorbell_cnt == APSS_DATA_FAIL_PMAX_RAIL) + { + // Now only trace every 50th occurrence + L_trace_every_count = 50; // Inform AMEC that Pmax_rail needs to change G_apss_lower_pmax_rail = TRUE; - if (!L_log_first_fail) - { - // Create and commit this error only once. - L_log_first_fail = TRUE; - TRAC_ERR("Detected a problem with slave data collection: soft time-out[%d]. Lowering Pmax_rail!", - APSS_DATA_FAIL_PMAX_RAIL); + // Create and commit this error only once + TRAC_ERR("Detected a problem with slave data collection: soft time-out[%d]. Lowering Pmax_rail!", + APSS_DATA_FAIL_PMAX_RAIL); - /* @ - * @errortype - * @moduleid DCOM_MID_TASK_WAIT_FOR_MASTER - * @reasoncode APSS_SLV_SHORT_TIMEOUT - * @userdata1 Time-out value - * @userdata4 OCC_NO_EXTENDED_RC - * @devdesc Detected a problem with APSS data collection (short time-out) - */ - errlHndl_t l_errl = createErrl( - DCOM_MID_TASK_WAIT_FOR_MASTER, //modId - APSS_SLV_SHORT_TIMEOUT, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code - ERRL_SEV_INFORMATIONAL, //Severity - NULL, //Trace Buf - DEFAULT_TRACE_SIZE, //Trace Size - APSS_DATA_FAIL_PMAX_RAIL, //userdata1 - 0 //userdata2 - ); + /* @ + * @errortype + * @moduleid DCOM_MID_TASK_WAIT_FOR_MASTER + * @reasoncode APSS_SLV_SHORT_TIMEOUT + * @userdata1 Time-out value + * @userdata4 OCC_NO_EXTENDED_RC + * @devdesc Detected a problem with APSS data collection (short time-out) + */ + errlHndl_t l_errl = createErrl( + DCOM_MID_TASK_WAIT_FOR_MASTER, //modId + APSS_SLV_SHORT_TIMEOUT, //reasoncode + OCC_NO_EXTENDED_RC, //Extended reason code + ERRL_SEV_INFORMATIONAL, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + APSS_DATA_FAIL_PMAX_RAIL, //userdata1 + 0 //userdata2 + ); - // Callout to firmware - addCalloutToErrl(l_errl, - ERRL_CALLOUT_TYPE_COMPONENT_ID, - ERRL_COMPONENT_ID_FIRMWARE, - ERRL_CALLOUT_PRIORITY_MED); - - // Callout to processor - addCalloutToErrl(l_errl, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.proc_huid, - ERRL_CALLOUT_PRIORITY_LOW); - - // Callout to APSS - addCalloutToErrl(l_errl, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.apss_huid, - ERRL_CALLOUT_PRIORITY_LOW); - - setErrlActions(l_errl, ERRL_ACTIONS_MANUFACTURING_ERROR); - commitErrl(&l_errl); - } + // Callout to firmware + addCalloutToErrl(l_errl, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_MED); + + // Callout to processor + addCalloutToErrl(l_errl, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.proc_huid, + ERRL_CALLOUT_PRIORITY_LOW); + + // Callout to APSS + addCalloutToErrl(l_errl, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.apss_huid, + ERRL_CALLOUT_PRIORITY_LOW); + + setErrlActions(l_errl, ERRL_ACTIONS_MANUFACTURING_ERROR); + commitErrl(&l_errl); } if (L_no_master_doorbell_cnt == APSS_DATA_FAIL_MAX) @@ -698,12 +697,14 @@ void task_dcom_wait_for_master( task_t *i_self) } else { - TRAC_INFO("[%d] Restablished contact via doorbell from Master",(int) G_pbax_id.chip_id); + TRAC_INFO("[%d] Restablished contact via doorbell from Master (after %d ticks)", + (int) G_pbax_id.chip_id, L_no_master_doorbell_cnt); // Inform AMEC that Pmax_rail doesn't need to be lowered and reset // the no_master_doorbell counter G_apss_lower_pmax_rail = FALSE; L_no_master_doorbell_cnt = 0; + L_trace_every_count = 1; } // Got a multicast doorbell diff --git a/src/occ_405/dcom/dcomSlaveTx.c b/src/occ_405/dcom/dcomSlaveTx.c index 9f6f254..a92bb69 100755 --- a/src/occ_405/dcom/dcomSlaveTx.c +++ b/src/occ_405/dcom/dcomSlaveTx.c @@ -49,6 +49,7 @@ dcom_slv_outbox_doorbell_t G_dcom_slv_outbox_doorbell_tx; // Make sure that the Slave Outbox TX Buffer is 1kB, otherwise cause // error on the compile. STATIC_ASSERT( (NUM_BYTES_IN_SLAVE_OUTBOX != (sizeof(G_dcom_slv_outbox_tx))) ); +uint32_t G_slave_pbax_rc = 0; // Function Specification // @@ -156,6 +157,13 @@ void task_dcom_tx_slv_outbox( task_t *i_self) do { + // If there was a pbax_send failure, trace it here since we can't do it in the critical + // interrupt context. + if(G_slave_pbax_rc) + { + TRAC_ERR("task_dcom_tx_slv_outbox: PBAX Send Failure in transimitting doorbell - RC[%08X]", G_slave_pbax_rc); + } + // Build/setup outbox uint32_t l_addr_in_mem = dcom_build_slv_outbox(); uint32_t l_ssxrc = 0; @@ -324,7 +332,6 @@ void task_dcom_tx_slv_outbox( task_t *i_self) // End Function Specification void dcom_tx_slv_outbox_doorbell( void ) { - static bool l_error = FALSE; int l_pbarc = 0; uint64_t l_tmp =0; @@ -337,12 +344,8 @@ void dcom_tx_slv_outbox_doorbell( void ) l_tmp, SSX_MICROSECONDS(15)); - if ( l_pbarc != 0 && l_error == FALSE ) - { - // Failure occurred - // This is running in a critical interrupt context. Tracing not allowed! - l_error = TRUE; - } + // Set this global so we know to trace this in the non-critical interrupt context + G_slave_pbax_rc = l_pbarc; } #endif //_DCOMSLAVETOMASTER_C diff --git a/src/occ_405/dcom/dcom_service_codes.h b/src/occ_405/dcom/dcom_service_codes.h index 4c210de..2a28b04 100755 --- a/src/occ_405/dcom/dcom_service_codes.h +++ b/src/occ_405/dcom/dcom_service_codes.h @@ -40,6 +40,8 @@ enum dcomModuleId DCOM_MID_TASK_WAIT_FOR_MASTER = DCOM_COMP_ID | 0x07, DCOM_MID_ERROR_CHECK = DCOM_COMP_ID | 0x08, DCOM_MID_WAIT_FOR_MASTER = DCOM_COMP_ID | 0x09, + DCOM_MID_PBAX_ERROR_HANDLER = DCOM_COMP_ID | 0x0A, + DCOM_MID_BUILD_SLV_INBOX = DCOM_COMP_ID | 0x0B, }; #endif /* #ifndef _DCOM_SERVICE_CODES_H_ */ diff --git a/src/occ_405/main.c b/src/occ_405/main.c index 2b8affc..10baabe 100755 --- a/src/occ_405/main.c +++ b/src/occ_405/main.c @@ -952,17 +952,17 @@ bool read_ppmr_header(void) * * End Function Specification */ -bool read_oppb_params(const OCCPstateParmBlock* oppb_offset) +bool read_oppb_params() { int l_ssxrc = SSX_OK; uint32_t l_reasonCode = 0; uint32_t l_extReasonCode = OCC_NO_EXTENDED_RC; uint32_t userdata1 = 0; uint32_t userdata2 = 0; + const uint32_t oppb_address = PPMR_ADDRESS_HOMER + G_ppmr_header.oppb_offset; - MAIN_TRAC_INFO("read_oppb_params(0x%08X)", PPMR_ADDRESS_HOMER + oppb_offset); - create_tlb_entry(((uint32_t)PPMR_ADDRESS_HOMER + (uint32_t)oppb_offset), - sizeof(OCCPstateParmBlock)); + MAIN_TRAC_INFO("read_oppb_params(0x%08X)", oppb_address); + create_tlb_entry(oppb_address, sizeof(OCCPstateParmBlock)); do{ // use block copy engine to read the OPPB header @@ -971,8 +971,7 @@ bool read_oppb_params(const OCCPstateParmBlock* oppb_offset) // Set up a copy request l_ssxrc = bce_request_create(&pba_copy, // block copy object &G_pba_bcde_queue, // mainstore to sram copy engine - (uint32_t)PPMR_ADDRESS_HOMER + - (uint32_t)oppb_offset, // mainstore address + oppb_address, // mainstore address (uint32_t) &G_oppb, // sram starting address (size_t) sizeof(OCCPstateParmBlock), // size of copy SSX_WAIT_FOREVER, // no timeout @@ -1122,7 +1121,7 @@ void read_hcode_headers() CHECKPOINT(PPMR_IMAGE_HEADER_READ); // Read OCC pstates parameter block - if (read_oppb_params((OCCPstateParmBlock*)G_ppmr_header.oppb_offset) == FALSE) break; + if (read_oppb_params() == FALSE) break; CHECKPOINT(OPPB_IMAGE_HEADER_READ); // Read PGPE header file, extract OCC/PGPE Shared SRAM address and size, diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h index 4c6faf4..fadba70 100644 --- a/src/occ_405/occ_service_codes.h +++ b/src/occ_405/occ_service_codes.h @@ -94,6 +94,10 @@ enum occReasonCode INTERNAL_INVALID_INPUT_DATA = 0xB3, /// RTL detected a system checkstop OCC_SYSTEM_HALTED = 0xB5, + /// Reset PBA Queue due to PBAX Read Failure + PBAX_QUEUE_RESET = 0xBA, + /// PBAX failure sending doorbell to slaves + PBAX_ERROR = 0xBB, /// Request to read APSS data failed. APSS_GPE_FAILURE = 0xC0, /// AVS Bus output over-current reported diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c index 6d0067f..40e6b39 100755 --- a/src/occ_405/occbuildname.c +++ b/src/occ_405/occbuildname.c @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = #else -volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170425a\0" /*</BuildName>*/ ; +volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170427a\0" /*</BuildName>*/ ; #endif diff --git a/src/occ_405/pss/apss.c b/src/occ_405/pss/apss.c index 0896184..83eff11 100755 --- a/src/occ_405/pss/apss.c +++ b/src/occ_405/pss/apss.c @@ -228,7 +228,6 @@ void task_apss_start_pwr_meas(struct task *i_self) { int l_rc = 0; static bool L_scheduled = FALSE; - static bool L_idle_traced = FALSE; static bool L_ffdc_collected = FALSE; // Create/schedule GPE_start_pwr_meas_read (non-blocking) @@ -238,17 +237,9 @@ void task_apss_start_pwr_meas(struct task *i_self) { if (!async_request_is_idle(&G_meas_start_request.request)) { - if (!L_idle_traced) - { - INTR_TRAC_INFO("E>task_apss_start_pwr_meas: request is not idle."); - L_idle_traced = TRUE; - } + INTR_TRAC_INFO("E>task_apss_start_pwr_meas: request is not idle."); break; } - else - { - L_idle_traced = FALSE; - } // Check if we need to try recovering the apss if(G_apss_recovery_requested) @@ -404,7 +395,6 @@ void task_apss_continue_pwr_meas(struct task *i_self) { int l_rc = 0; static bool L_scheduled = FALSE; - static bool L_idle_traced = FALSE; static bool L_ffdc_collected = FALSE; // Create/schedule GPE_apss_continue_pwr_meas_read (non-blocking) @@ -414,17 +404,9 @@ void task_apss_continue_pwr_meas(struct task *i_self) { if (!async_request_is_idle(&G_meas_cont_request.request)) { - if (!L_idle_traced) - { - INTR_TRAC_INFO("E>task_apss_continue_pwr_meas: request is not idle."); - L_idle_traced = TRUE; - } + INTR_TRAC_INFO("E>task_apss_continue_pwr_meas: request is not idle."); break; } - else - { - L_idle_traced = FALSE; - } //Don't run anything if apss recovery is in progress if(G_apss_recovery_requested) @@ -643,7 +625,6 @@ void task_apss_complete_pwr_meas(struct task *i_self) { int l_rc = 0; static bool L_scheduled = FALSE; - static bool L_idle_traced = FALSE; static bool L_ffdc_collected = FALSE; // Create/schedule GPE_apss_complete_pwr_meas_read (non-blocking) @@ -653,17 +634,9 @@ void task_apss_complete_pwr_meas(struct task *i_self) { if (!async_request_is_idle(&G_meas_complete_request.request)) { - if (!L_idle_traced) - { - INTR_TRAC_INFO("E>task_apss_complete_pwr_meas: request is not idle."); - L_idle_traced = TRUE; - } + INTR_TRAC_INFO("E>task_apss_complete_pwr_meas: request is not idle."); break; } - else - { - L_idle_traced = FALSE; - } if(G_apss_recovery_requested) { // Allow apss measurement to proceed on next tick |