diff options
author | mbroyles <mbroyles@us.ibm.com> | 2018-07-10 13:43:15 -0500 |
---|---|---|
committer | Martha Broyles <mbroyles@us.ibm.com> | 2018-07-12 11:14:57 -0400 |
commit | b8a8037ca194fc690ff1a859b5c0ddf08e708b81 (patch) | |
tree | 217876bae0867bb9374cfc04c44a71b19242a10c | |
parent | 6d556b9b95fd84ca8d4a652cf1a08ffb3b613d07 (diff) | |
download | talos-occ-b8a8037ca194fc690ff1a859b5c0ddf08e708b81.tar.gz talos-occ-b8a8037ca194fc690ff1a859b5c0ddf08e708b81.zip |
Prevent calling out Centaurs on clock failover
CQ: SW437405
Change-Id: I1057d70bc6673b9d08a95573f00c9268f00dd126
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/62157
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
-rwxr-xr-x | src/occ_405/cent/centaur_control.c | 65 |
-rwxr-xr-x | src/occ_405/cent/centaur_control.h | 2 |
-rwxr-xr-x | src/occ_405/cent/centaur_data.c | 13 |
-rwxr-xr-x | src/occ_405/main.c | 5 |
-rw-r--r-- | src/occ_405/mem/memory.c | 28 |
-rwxr-xr-x | src/occ_405/occbuildname.c | 2 |
6 files changed, 26 insertions, 89 deletions
diff --git a/src/occ_405/cent/centaur_control.c b/src/occ_405/cent/centaur_control.c index e002bed..407bbdd 100755 --- a/src/occ_405/cent/centaur_control.c +++ b/src/occ_405/cent/centaur_control.c @@ -440,69 +440,4 @@ void centaur_control_init( void ) return; } -bool check_centaur_checkstop(memory_control_task_t * i_memControlTask ) -{ - errlHndl_t l_err = NULL; - int cent = i_memControlTask->curMemIndex; - // Check if the centaur has a channel checkstop. If it does, - // then do not log any errors. We also don't want to throttle - // a centaur that is in this condition. - if(G_centaur_control_reg_parms.error.rc != CENTAUR_CHANNEL_CHECKSTOP) - { - TRAC_ERR("task_memory_control: IPC_ST_CENTAUR_SCOM failed. " - "cent=%d rc=%x, index=0x%08x", - cent, G_centaur_control_reg_parms.error.rc, - G_centaur_control_reg_parms.error.addr); - - /* @ - * @errortype - * @moduleid CENT_CONTROL_MOD - * @reasoncode CENT_SCOM_ERROR - * @userdata1 rc - Return code of scom operation - * @userdata2 index of scom operation that failed - * @userdata4 OCC_NO_EXTENDED_RC - * @devdesc OCC access to centaur failed - */ - l_err = createErrl( - CENT_CONTROL_MOD, // modId - CENT_SCOM_ERROR, // reasoncode - OCC_NO_EXTENDED_RC, // Extended reason code - ERRL_SEV_PREDICTIVE, // Severity - NULL, // Trace Buf - DEFAULT_TRACE_SIZE, // Trace Size - G_centaur_control_reg_parms.error.rc, // userdata1 - G_centaur_control_reg_parms.error.addr // userdata2 - ); - - addUsrDtlsToErrl(l_err, //io_err - (uint8_t *) &(i_memControlTask->gpe_req.ffdc), //i_dataPtr, - sizeof(GpeFfdc), //i_size - ERRL_USR_DTL_STRUCT_VERSION_1, //version - ERRL_USR_DTL_BINARY_DATA); //type - - //callout the centaur - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.centaur_huids[cent], - ERRL_CALLOUT_PRIORITY_MED); - - //callout the processor - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.proc_huid, - ERRL_CALLOUT_PRIORITY_MED); - - commitErrl(&l_err); - - return FALSE; // error was not a 
channel checkstop - } - else - { - // Remove the centaur sensor and all dimm sensors behind it. - cent_chan_checkstop(cent); - } - return TRUE; // Centaur channel checkstop - -} - diff --git a/src/occ_405/cent/centaur_control.h b/src/occ_405/cent/centaur_control.h index b6bb817..08832de 100755 --- a/src/occ_405/cent/centaur_control.h +++ b/src/occ_405/cent/centaur_control.h @@ -67,6 +67,4 @@ bool centaur_control( memory_control_task_t * i_memControlTask ); //void centaur_control_init( void ) INIT_SECTION; void centaur_control_init( void ); -bool check_centaur_checkstop( memory_control_task_t * i_memControlTask ); - #endif //_CENTAUR_CONTROL_H diff --git a/src/occ_405/cent/centaur_data.c b/src/occ_405/cent/centaur_data.c index 8b08030..ec27b9d 100755 --- a/src/occ_405/cent/centaur_data.c +++ b/src/occ_405/cent/centaur_data.c @@ -328,7 +328,9 @@ void cent_recovery(uint32_t i_cent) ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.proc_huid, ERRL_CALLOUT_PRIORITY_MED); - commitErrl(&l_err); + + // recovery is failing, ask for OCC reset to try to recover + REQUEST_RESET(l_err); } } @@ -671,9 +673,9 @@ void centaur_data( void ) else // log the error if it was not a CENTAUR_CHANNEL_CHECKSTOP { //log an error the first time this happens but keep on running. + //This should be informational (except mfg) since we are going to retry //eventually, we will timeout on the dimm & centaur temps not being updated - //and fans will go to max speed (probably won't be able to throttle for - //same reason we can't access the centaur here). + //if this is a hard failure which will call out the Centaur at that point. 
if(!L_gpe_error_logged) { L_gpe_error_logged = TRUE; @@ -696,13 +698,16 @@ void centaur_data( void ) CENT_TASK_DATA_MOD, //modId CENT_SCOM_ERROR, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity + ERRL_SEV_INFORMATIONAL, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size l_parms->error.rc, //userdata1 0 //userdata2 ); + //force severity to predictive if mfg ipl (allows callout to be added to info error) + setErrlActions(l_err, ERRL_ACTIONS_MANUFACTURING_ERROR); + addUsrDtlsToErrl(l_err, //io_err (uint8_t *) &(l_centaur_data_ptr->gpe_req.ffdc), //i_dataPtr, sizeof(GpeFfdc), //i_size diff --git a/src/occ_405/main.c b/src/occ_405/main.c index 2e36807..623fc87 100755 --- a/src/occ_405/main.c +++ b/src/occ_405/main.c @@ -1597,11 +1597,12 @@ void Main_thread_routine(void *private) // Look for FIR collection flag and status if (G_fir_collection_required && !L_fir_collection_completed) { - TRAC_IMP("fir data collection starting"); // If this OCC is the FIR master and PNOR access is allowed perform // FIR collection if (OCC_IS_FIR_MASTER()) { + TRAC_IMP("fir data collection starting"); + //Need to schedule a task on GPE to start fir collection if(!G_fir_collection_request_created) //Only need to create request once { @@ -1629,6 +1630,7 @@ void Main_thread_routine(void *private) G_fir_collection_required = FALSE; } } + TRAC_IMP("fir data collection done"); } // Error reporting is skipped while FIR collection is required so we @@ -1640,7 +1642,6 @@ void Main_thread_routine(void *private) { notify_host(INTR_REASON_HTMGT_SERVICE_REQUIRED); } - TRAC_IMP("fir data collection done"); } if( l_ssxrc == SSX_OK) diff --git a/src/occ_405/mem/memory.c b/src/occ_405/mem/memory.c index d485fc8..f7f692d 100644 --- a/src/occ_405/mem/memory.c +++ b/src/occ_405/mem/memory.c @@ -87,7 +87,6 @@ void task_memory_control( task_t * i_task ) int rc = 0; // Return code uint8_t memIndex; static bool L_gpe_scheduled = FALSE; - static uint8_t 
L_gpe_fail_logged = 0; static bool L_gpe_idle_traced = FALSE; static bool L_gpe_had_1_tick = FALSE; @@ -147,24 +146,23 @@ void task_memory_control( task_t * i_task ) { if(!async_request_completed(&memControlTask->gpe_req.request) || gpe_rc) { - if (MEM_TYPE_CUMULUS == G_sysConfigData.mem_type) + // ignore error and stop monitoring this centaur if there is a channel checkstop + if( (MEM_TYPE_CUMULUS == G_sysConfigData.mem_type) && + (gpe_rc == CENTAUR_CHANNEL_CHECKSTOP) ) { - if(!(L_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> memIndex))) - { - if (!check_centaur_checkstop(memControlTask)) - { - L_gpe_fail_logged |= CENTAUR0_PRESENT_MASK >> memIndex; - } - } + // Remove the centaur sensor and all dimm sensors behind it. + cent_chan_checkstop(memControlTask->curMemIndex); } - //Request failed. Keep count of failures and request a reset if we reach a - //max retry count - L_scom_timeout[memIndex]++; - if(L_scom_timeout[memIndex] == MEMORY_CONTROL_SCOM_TIMEOUT) + else { - break; + //Request failed. Keep count of failures and request a reset if we reach a + //max retry count + L_scom_timeout[memIndex]++; + if(L_scom_timeout[memIndex] == MEMORY_CONTROL_SCOM_TIMEOUT) + { + break; + } } - }//if(!async_request_completed(&memControlTask->gpe_req.request) || l_parms->rc) else { diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c index 21c2ecf..eb61723 100755 --- a/src/occ_405/occbuildname.c +++ b/src/occ_405/occbuildname.c @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = #else -volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_180629a\0" /*</BuildName>*/ ; +volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_180711a\0" /*</BuildName>*/ ; #endif |