diff options
author | mbroyles <mbroyles@us.ibm.com> | 2018-07-10 13:43:15 -0500 |
---|---|---|
committer | Martha Broyles <mbroyles@us.ibm.com> | 2018-07-12 11:14:57 -0400 |
commit | b8a8037ca194fc690ff1a859b5c0ddf08e708b81 (patch) | |
tree | 217876bae0867bb9374cfc04c44a71b19242a10c /src/occ_405/cent | |
parent | 6d556b9b95fd84ca8d4a652cf1a08ffb3b613d07 (diff) | |
download | talos-occ-b8a8037ca194fc690ff1a859b5c0ddf08e708b81.tar.gz talos-occ-b8a8037ca194fc690ff1a859b5c0ddf08e708b81.zip |
Prevent calling out Centaurs on clock failover
CQ: SW437405
Change-Id: I1057d70bc6673b9d08a95573f00c9268f00dd126
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/62157
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Diffstat (limited to 'src/occ_405/cent')
-rwxr-xr-x | src/occ_405/cent/centaur_control.c | 65 | ||||
-rwxr-xr-x | src/occ_405/cent/centaur_control.h | 2 | ||||
-rwxr-xr-x | src/occ_405/cent/centaur_data.c | 13 |
3 files changed, 9 insertions, 71 deletions
diff --git a/src/occ_405/cent/centaur_control.c b/src/occ_405/cent/centaur_control.c index e002bed..407bbdd 100755 --- a/src/occ_405/cent/centaur_control.c +++ b/src/occ_405/cent/centaur_control.c @@ -440,69 +440,4 @@ void centaur_control_init( void ) return; } -bool check_centaur_checkstop(memory_control_task_t * i_memControlTask ) -{ - errlHndl_t l_err = NULL; - int cent = i_memControlTask->curMemIndex; - // Check if the centaur has a channel checkstop. If it does, - // then do not log any errors. We also don't want to throttle - // a centaur that is in this condition. - if(G_centaur_control_reg_parms.error.rc != CENTAUR_CHANNEL_CHECKSTOP) - { - TRAC_ERR("task_memory_control: IPC_ST_CENTAUR_SCOM failed. " - "cent=%d rc=%x, index=0x%08x", - cent, G_centaur_control_reg_parms.error.rc, - G_centaur_control_reg_parms.error.addr); - - /* @ - * @errortype - * @moduleid CENT_CONTROL_MOD - * @reasoncode CENT_SCOM_ERROR - * @userdata1 rc - Return code of scom operation - * @userdata2 index of scom operation that failed - * @userdata4 OCC_NO_EXTENDED_RC - * @devdesc OCC access to centaur failed - */ - l_err = createErrl( - CENT_CONTROL_MOD, // modId - CENT_SCOM_ERROR, // reasoncode - OCC_NO_EXTENDED_RC, // Extended reason code - ERRL_SEV_PREDICTIVE, // Severity - NULL, // Trace Buf - DEFAULT_TRACE_SIZE, // Trace Size - G_centaur_control_reg_parms.error.rc, // userdata1 - G_centaur_control_reg_parms.error.addr // userdata2 - ); - - addUsrDtlsToErrl(l_err, //io_err - (uint8_t *) &(i_memControlTask->gpe_req.ffdc), //i_dataPtr, - sizeof(GpeFfdc), //i_size - ERRL_USR_DTL_STRUCT_VERSION_1, //version - ERRL_USR_DTL_BINARY_DATA); //type - - //callout the centaur - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.centaur_huids[cent], - ERRL_CALLOUT_PRIORITY_MED); - - //callout the processor - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.proc_huid, - ERRL_CALLOUT_PRIORITY_MED); - - commitErrl(&l_err); - - return FALSE; // error was not a channel checkstop - } - else - { - // Remove the centaur sensor and all dimm sensors behind it. - cent_chan_checkstop(cent); - } - return TRUE; // Centaur channel checkstop - -} - diff --git a/src/occ_405/cent/centaur_control.h b/src/occ_405/cent/centaur_control.h index b6bb817..08832de 100755 --- a/src/occ_405/cent/centaur_control.h +++ b/src/occ_405/cent/centaur_control.h @@ -67,6 +67,4 @@ bool centaur_control( memory_control_task_t * i_memControlTask ); //void centaur_control_init( void ) INIT_SECTION; void centaur_control_init( void ); -bool check_centaur_checkstop( memory_control_task_t * i_memControlTask ); - #endif //_CENTAUR_CONTROL_H diff --git a/src/occ_405/cent/centaur_data.c b/src/occ_405/cent/centaur_data.c index 8b08030..ec27b9d 100755 --- a/src/occ_405/cent/centaur_data.c +++ b/src/occ_405/cent/centaur_data.c @@ -328,7 +328,9 @@ void cent_recovery(uint32_t i_cent) ERRL_CALLOUT_TYPE_HUID, G_sysConfigData.proc_huid, ERRL_CALLOUT_PRIORITY_MED); - commitErrl(&l_err); + + // recovery is failing, ask for OCC reset to try to recover + REQUEST_RESET(l_err); } } @@ -671,9 +673,9 @@ void centaur_data( void ) else // log the error if it was not a CENTAUR_CHANNEL_CHECKSTOP { //log an error the first time this happens but keep on running. + //This should be informational (except mfg) since we are going to retry //eventually, we will timeout on the dimm & centaur temps not being updated - //and fans will go to max speed (probably won't be able to throttle for - //same reason we can't access the centaur here). + //if this is a hard failure which will call out the Centaur at that point. if(!L_gpe_error_logged) { L_gpe_error_logged = TRUE; @@ -696,13 +698,16 @@ void centaur_data( void ) CENT_TASK_DATA_MOD, //modId CENT_SCOM_ERROR, //reasoncode OCC_NO_EXTENDED_RC, //Extended reason code - ERRL_SEV_PREDICTIVE, //Severity + ERRL_SEV_INFORMATIONAL, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size l_parms->error.rc, //userdata1 0 //userdata2 ); + //force severity to predictive if mfg ipl (allows callout to be added to info error) + setErrlActions(l_err, ERRL_ACTIONS_MANUFACTURING_ERROR); + addUsrDtlsToErrl(l_err, //io_err (uint8_t *) &(l_centaur_data_ptr->gpe_req.ffdc), //i_dataPtr, sizeof(GpeFfdc), //i_size |