summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: mbroyles <mbroyles@us.ibm.com> 2018-07-10 13:43:15 -0500
committer: Martha Broyles <mbroyles@us.ibm.com> 2018-07-12 11:14:57 -0400
commit: b8a8037ca194fc690ff1a859b5c0ddf08e708b81 (patch)
tree: 217876bae0867bb9374cfc04c44a71b19242a10c
parent: 6d556b9b95fd84ca8d4a652cf1a08ffb3b613d07 (diff)
download: talos-occ-b8a8037ca194fc690ff1a859b5c0ddf08e708b81.tar.gz
download: talos-occ-b8a8037ca194fc690ff1a859b5c0ddf08e708b81.zip
Prevent calling out Centaurs on clock failover
CQ: SW437405
Change-Id: I1057d70bc6673b9d08a95573f00c9268f00dd126
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/62157
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
-rwxr-xr-x  src/occ_405/cent/centaur_control.c  65
-rwxr-xr-x  src/occ_405/cent/centaur_control.h  2
-rwxr-xr-x  src/occ_405/cent/centaur_data.c  13
-rwxr-xr-x  src/occ_405/main.c  5
-rw-r--r--  src/occ_405/mem/memory.c  28
-rwxr-xr-x  src/occ_405/occbuildname.c  2
6 files changed, 26 insertions, 89 deletions
diff --git a/src/occ_405/cent/centaur_control.c b/src/occ_405/cent/centaur_control.c
index e002bed..407bbdd 100755
--- a/src/occ_405/cent/centaur_control.c
+++ b/src/occ_405/cent/centaur_control.c
@@ -440,69 +440,4 @@ void centaur_control_init( void )
return;
}
-bool check_centaur_checkstop(memory_control_task_t * i_memControlTask )
-{
- errlHndl_t l_err = NULL;
- int cent = i_memControlTask->curMemIndex;
- // Check if the centaur has a channel checkstop. If it does,
- // then do not log any errors. We also don't want to throttle
- // a centaur that is in this condition.
- if(G_centaur_control_reg_parms.error.rc != CENTAUR_CHANNEL_CHECKSTOP)
- {
- TRAC_ERR("task_memory_control: IPC_ST_CENTAUR_SCOM failed. "
- "cent=%d rc=%x, index=0x%08x",
- cent, G_centaur_control_reg_parms.error.rc,
- G_centaur_control_reg_parms.error.addr);
-
- /* @
- * @errortype
- * @moduleid CENT_CONTROL_MOD
- * @reasoncode CENT_SCOM_ERROR
- * @userdata1 rc - Return code of scom operation
- * @userdata2 index of scom operation that failed
- * @userdata4 OCC_NO_EXTENDED_RC
- * @devdesc OCC access to centaur failed
- */
- l_err = createErrl(
- CENT_CONTROL_MOD, // modId
- CENT_SCOM_ERROR, // reasoncode
- OCC_NO_EXTENDED_RC, // Extended reason code
- ERRL_SEV_PREDICTIVE, // Severity
- NULL, // Trace Buf
- DEFAULT_TRACE_SIZE, // Trace Size
- G_centaur_control_reg_parms.error.rc, // userdata1
- G_centaur_control_reg_parms.error.addr // userdata2
- );
-
- addUsrDtlsToErrl(l_err, //io_err
- (uint8_t *) &(i_memControlTask->gpe_req.ffdc), //i_dataPtr,
- sizeof(GpeFfdc), //i_size
- ERRL_USR_DTL_STRUCT_VERSION_1, //version
- ERRL_USR_DTL_BINARY_DATA); //type
-
- //callout the centaur
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_HUID,
- G_sysConfigData.centaur_huids[cent],
- ERRL_CALLOUT_PRIORITY_MED);
-
- //callout the processor
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_HUID,
- G_sysConfigData.proc_huid,
- ERRL_CALLOUT_PRIORITY_MED);
-
- commitErrl(&l_err);
-
- return FALSE; // error was not a channel checkstop
- }
- else
- {
- // Remove the centaur sensor and all dimm sensors behind it.
- cent_chan_checkstop(cent);
- }
- return TRUE; // Centaur channel checkstop
-
-}
-
diff --git a/src/occ_405/cent/centaur_control.h b/src/occ_405/cent/centaur_control.h
index b6bb817..08832de 100755
--- a/src/occ_405/cent/centaur_control.h
+++ b/src/occ_405/cent/centaur_control.h
@@ -67,6 +67,4 @@ bool centaur_control( memory_control_task_t * i_memControlTask );
//void centaur_control_init( void ) INIT_SECTION;
void centaur_control_init( void );
-bool check_centaur_checkstop( memory_control_task_t * i_memControlTask );
-
#endif //_CENTAUR_CONTROL_H
diff --git a/src/occ_405/cent/centaur_data.c b/src/occ_405/cent/centaur_data.c
index 8b08030..ec27b9d 100755
--- a/src/occ_405/cent/centaur_data.c
+++ b/src/occ_405/cent/centaur_data.c
@@ -328,7 +328,9 @@ void cent_recovery(uint32_t i_cent)
ERRL_CALLOUT_TYPE_HUID,
G_sysConfigData.proc_huid,
ERRL_CALLOUT_PRIORITY_MED);
- commitErrl(&l_err);
+
+ // recovery is failing, ask for OCC reset to try to recover
+ REQUEST_RESET(l_err);
}
}
@@ -671,9 +673,9 @@ void centaur_data( void )
else // log the error if it was not a CENTAUR_CHANNEL_CHECKSTOP
{
//log an error the first time this happens but keep on running.
+ //This should be informational (except mfg) since we are going to retry
//eventually, we will timeout on the dimm & centaur temps not being updated
- //and fans will go to max speed (probably won't be able to throttle for
- //same reason we can't access the centaur here).
+ //if this is a hard failure which will call out the Centaur at that point.
if(!L_gpe_error_logged)
{
L_gpe_error_logged = TRUE;
@@ -696,13 +698,16 @@ void centaur_data( void )
CENT_TASK_DATA_MOD, //modId
CENT_SCOM_ERROR, //reasoncode
OCC_NO_EXTENDED_RC, //Extended reason code
- ERRL_SEV_PREDICTIVE, //Severity
+ ERRL_SEV_INFORMATIONAL, //Severity
NULL, //Trace Buf
DEFAULT_TRACE_SIZE, //Trace Size
l_parms->error.rc, //userdata1
0 //userdata2
);
+ //force severity to predictive if mfg ipl (allows callout to be added to info error)
+ setErrlActions(l_err, ERRL_ACTIONS_MANUFACTURING_ERROR);
+
addUsrDtlsToErrl(l_err, //io_err
(uint8_t *) &(l_centaur_data_ptr->gpe_req.ffdc), //i_dataPtr,
sizeof(GpeFfdc), //i_size
diff --git a/src/occ_405/main.c b/src/occ_405/main.c
index 2e36807..623fc87 100755
--- a/src/occ_405/main.c
+++ b/src/occ_405/main.c
@@ -1597,11 +1597,12 @@ void Main_thread_routine(void *private)
// Look for FIR collection flag and status
if (G_fir_collection_required && !L_fir_collection_completed)
{
- TRAC_IMP("fir data collection starting");
// If this OCC is the FIR master and PNOR access is allowed perform
// FIR collection
if (OCC_IS_FIR_MASTER())
{
+ TRAC_IMP("fir data collection starting");
+
//Need to schedule a task on GPE to start fir collection
if(!G_fir_collection_request_created) //Only need to create request once
{
@@ -1629,6 +1630,7 @@ void Main_thread_routine(void *private)
G_fir_collection_required = FALSE;
}
}
+ TRAC_IMP("fir data collection done");
}
// Error reporting is skipped while FIR collection is required so we
@@ -1640,7 +1642,6 @@ void Main_thread_routine(void *private)
{
notify_host(INTR_REASON_HTMGT_SERVICE_REQUIRED);
}
- TRAC_IMP("fir data collection done");
}
if( l_ssxrc == SSX_OK)
diff --git a/src/occ_405/mem/memory.c b/src/occ_405/mem/memory.c
index d485fc8..f7f692d 100644
--- a/src/occ_405/mem/memory.c
+++ b/src/occ_405/mem/memory.c
@@ -87,7 +87,6 @@ void task_memory_control( task_t * i_task )
int rc = 0; // Return code
uint8_t memIndex;
static bool L_gpe_scheduled = FALSE;
- static uint8_t L_gpe_fail_logged = 0;
static bool L_gpe_idle_traced = FALSE;
static bool L_gpe_had_1_tick = FALSE;
@@ -147,24 +146,23 @@ void task_memory_control( task_t * i_task )
{
if(!async_request_completed(&memControlTask->gpe_req.request) || gpe_rc)
{
- if (MEM_TYPE_CUMULUS == G_sysConfigData.mem_type)
+ // ignore error and stop monitoring this centaur if there is a channel checkstop
+ if( (MEM_TYPE_CUMULUS == G_sysConfigData.mem_type) &&
+ (gpe_rc == CENTAUR_CHANNEL_CHECKSTOP) )
{
- if(!(L_gpe_fail_logged & (CENTAUR0_PRESENT_MASK >> memIndex)))
- {
- if (!check_centaur_checkstop(memControlTask))
- {
- L_gpe_fail_logged |= CENTAUR0_PRESENT_MASK >> memIndex;
- }
- }
+ // Remove the centaur sensor and all dimm sensors behind it.
+ cent_chan_checkstop(memControlTask->curMemIndex);
}
- //Request failed. Keep count of failures and request a reset if we reach a
- //max retry count
- L_scom_timeout[memIndex]++;
- if(L_scom_timeout[memIndex] == MEMORY_CONTROL_SCOM_TIMEOUT)
+ else
{
- break;
+ //Request failed. Keep count of failures and request a reset if we reach a
+ //max retry count
+ L_scom_timeout[memIndex]++;
+ if(L_scom_timeout[memIndex] == MEMORY_CONTROL_SCOM_TIMEOUT)
+ {
+ break;
+ }
}
-
}//if(!async_request_completed(&memControlTask->gpe_req.request) || l_parms->rc)
else
{
diff --git a/src/occ_405/occbuildname.c b/src/occ_405/occbuildname.c
index 21c2ecf..eb61723 100755
--- a/src/occ_405/occbuildname.c
+++ b/src/occ_405/occbuildname.c
@@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =
#else
-volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_180629a\0" /*</BuildName>*/ ;
+volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_180711a\0" /*</BuildName>*/ ;
#endif
OpenPOWER on IntegriCloud