summaryrefslogtreecommitdiffstats
path: root/src/usr/fsi
diff options
context:
space:
mode:
authorDan Crowell <dcrowell@us.ibm.com>2014-03-21 12:23:11 -0500
committerA. Patrick Williams III <iawillia@us.ibm.com>2014-04-03 12:23:28 -0500
commit8aca5b75e857cb46eb34dc8739cffce88ae29a1e (patch)
tree0fef36d14f0ed7cc7c69c1a3f46f9cae464e099c /src/usr/fsi
parentb7ae1209c26699f076b2c8b14db761f625d69784 (diff)
downloadtalos-hostboot-8aca5b75e857cb46eb34dc8739cffce88ae29a1e.tar.gz
talos-hostboot-8aca5b75e857cb46eb34dc8739cffce88ae29a1e.zip
Improved handling for bad scom/fsi failures
Added more error recovery to the FSI logic to handle cases where a bad scom would leave the hardware in an error state for the next IPL. See SW252028 for details on the failing scenario that exposed the problem. CQ: SW253637 Backport: release-fips810 Change-Id: I37c5625414247c7a65c4b1a7d631c6764c3606fc Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/9840 Tested-by: Jenkins Server Reviewed-by: Michael Baiocchi <baiocchi@us.ibm.com> Reviewed-by: Corey V. Swenson <cswenson@us.ibm.com> Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src/usr/fsi')
-rw-r--r--src/usr/fsi/fsidd.C77
-rw-r--r--src/usr/fsi/fsidd.H16
2 files changed, 79 insertions, 14 deletions
diff --git a/src/usr/fsi/fsidd.C b/src/usr/fsi/fsidd.C
index 9b51799af..6b25bf6dd 100644
--- a/src/usr/fsi/fsidd.C
+++ b/src/usr/fsi/fsidd.C
@@ -678,7 +678,8 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type,
// Add data to error log where possible
uint32_t data = 0;
- ERRORLOG::ErrlUserDetailsLogRegister l_eud_fsiT(addr_info.opbTarg);
+ ERRORLOG::ErrlUserDetailsLogRegister
+ l_eud_fsiT(addr_info.accessInfo.master);
uint64_t dump_regs[] = {
ctl_reg|FSI_MATRB0_1D8,
@@ -691,7 +692,7 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type,
for( size_t x=0; x<(sizeof(dump_regs)/sizeof(dump_regs[0])); x++ )
{
- tmp_err = read( addr_info.opbTarg, dump_regs[x], &data );
+ tmp_err = read( addr_info.accessInfo.master, dump_regs[x], &data );
if( tmp_err )
{
delete tmp_err;
@@ -708,7 +709,7 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type,
for( size_t p = 0; p < 8; p++ )
{
uint32_t addr1 = ctl_reg|(FSI_MSTAP0_0D0+p*0x4);
- tmp_err = read( addr_info.opbTarg, addr1, &data );
+ tmp_err = read( addr_info.accessInfo.master, addr1, &data );
if( tmp_err )
{
delete tmp_err;
@@ -1023,6 +1024,7 @@ errlHndl_t FsiDD::read(FsiAddrInfo_t& i_addrInfo,
l_err = checkForErrors( i_addrInfo );
if( l_err )
{
+ TRACFCOMP(g_trac_fsi, "FsiDD::read> FSI Errors after doing read operation : %.8X->%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr );
break;
}
@@ -1111,6 +1113,7 @@ errlHndl_t FsiDD::write(FsiAddrInfo_t& i_addrInfo,
l_err = checkForErrors( i_addrInfo );
if( l_err )
{
+ TRACFCOMP(g_trac_fsi, "FsiDD::write> FSI Errors after doing write operation : %.8X->%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr );
break;
}
@@ -1326,7 +1329,7 @@ errlHndl_t FsiDD::handleOpbErrors(FsiAddrInfo_t& i_addrInfo,
l_err->addHwCallout( i_addrInfo.fsiTarg,
HWAS::SRCI_PRIORITY_HIGH,
HWAS::DELAYED_DECONFIG,
- HWAS::GARD_Predictive );
+ HWAS::GARD_NULL );
root_cause_found = true;
break;
@@ -1962,13 +1965,19 @@ errlHndl_t FsiDD::initMasterControl(TARGETING::Target* i_master,
scom_size,
DEVICE_XSCOM_ADDRESS(opbaddr) );
if( l_err ) { break; }
+
+ // Temporarily ignore the master-specific errors
+ uint32_t old_mask = iv_opbErrorMask;
+ iv_opbErrorMask &= ~OPB_STAT_ERR_MFSI;
+ iv_opbErrorMask &= ~OPB_STAT_ERR_CMFSI;
l_err = handleOpbErrors( addr_info, scom_data[0] );
if( l_err )
{
TRACFCOMP(g_trac_fsi,"Unclearable FSI Errors present at the beginning, no choice but to fail");
+ iv_opbErrorMask = old_mask;
break;
}
-
+ iv_opbErrorMask = old_mask;
// Initialize the FSI Master regs if they aren't already setup
if( hb_doing_init )
@@ -2383,12 +2392,64 @@ errlHndl_t FsiDD::errorCleanup( FsiAddrInfo_t& i_addrInfo,
}
else if( FSI::RC_ERROR_IN_MAEB == i_errType )
{
+ uint32_t data = 0;
+
//Reset the bridge to clear up the residual errors
// 0=Bridge: General reset
- uint32_t data = 0x80000000;
+ data = 0x80000000;
uint64_t mesrb0_reg = getControlReg(i_addrInfo.accessInfo.type)
| FSI_MESRB0_1D0;
- l_err = write( i_addrInfo.opbTarg, mesrb0_reg, &data );
+ l_err = write( i_addrInfo.accessInfo.master, mesrb0_reg, &data );
+ if(l_err) break;
+
+ //perform error reset on Centaur fsi slave:
+ // write 0x4000000 to addr=834.
+ data = 0x4000000;
+ l_err = write( i_addrInfo.fsiTarg, FSI::SLRES_34, &data );
+ if(l_err) break;
+
+ //further step is to issue a PIB reset to the FSI2PIB engine
+ //in busy state, i.e. write arbitrary data to 101c
+ //(putcfam 1007) register of the previously failed FSI2PIB
+ //engine on Centaur.
+ data = 0xFFFFFFFF;
+ l_err = write( i_addrInfo.fsiTarg,FSI:: FSI2PIB_STATUS, &data );
+ if(l_err) break;
+
+ //then, write arbitrary data to 1018 (putcfam 1006) to
+ //reset any pending FSI2PIB errors.
+ data = 0xFFFFFFFF;
+ l_err = write( i_addrInfo.fsiTarg, FSI::FSI2PIB_RESET, &data );
+ if(l_err) break;
+
+ //Reset the master's bridge to clear up the residual errors
+ // unless the FSI master has no master above it
+ if( i_addrInfo.accessInfo.master != iv_master )
+ {
+ // 0=Bridge: General reset
+ data = 0x80000000;
+ mesrb0_reg = MFSI_CONTROL_REG | FSI_MESRB0_1D0;
+ l_err = write( iv_master, mesrb0_reg, &data );
+ if(l_err) break;
+ }
+
+ //Trace some values for FFDC in case this cleanup
+ // didn't really work
+ uint32_t grabregs[] = {
+ MFSI_CONTROL_REG|FSI_MSIEP0_030,
+ CMFSI_CONTROL_REG|FSI_MSIEP0_030,
+ MFSI_CONTROL_REG|FSI_MAEB_070,
+ CMFSI_CONTROL_REG|FSI_MAEB_070
+ };
+ for( size_t r = 0;
+ r < (sizeof(grabregs)/sizeof(grabregs[0]));
+ r++ )
+ {
+ l_err = read( i_addrInfo.accessInfo.master,
+ MFSI_CONTROL_REG|FSI_MSIEP0_030, &data );
+ if(l_err) break;
+ TRACFCOMP( g_trac_fsi, "errorCleanup> %.8X->%.6X = %.8X", TARGETING::get_huid(i_addrInfo.accessInfo.master), grabregs[r], data );
+ }
if(l_err) break;
}
@@ -2449,7 +2510,7 @@ errlHndl_t FsiDD::checkForErrors( FsiAddrInfo_t& i_addrInfo )
l_err = read( i_addrInfo.accessInfo.master, maeb_reg, &maeb_data );
if( !l_err && (maeb_data != 0) )
{
- TRACFCOMP( g_trac_fsi, "FsiDD::checkForErrors> After op to %.8X, MAEB=%.8X (Master=%.8X)", TARGETING::get_huid(i_addrInfo.fsiTarg), maeb_data, TARGETING::get_huid(i_addrInfo.opbTarg) );
+ TRACFCOMP( g_trac_fsi, "FsiDD::checkForErrors> After op to %.8X, MAEB(%.4X)=%.8X (Master=%.8X)", TARGETING::get_huid(i_addrInfo.fsiTarg), maeb_reg, maeb_data, TARGETING::get_huid(i_addrInfo.accessInfo.master) );
/*@
* @errortype
* @moduleid FSI::MOD_FSIDD_CHECKFORERRORS
diff --git a/src/usr/fsi/fsidd.H b/src/usr/fsi/fsidd.H
index d9901277f..b29ff9f9a 100644
--- a/src/usr/fsi/fsidd.H
+++ b/src/usr/fsi/fsidd.H
@@ -403,15 +403,18 @@ class FsiDD
FSI2OPB_OFFSET_1 = 0x00030000, /**< cMFSI 1 */
// Bit masks
- OPB_STAT_BUSY = 0x00010000, /**< Bit 15 is the Busy bit */
- OPB_STAT_READ_VALID = 0x00020000, /**< Bit 14 is the Valid Read bit */
+ OPB_STAT_BUSY = 0x00010000, /**< 15 is the Busy bit */
+ OPB_STAT_READ_VALID = 0x00020000, /**< 14 is the Valid Read bit */
OPB_STAT_ERRACK = 0x00100000, /**< 11 is OPB errAck */
- OPB_STAT_ERR_OPB = 0xFFFC0000, /**< 0-14 are OPB errors */
- OPB_STAT_ERR_CMFSI = 0x0000FC00, /**< 16-21 are cMFSI errors */
- OPB_STAT_ERR_MFSI = 0x000000FC, /**< 24-29 are MFSI errors */
+ OPB_STAT_ANYERR = 0x80000000, /**< 0 is Any error */
+ OPB_STAT_ERR_OPB = 0x7FEC0000, /**< 1:10,12:13 are OPB errors */
+ OPB_STAT_ERR_CMFSI = 0x0000FC00, /**< 16:21 are cMFSI errors */
+ OPB_STAT_ERR_MFSI = 0x000000FC, /**< 24:29 are MFSI errors */
OPB_STAT_ERR_ANY = (OPB_STAT_ERR_OPB |
OPB_STAT_ERR_CMFSI |
- OPB_STAT_ERR_MFSI),
+ OPB_STAT_ERR_MFSI |
+ OPB_STAT_ERRACK |
+ OPB_STAT_ANYERR ),
};
/**
@@ -428,6 +431,7 @@ class FsiDD
FSI_MSIEP0_030 = 0x030,
FSI_MAESP0_050 = 0x050,
FSI_MAEB_070 = 0x070, //MREFP0
+ FSI_MBSYP0_078 = 0x078,
FSI_MRESP0_0D0 = 0x0D0,
FSI_MSTAP0_0D0 = 0x0D0,
FSI_MRESP0_0D1 = 0x0D1,
OpenPOWER on IntegriCloud