From f3c513e40abb822d78c0a83d7bf874d30eb733a1 Mon Sep 17 00:00:00 2001 From: Dan Crowell Date: Fri, 28 Feb 2014 13:14:36 -0600 Subject: Improve FSI PIB2OPB Error Recovery Modified which error bits are checked in the PIB2OPB status as well as changing a few error reset functions. Change-Id: I27676947983f0b66c940d68bbd5f134912749ad9 CQ: SW248395 Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/9238 Tested-by: Jenkins Server Reviewed-by: Michael Baiocchi Reviewed-by: A. Patrick Williams III --- src/build/tools/listdeps.pl | 3 + src/include/usr/fsi/fsiif.H | 14 +- src/include/usr/hwas/hwasplatreasoncodes.H | 2 + src/usr/fsi/fsidd.C | 283 +++++++++++++++------ src/usr/fsi/fsidd.H | 22 +- src/usr/fsiscom/fsiscom.C | 46 +++- src/usr/hwas/hwasPlat.C | 47 ++++ .../hwpf/hwp/activate_powerbus/activate_powerbus.C | 15 ++ src/usr/pore/poreve/porevesrc/pib2cfam.C | 6 +- 9 files changed, 339 insertions(+), 99 deletions(-) diff --git a/src/build/tools/listdeps.pl b/src/build/tools/listdeps.pl index 124c9e2a4..35bacd5b8 100755 --- a/src/build/tools/listdeps.pl +++ b/src/build/tools/listdeps.pl @@ -202,6 +202,9 @@ my %resident_modules = ( "libi2c.so" => '1', "libutil.so" => '1', "libibscom.so" => '1', + "libfsiscom.so" => '1', + "libfsi.so" => '1', + "libscan.so" => '1', ); diff --git a/src/include/usr/fsi/fsiif.H b/src/include/usr/fsi/fsiif.H index 8d1b5f15b..4aa67f4fb 100644 --- a/src/include/usr/fsi/fsiif.H +++ b/src/include/usr/fsi/fsiif.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2011,2013 */ +/* COPYRIGHT International Business Machines Corp. 
2011,2014 */ /* */ /* p1 */ /* */ @@ -71,6 +71,7 @@ enum fsiFFDCType_t FFDC_READWRITE_FAIL = 1, FFDC_PIB_FAIL = 2, FFDC_OPB_FAIL = 3, + FFDC_OPB_FAIL_SLAVE = 4, }; /** @@ -86,7 +87,16 @@ enum fsiFFDCType_t */ void getFsiFFDC( fsiFFDCType_t i_ffdc_type, errlHndl_t &io_log, - TARGETING::Target* i_target); + TARGETING::Target* i_target ); + +/** + * @brief Cleanup the FSI PIB2OPB logic on the procs + * + * @param[in] i_target Proc Chip Target to reset + * + * @return errlHndl_t NULL on success + */ +errlHndl_t resetPib2Opb( TARGETING::Target* i_target ); /** diff --git a/src/include/usr/hwas/hwasplatreasoncodes.H b/src/include/usr/hwas/hwasplatreasoncodes.H index fc14e2a0d..48c8c41f2 100644 --- a/src/include/usr/hwas/hwasplatreasoncodes.H +++ b/src/include/usr/hwas/hwasplatreasoncodes.H @@ -35,6 +35,7 @@ namespace HWAS MOD_HOST_DISCOVER_TARGETS = 0x80, MOD_HOST_GARD = 0x81, MOD_PLAT_DECONFIG_GARD = 0x82, + MOD_PLAT_READIDEC = 0x83, }; enum HwasPlatReasonCode @@ -44,6 +45,7 @@ namespace HWAS RC_TOP_LEVEL_TARGET_NULL = HWAS_COMP_ID | 0x80, RC_TARGET_NOT_GARDABLE = HWAS_COMP_ID | 0x81, RC_GARD_REPOSITORY_FULL = HWAS_COMP_ID | 0x82, + RC_BAD_CHIPID = HWAS_COMP_ID | 0x83, }; }; diff --git a/src/usr/fsi/fsidd.C b/src/usr/fsi/fsidd.C index c45b4fbf9..a12ca306e 100644 --- a/src/usr/fsi/fsidd.C +++ b/src/usr/fsi/fsidd.C @@ -296,6 +296,14 @@ void getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, errlHndl_t &i_log, } } +/** + * @brief Cleanup the FSI PIB2OPB logic on the procs + */ +errlHndl_t resetPib2Opb( TARGETING::Target* i_target ) +{ + return Singleton::instance().resetPib2Opb( i_target ); +} + }; //end FSI namespace @@ -384,12 +392,6 @@ errlHndl_t FsiDD::write(TARGETING::Target* i_target, return l_err; } - - -/******************** - Internal Methods - ********************/ - /** * @brief Initialize the FSI hardware */ @@ -492,6 +494,10 @@ errlHndl_t FsiDD::initializeHardware() ++t_itr; } + // Cleanup any initial error states + l_err = resetPib2Opb( iv_master ); + if( l_err ) { 
break; } + // setup the local master control regs for the MFSI l_err = initMasterControl(iv_master,TARGETING::FSI_MASTER_TYPE_MFSI); if( l_err ) @@ -645,26 +651,34 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, { errlHndl_t tmp_err = NULL; + // Use this call to find the OPB Master to read + FsiAddrInfo_t addr_info( i_target, 0x12345678 ); + tmp_err = genFullFsiAddr( addr_info ); + if( tmp_err ) + { + delete tmp_err; + return; + } + // Figure out which control regs to use for FFDC regs - FsiChipInfo_t fsi_info = getFsiInfo( i_target ); - uint64_t ctl_reg = getControlReg(fsi_info.type); + uint64_t ctl_reg = getControlReg(addr_info.accessInfo.type); // Add data to error log where possible uint32_t data = 0; - ERRORLOG::ErrlUserDetailsLogRegister l_eud_fsiT(i_target); + ERRORLOG::ErrlUserDetailsLogRegister l_eud_fsiT(addr_info.opbTarg); uint64_t dump_regs[] = { + ctl_reg|FSI_MATRB0_1D8, + ctl_reg|FSI_MDTRB0_1DC, ctl_reg|FSI_MESRB0_1D0, ctl_reg|FSI_MAESP0_050, ctl_reg|FSI_MAEB_070, ctl_reg|FSI_MSCSB0_1D4, - ctl_reg|FSI_MATRB0_1D8, - ctl_reg|FSI_MDTRB0_1DC }; for( size_t x=0; x<(sizeof(dump_regs)/sizeof(dump_regs[0])); x++ ) { - tmp_err = read( dump_regs[x], &data ); + tmp_err = read( addr_info.opbTarg, dump_regs[x], &data ); if( tmp_err ) { delete tmp_err; @@ -681,7 +695,7 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, for( size_t p = 0; p < 8; p++ ) { uint32_t addr1 = ctl_reg|(FSI_MSTAP0_0D0+p*0x4); - tmp_err = read( addr1, &data ); + tmp_err = read( addr_info.opbTarg, addr1, &data ); if( tmp_err ) { delete tmp_err; @@ -701,6 +715,8 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, else if( FSI::FFDC_PIB_FAIL == i_ffdc_type ) { errlHndl_t tmp_err = NULL; + FsiChipInfo_t fsi_info = getFsiInfo( i_target ); + ERRORLOG::ErrlUserDetailsLogRegister regdata(iv_master); regdata.addData(DEVICE_XSCOM_ADDRESS(0x00020001ull)); regdata.addToLog(io_log); @@ -718,7 +734,7 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, uint32_t databuf = 32; 
for( size_t x=0; x<(sizeof(dump_regs)/sizeof(dump_regs[0])); x++ ) { - tmp_err = read( i_target, dump_regs[x], &databuf ); + tmp_err = read( fsi_info.master, dump_regs[x], &databuf ); if( tmp_err ) { delete tmp_err; @@ -744,6 +760,10 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, { // Read some error regs from scom ERRORLOG::ErrlUserDetailsLogRegister l_scom_data(i_target); + // What I thought I wrote last... + l_scom_data.addDataBuffer(&iv_lastOpbCmd, + sizeof(iv_lastOpbCmd), + DEVICE_XSCOM_ADDRESS(0xFF00000000020000ull)); // OPB Regs l_scom_data.addData(DEVICE_XSCOM_ADDRESS(0x00020000ull)); l_scom_data.addData(DEVICE_XSCOM_ADDRESS(0x00020001ull)); @@ -756,14 +776,81 @@ void FsiDD::getFsiFFDC(FSI::fsiFFDCType_t i_ffdc_type, l_scom_data.addData(DEVICE_XSCOM_ADDRESS(0x0002000Aull)); // Other suggestions from Markus Cebulla l_scom_data.addData(DEVICE_XSCOM_ADDRESS(0x0005001Cull));//SBE_VITAL - l_scom_data.addData(DEVICE_XSCOM_ADDRESS(0x0005001Cull));//SBE_VITAL + l_scom_data.addData(DEVICE_XSCOM_ADDRESS(0x00010005ull));//Secure reg l_scom_data.addToLog(io_log); } + else if( FSI::FFDC_OPB_FAIL_SLAVE == i_ffdc_type ) + { + errlHndl_t tmp_err = NULL; + // Find the OPB Master and then collect FFDC_OPB_FAIL + FsiAddrInfo_t addr_info( i_target, 0x12345678 ); + tmp_err = genFullFsiAddr( addr_info ); + if( tmp_err ) + { + delete tmp_err; + } + else + { + getFsiFFDC( FSI::FFDC_OPB_FAIL, + io_log, + addr_info.opbTarg ); + } + } return; } +/** + * @brief Cleanup the FSI PIB2OPB logic on the procs + * + * @param[in] i_target Proc Chip Target to reset + * + * @return errlHndl_t NULL on success + */ +errlHndl_t FsiDD::resetPib2Opb( TARGETING::Target* i_target ) +{ + errlHndl_t errhdl = NULL; + TRACFCOMP(g_trac_fsi, "FsiDD::resetPib2Opb(%.8X)>", TARGETING::get_huid(i_target) ); + + do { + // Clear out OPB error + uint64_t scom_data = 0; + size_t scom_size = sizeof(scom_data); + + uint64_t opbaddr = FSI2OPB_OFFSET_0 | OPB_REG_RES; + scom_data = 0x8000000000000000; 
//0=Unit Reset + errhdl = deviceOp( DeviceFW::WRITE, + i_target, + &scom_data, + scom_size, + DEVICE_XSCOM_ADDRESS(opbaddr) ); + if( errhdl ) { break; } + + opbaddr = FSI2OPB_OFFSET_0 | OPB_REG_STAT; + errhdl = deviceOp( DeviceFW::WRITE, + i_target, + &scom_data, + scom_size, + DEVICE_XSCOM_ADDRESS(opbaddr) ); + if( errhdl ) { break; } + + // Check if we have any errors left + opbaddr = FSI2OPB_OFFSET_0 | OPB_REG_STAT; + scom_data = 0; + errhdl = deviceOp( DeviceFW::READ, + i_target, + &scom_data, + scom_size, + DEVICE_XSCOM_ADDRESS(opbaddr) ); + if( errhdl ) { break; } + TRACFCOMP( g_trac_fsi, "PIB2OPB Status (%.8X->%.8X) after cleanup = %.16X", TARGETING::get_huid(i_target), opbaddr, scom_data ); + } while(0); + + return errhdl; +} + + /******************** Internal Methods ********************/ @@ -775,6 +862,7 @@ FsiDD::FsiDD() :iv_master(NULL) ,iv_ffdcTask(0) ,iv_opbErrorMask(OPB_STAT_ERR_ANY) +,iv_lastOpbCmd(0) { TRACFCOMP(g_trac_fsi, "FsiDD::FsiDD()>"); @@ -852,6 +940,7 @@ errlHndl_t FsiDD::read(FsiAddrInfo_t& i_addrInfo, errlHndl_t l_err = NULL; bool need_unlock = false; mutex_t* l_mutex = NULL; + *o_buffer = 0xDEADBEEF; do { // setup the OPB command register @@ -872,11 +961,20 @@ errlHndl_t FsiDD::read(FsiAddrInfo_t& i_addrInfo, need_unlock = true; } + // make sure there are no other ops running before we start + l_err = pollForComplete( i_addrInfo, NULL ); + if( l_err ) + { + TRACFCOMP(g_trac_fsi, "FsiDD::read> FSI Errors before doing read operation : %.8X->%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr ); + break; + } + // always read/write 64 bits to SCOM size_t scom_size = sizeof(uint64_t); // write the OPB command register to trigger the read - TRACUCOMP(g_trac_fsi, "FsiDD::read> ScomWRITE : opbaddr=%.16llX, data=%.16llX", opbaddr, fsicmd ); + iv_lastOpbCmd = fsicmd; + TRACUCOMP(g_trac_fsi, "FsiDD::read> ScomWRITE to %.8X: opbaddr=%.16llX, data=%.16llX", TARGETING::get_huid(i_addrInfo.opbTarg), opbaddr, fsicmd ); l_err = deviceOp( 
DeviceFW::WRITE, i_addrInfo.opbTarg, &fsicmd, @@ -892,6 +990,7 @@ errlHndl_t FsiDD::read(FsiAddrInfo_t& i_addrInfo, l_err = pollForComplete( i_addrInfo, o_buffer ); if( l_err ) { + TRACFCOMP(g_trac_fsi, "FsiDD::read> FSI Errors after doing read operation : %.8X->%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr ); break; } @@ -903,10 +1002,10 @@ errlHndl_t FsiDD::read(FsiAddrInfo_t& i_addrInfo, } // atomic section << - - TRACRCOMP(g_trac_fsir, "FSI READ : %.6X = %.8X", i_addrInfo.absAddr, *o_buffer ); } while(0); + TRACRCOMP(g_trac_fsir, "FSI READ : %.8X->%.6X = %.8X", TARGETING::get_huid(i_addrInfo.opbTarg), i_addrInfo.absAddr, *o_buffer ); + if( need_unlock ) { mutex_unlock(l_mutex); @@ -928,7 +1027,7 @@ errlHndl_t FsiDD::write(FsiAddrInfo_t& i_addrInfo, mutex_t* l_mutex = NULL; do { - TRACRCOMP(g_trac_fsir, "FSI WRITE : %.6X = %.8X", i_addrInfo.absAddr, *i_buffer ); + TRACRCOMP(g_trac_fsir, "FSI WRITE : %.8X->%.6X = %.8X", TARGETING::get_huid(i_addrInfo.opbTarg), i_addrInfo.absAddr, *i_buffer ); // pull out the data to write (length has been verified) uint32_t fsidata = *i_buffer; @@ -943,8 +1042,8 @@ errlHndl_t FsiDD::write(FsiAddrInfo_t& i_addrInfo, uint64_t opbaddr = genOpbScomAddr(i_addrInfo,OPB_REG_CMD); // atomic section >> - l_mutex - = (i_addrInfo.opbTarg)->getHbMutexAttr(); + l_mutex = (i_addrInfo.opbTarg)-> + getHbMutexAttr(); if( (iv_ffdcTask == 0) // performance hack for typical case || (iv_ffdcTask != task_gettid()) ) @@ -953,8 +1052,17 @@ errlHndl_t FsiDD::write(FsiAddrInfo_t& i_addrInfo, need_unlock = true; } + // make sure there are no other ops running before we start + l_err = pollForComplete( i_addrInfo, NULL ); + if( l_err ) + { + TRACFCOMP(g_trac_fsi, "FsiDD::write> FSI Errors before doing write operation : %.8X->%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr ); + break; + } + // write the OPB command register - TRACUCOMP(g_trac_fsi, "FsiDD::write> ScomWRITE : opbaddr=%.16llX, data=%.16llX", opbaddr, 
fsicmd ); + iv_lastOpbCmd = fsicmd; + TRACUCOMP(g_trac_fsi, "FsiDD::write> ScomWRITE to %.8X: opbaddr=%.16llX, data=%.16llX", TARGETING::get_huid(i_addrInfo.opbTarg), opbaddr, fsicmd ); l_err = deviceOp( DeviceFW::WRITE, i_addrInfo.opbTarg, &fsicmd, @@ -970,6 +1078,7 @@ errlHndl_t FsiDD::write(FsiAddrInfo_t& i_addrInfo, l_err = pollForComplete( i_addrInfo, NULL ); if( l_err ) { + TRACFCOMP(g_trac_fsi, "FsiDD::write> FSI Errors after doing write operation : %.8X->%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr ); break; } @@ -1003,7 +1112,19 @@ errlHndl_t FsiDD::handleOpbErrors(FsiAddrInfo_t& i_addrInfo, { errlHndl_t l_err = NULL; - if( (i_opbStatReg & iv_opbErrorMask) + // Do not look at error bits for the Master we're not using + uint32_t l_opbErrorMask = iv_opbErrorMask; + if( i_addrInfo.accessInfo.type == TARGETING::FSI_MASTER_TYPE_CMFSI ) + { + l_opbErrorMask &= ~OPB_STAT_ERR_MFSI; + } + else + { + l_opbErrorMask &= ~OPB_STAT_ERR_CMFSI; + } + + // Fail if there is a relevant error bit or the op never finished + if( (i_opbStatReg & l_opbErrorMask) || (i_opbStatReg & OPB_STAT_BUSY) ) { // If we're already in the middle of handling an error and we failed @@ -1018,14 +1139,15 @@ errlHndl_t FsiDD::handleOpbErrors(FsiAddrInfo_t& i_addrInfo, return l_err; // just leave } - TRACFCOMP( g_trac_fsi, "FsiDD::handleOpbErrors> Error during FSI access to %.8X : relAddr=0x%X, absAddr=0x%X, OPB Status=0x%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr, i_addrInfo.absAddr, i_opbStatReg ); + TRACFCOMP( g_trac_fsi, "FsiDD::handleOpbErrors> Error during FSI access to %.8X : relAddr=0x%X, absAddr=%.8X->%.6X, OPB Status=0x%.8X, l_opbErrorMask=%.8X", TARGETING::get_huid(i_addrInfo.fsiTarg), i_addrInfo.relAddr, TARGETING::get_huid(i_addrInfo.opbTarg), i_addrInfo.absAddr, i_opbStatReg, l_opbErrorMask ); /*@ * @errortype * @moduleid FSI::MOD_FSIDD_HANDLEOPBERRORS * @reasoncode FSI::RC_OPB_ERROR - * @userdata1[0:31] Relative FSI Address + * 
@userdata1[00:31] Relative FSI Address * @userdata1[32:63] Absolute FSI Address - * @userdata2 OPB Status Register + * @userdata2[00:31] OPB Status Register + * @userdata2[32:63] FSI Master HUID * @devdesc FsiDD::handleOpbErrors> Error during FSI access */ l_err = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_UNRECOVERABLE, @@ -1034,10 +1156,11 @@ errlHndl_t FsiDD::handleOpbErrors(FsiAddrInfo_t& i_addrInfo, TWO_UINT32_TO_UINT64( i_addrInfo.relAddr, i_addrInfo.absAddr), - TWO_UINT32_TO_UINT64(i_opbStatReg,0)); + TWO_UINT32_TO_UINT64(i_opbStatReg, + TARGETING::get_huid(i_addrInfo.opbTarg))); //mask off the bits we're ignoring before looking closer - uint32_t l_opb_stat = (i_opbStatReg & iv_opbErrorMask); + uint32_t l_opb_stat = (i_opbStatReg & l_opbErrorMask); /* OPB_errAck @@ -1098,9 +1221,11 @@ errlHndl_t FsiDD::handleOpbErrors(FsiAddrInfo_t& i_addrInfo, if( !root_cause_found ) { // read the Status Bridge0 Register + FsiChipInfo_t fsi_info = getFsiInfo( i_addrInfo.fsiTarg ); + uint64_t ctl_reg = getControlReg(fsi_info.type); uint32_t mesrb0_data = 0; tmp_err = read( i_addrInfo.accessInfo.master, - FSI_MESRB0_1D0, + ctl_reg|FSI_MESRB0_1D0, &mesrb0_data ); if( tmp_err ) { @@ -1226,9 +1351,20 @@ errlHndl_t FsiDD::pollForComplete(FsiAddrInfo_t& i_addrInfo, uint32_t* o_readData) { errlHndl_t l_err = NULL; - enum { MAX_OPB_TIMEOUT_NS = 15*NS_PER_MSEC }; //=15ms + enum { MAX_OPB_TIMEOUT_NS = 10*NS_PER_MSEC }; //=10ms do { + // Do not look at error bits for the Master we're not using + uint32_t l_opbErrorMask = iv_opbErrorMask; + if( i_addrInfo.accessInfo.type == TARGETING::FSI_MASTER_TYPE_CMFSI ) + { + l_opbErrorMask &= ~OPB_STAT_ERR_MFSI; + } + else + { + l_opbErrorMask &= ~OPB_STAT_ERR_CMFSI; + } + // poll for complete uint32_t read_data[2]; size_t scom_size = sizeof(uint64_t); @@ -1260,7 +1396,7 @@ errlHndl_t FsiDD::pollForComplete(FsiAddrInfo_t& i_addrInfo, // check for completion or error TRACUCOMP(g_trac_fsi, "FsiDD::pollForComplete> ScomREAD : read_data[0]=%.8llX", 
read_data[0] ); if( ((read_data[0] & OPB_STAT_BUSY) == 0) //not busy - || (read_data[0] & iv_opbErrorMask) ) //error bits + || (read_data[0] & l_opbErrorMask) ) //error bits { break; } @@ -1270,6 +1406,14 @@ errlHndl_t FsiDD::pollForComplete(FsiAddrInfo_t& i_addrInfo, } while( elapsed_time_ns <= MAX_OPB_TIMEOUT_NS ); // hardware has 1ms limit if( l_err ) { break; } + // check if we got an error from the OPB + // (will also check for busy/timeout) + l_err = handleOpbErrors( i_addrInfo, read_data[0] ); + if( l_err ) + { + break; + } + // we should never timeout because the hardware should set an error if( elapsed_time_ns > MAX_OPB_TIMEOUT_NS ) { @@ -1316,20 +1460,17 @@ errlHndl_t FsiDD::pollForComplete(FsiAddrInfo_t& i_addrInfo, l_err, i_addrInfo.opbTarg ); + //Clear out the error indication so that we can + // do subsequent FSI operations + errlHndl_t tmp_err = errorCleanup( i_addrInfo, FSI::RC_OPB_ERROR ); + if(tmp_err) { delete tmp_err; } + l_err->collectTrace(FSI_COMP_NAME); l_err->collectTrace(FSIR_TRACE_BUF); break; } - // check if we got an error from the OPB - // (will also check for busy/timeout) - l_err = handleOpbErrors( i_addrInfo, read_data[0] ); - if( l_err ) - { - break; - } - // read valid isn't on if( o_readData ) // only check if we're doing a read { @@ -1537,6 +1678,8 @@ errlHndl_t FsiDD::genFullFsiAddr(FsiAddrInfo_t& io_addrInfo) !(iv_master->getAttr() == TARGETING::MODEL_VENICE) ) //@fixme-RTC:35041 { + //use the local proc to drive the operation instead of + // going through the master proc indirectly io_addrInfo.opbTarg = io_addrInfo.accessInfo.master; // Note: no need to append the MFSI port since it is now local } @@ -1763,40 +1906,16 @@ errlHndl_t FsiDD::initMasterControl(TARGETING::Target* i_master, l_err = genFullFsiAddr(addr_info); if( l_err ) { break; } + // Ensure we don't have any errors before we even start uint32_t scom_data[2] = {}; size_t scom_size = sizeof(scom_data); - - uint64_t opbaddr = genOpbScomAddr(addr_info,OPB_REG_RES); - 
scom_data[0] = 0; scom_data[1] = 0; - l_err = deviceOp( DeviceFW::WRITE, - iv_master, - scom_data, - scom_size, - DEVICE_XSCOM_ADDRESS(opbaddr) ); - if( l_err ) { break; } - - opbaddr = genOpbScomAddr(addr_info,OPB_REG_STAT); - scom_data[0] = 0; scom_data[1] = 0; - l_err = deviceOp( DeviceFW::WRITE, - iv_master, - scom_data, - scom_size, - DEVICE_XSCOM_ADDRESS(opbaddr) ); - if( l_err ) { break; } - - // Ensure we don't have any errors before we even start - opbaddr = genOpbScomAddr(addr_info,OPB_REG_STAT); + uint64_t opbaddr = genOpbScomAddr(addr_info,OPB_REG_STAT); l_err = deviceOp( DeviceFW::READ, iv_master, scom_data, scom_size, DEVICE_XSCOM_ADDRESS(opbaddr) ); if( l_err ) { break; } - // Trace initial state for debug - TRACFCOMP(g_trac_fsi,"Scom %0.8X = %0.8X %0.8X", - opbaddr, - scom_data[0], - scom_data[1]); l_err = handleOpbErrors( addr_info, scom_data[0] ); if( l_err ) { @@ -1858,7 +1977,7 @@ errlHndl_t FsiDD::initMasterControl(TARGETING::Target* i_master, databuf = 0x50040400; //Setup timeout so that: - // code(15ms) > masterproc (0.9ms) > remote fsi master (0.8ms) + // code(10ms) > masterproc (0.9ms) > remote fsi master (0.8ms) if( i_master == iv_master ) { // 26:27= Timeout (b01) = 0.9ms @@ -2211,25 +2330,19 @@ errlHndl_t FsiDD::errorCleanup( FsiAddrInfo_t& i_addrInfo, do { if( FSI::RC_OPB_ERROR == i_errType ) { - // Clear out OPB error - uint64_t scomdata = 0; - size_t scom_size = sizeof(uint64_t); - l_err = deviceOp( DeviceFW::WRITE, - i_addrInfo.opbTarg, - &scomdata, - scom_size, - DEVICE_XSCOM_ADDRESS(0x00020001ull) ); + //Clear out the pib2opb logic for the master + // that failed + l_err = resetPib2Opb( i_addrInfo.opbTarg ); if(l_err) break; } else if( FSI::RC_ERROR_IN_MAEB == i_errType ) { - //Reset the port to clear up the residual errors - // 1= Port: Error reset - uint32_t data = 0x40000000; - uint64_t mresp0_reg = getControlReg(i_addrInfo.accessInfo.type) - | FSI_MRESP0_0D0 - | (i_addrInfo.accessInfo.port*4); - l_err = write( mresp0_reg, &data 
); + //Reset the bridge to clear up the residual errors + // 0=Bridge: General reset + uint32_t data = 0x80000000; + uint64_t mesrb0_reg = getControlReg(i_addrInfo.accessInfo.type) + | FSI_MESRB0_1D0; + l_err = write( i_addrInfo.opbTarg, mesrb0_reg, &data ); if(l_err) break; } @@ -2260,9 +2373,9 @@ errlHndl_t FsiDD::checkForErrors( FsiAddrInfo_t& i_addrInfo ) { errlHndl_t l_err = NULL; - if( i_addrInfo.fsiTarg == iv_master ) + if( i_addrInfo.fsiTarg == i_addrInfo.opbTarg ) { - //nothing to check here in operations directed at master proc + //nothing to check here in operations directed at FSI Master return NULL; } @@ -2278,7 +2391,7 @@ errlHndl_t FsiDD::checkForErrors( FsiAddrInfo_t& i_addrInfo ) l_err = read( i_addrInfo.accessInfo.master, maeb_reg, &maeb_data ); if( !l_err && (maeb_data != 0) ) { - TRACFCOMP( g_trac_fsi, "FsiDD::read> Error after read of %.8X, MAEB=%lX", TARGETING::get_huid(i_addrInfo.fsiTarg), maeb_data ); + TRACFCOMP( g_trac_fsi, "FsiDD::checkForErrors> After op to %.8X, MAEB=%lX (Master=%.8X)", TARGETING::get_huid(i_addrInfo.fsiTarg), maeb_data, TARGETING::get_huid(i_addrInfo.opbTarg) ); /*@ * @errortype * @moduleid FSI::MOD_FSIDD_CHECKFORERRORS @@ -2312,6 +2425,8 @@ errlHndl_t FsiDD::checkForErrors( FsiAddrInfo_t& i_addrInfo ) //Reset the port to clean up residual errors errorCleanup(i_addrInfo,FSI::RC_ERROR_IN_MAEB); } + + iv_ffdcTask = 0; } return l_err; diff --git a/src/usr/fsi/fsidd.H b/src/usr/fsi/fsidd.H index 755330b33..99150c277 100644 --- a/src/usr/fsi/fsidd.H +++ b/src/usr/fsi/fsidd.H @@ -87,7 +87,7 @@ class FsiDD * @param[in] i_type FSI Master Type (MFSI or cMFSI) * @param[in] i_port Slave port number * @param[out] o_detected Bitstring of detected slaves - * + * * @return bool true if port sensed as active during FSI initialization */ bool isSlavePresent( TARGETING::Target* i_fsiMaster, @@ -100,7 +100,7 @@ class FsiDD * * @param[in] i_target * @param[out] o_detected Bitstring of detected slaves - * + * * @return bool true if port 
sensed as active during FSI initialization */ bool isSlavePresent( TARGETING::Target* i_target, @@ -121,6 +121,15 @@ class FsiDD errlHndl_t &io_log, TARGETING::Target* i_target ); + /** + * @brief Cleanup the FSI PIB2OPB logic on the procs + * + * @param[in] i_target Proc Chip Target to reset + * + * @return errlHndl_t NULL on success + */ + errlHndl_t resetPib2Opb( TARGETING::Target* i_target ); + protected: /** * @brief Constructor @@ -391,7 +400,7 @@ class FsiDD OPB_STAT_BUSY = 0x00010000, /**< Bit 15 is the Busy bit */ OPB_STAT_READ_VALID = 0x00020000, /**< Bit 14 is the Valid Read bit */ OPB_STAT_ERRACK = 0x00100000, /**< 11 is OPB errAck */ - OPB_STAT_ERR_OPB = 0x09F00000, /**< 4,7-11 are OPB errors */ + OPB_STAT_ERR_OPB = 0xFFFC0000, /**< 0-13 are OPB errors */ OPB_STAT_ERR_CMFSI = 0x0000FC00, /**< 16-21 are cMFSI errors */ OPB_STAT_ERR_MFSI = 0x000000FC, /**< 24-29 are MFSI errors */ OPB_STAT_ERR_ANY = (OPB_STAT_ERR_OPB | @@ -529,7 +538,7 @@ class FsiDD * Active slaves, 1 bit per port, 1=active, * one entry per MFSI port, plus local MFSI and local cMFSI */ - uint8_t iv_slaves[MAX_SLAVE_PORTS+2]; + uint8_t iv_slaves[MAX_SLAVE_PORTS+2]; /** * Master processor target @@ -546,6 +555,11 @@ class FsiDD */ uint32_t iv_opbErrorMask; + /** + * Last OPB Command + */ + uint64_t iv_lastOpbCmd; + private: // let my testcase poke around diff --git a/src/usr/fsiscom/fsiscom.C b/src/usr/fsiscom/fsiscom.C index ea3631d05..4d390360a 100644 --- a/src/usr/fsiscom/fsiscom.C +++ b/src/usr/fsiscom/fsiscom.C @@ -67,19 +67,49 @@ void pib_error_handler( TARGETING::Target* i_target, //Add this target to the FFDC ERRORLOG::ErrlUserDetailsTarget(i_target,"SCOM Target").addToLog(i_errlog); - //Add the callouts for the specific PCB/PIB error - uint32_t pib_error = i_status >> 12; - PIB::addFruCallouts( i_target, - pib_error, - i_errlog ); - - //Grab the PIB2OPB Status reg for a Resource Occupied error - if( pib_error == PIB::PIB_RESOURCE_OCCUPIED ) //piberr=001 + //Look for a totally
dead chip + if( i_status == 0xFFFFFFFF ) { + // if things are this broken then chances are there are bigger + // problems, we can just make some guesses on what to call out + + // make code the highest since there are other issues + i_errlog->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH); + + // callout this chip as Low and deconfigure it + i_errlog->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_LOW, + HWAS::DECONFIG, + HWAS::GARD_NULL ); + + // grab all the FFDC we can think of + FSI::getFsiFFDC( FSI::FFDC_OPB_FAIL_SLAVE, + i_errlog, + i_target ); + FSI::getFsiFFDC( FSI::FFDC_READWRITE_FAIL, + i_errlog, + i_target ); FSI::getFsiFFDC( FSI::FFDC_PIB_FAIL, i_errlog, i_target ); } + else + { + //Add the callouts for the specific PCB/PIB error + uint32_t pib_error = i_status >> 12; + PIB::addFruCallouts( i_target, + pib_error, + i_errlog ); + + //Grab the PIB2OPB Status reg for a Resource Occupied error + if( pib_error == PIB::PIB_RESOURCE_OCCUPIED ) //piberr=001 + { + FSI::getFsiFFDC( FSI::FFDC_PIB_FAIL, + i_errlog, + i_target ); + } + } //Recovery sequence from Markus // if SCOM fails and FSI Master displays "MasterTimeOut" diff --git a/src/usr/hwas/hwasPlat.C b/src/usr/hwas/hwasPlat.C index f690dddd6..4a80846ed 100644 --- a/src/usr/hwas/hwasPlat.C +++ b/src/usr/hwas/hwasPlat.C @@ -42,6 +42,7 @@ #include #include +#include namespace HWAS { @@ -95,6 +96,52 @@ errlHndl_t platReadIDEC(const TargetHandle_t &i_target) DEVICE_FSI_ADDRESS(0x01028)); } + //Look for a totally dead chip + if( (errl == NULL) + && ((id_ec & 0xFFFFFFFF00000000) == 0xFFFFFFFF00000000) ) + { + HWAS_ERR("All FFs for chipid read on %.8X",TARGETING::get_huid(i_target)); + /*@ + * @errortype + * @moduleid HWAS::MOD_PLAT_READIDEC + * @reasoncode HWAS::RC_BAD_CHIPID + * @userdata1 Target HUID + * @userdata2 + * @devdesc platReadIDEC> Invalid chipid from hardware (all FFs) + */ + errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + HWAS::MOD_PLAT_READIDEC, +
HWAS::RC_BAD_CHIPID, + TARGETING::get_huid(i_target), + 0); + + // if things are this broken then chances are there are bigger + // problems, we can just make some guesses on what to call out + + // make code the highest since there are other issues + errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH); + + // callout this chip as Low and deconfigure it + errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_LOW, + HWAS::DECONFIG, + HWAS::GARD_NULL ); + + // Grab all the FFDC we can think of + FSI::getFsiFFDC( FSI::FFDC_OPB_FAIL_SLAVE, + errl, + i_target ); + FSI::getFsiFFDC( FSI::FFDC_READWRITE_FAIL, + errl, + i_target ); + FSI::getFsiFFDC( FSI::FFDC_PIB_FAIL, + errl, + i_target ); + + } + if (errl == NULL) { // no error, so we got a valid ID/EC value back // EC - nibbles 0,2 diff --git a/src/usr/hwpf/hwp/activate_powerbus/activate_powerbus.C b/src/usr/hwpf/hwp/activate_powerbus/activate_powerbus.C index e7b3385dc..fc7a69136 100644 --- a/src/usr/hwpf/hwp/activate_powerbus/activate_powerbus.C +++ b/src/usr/hwpf/hwp/activate_powerbus/activate_powerbus.C @@ -59,6 +59,7 @@ #include "proc_build_smp/proc_build_smp.H" #include +#include namespace ACTIVATE_POWERBUS { @@ -286,6 +287,20 @@ void* call_proc_build_smp( void *io_pArgs ) // Turn off FSI scom and turn on Xscom.
l_proc_target->setAttr(l_switches); + + // Reset the FSI2OPB logic on the new chips + l_errl = FSI::resetPib2Opb(l_proc_target); + if(l_errl) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR : resetPib2Opb on %.8X", + TARGETING::get_huid(l_proc_target)); + // Create IStep error log and cross reference error that occurred + l_StepError.addErrorDetails(l_errl); + // Commit error + errlCommit( l_errl, HWPF_COMP_ID ); + break; + } } } diff --git a/src/usr/pore/poreve/porevesrc/pib2cfam.C b/src/usr/pore/poreve/porevesrc/pib2cfam.C index cfd19d455..7e8fe4d9b 100644 --- a/src/usr/pore/poreve/porevesrc/pib2cfam.C +++ b/src/usr/pore/poreve/porevesrc/pib2cfam.C @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2013 */ +/* COPYRIGHT International Business Machines Corp. 2012,2014 */ /* */ /* p1 */ /* */ @@ -101,6 +101,8 @@ Pib2Cfam::operation(Transaction& io_transaction) me = ME_SUCCESS; } else { me = ME_FAILURE; + //@todo CQ:SW248690 - need a better way to catch these + fapiLogError( rc, fapi::FAPI_ERRL_SEV_UNRECOVERABLE ); } break; default: @@ -131,6 +133,8 @@ Pib2Cfam::operation(Transaction& io_transaction) me = ME_SUCCESS; } else { me = ME_FAILURE; + //@todo CQ:SW248690 - need a better way to catch these + fapiLogError( rc, fapi::FAPI_ERRL_SEV_UNRECOVERABLE ); } break; -- cgit v1.2.1