diff options
Diffstat (limited to 'src/usr/ibscom/ibscom.C')
-rw-r--r-- | src/usr/ibscom/ibscom.C | 475 |
1 files changed, 371 insertions, 104 deletions
diff --git a/src/usr/ibscom/ibscom.C b/src/usr/ibscom/ibscom.C index 874f8f98c..abbc3f3c7 100644 --- a/src/usr/ibscom/ibscom.C +++ b/src/usr/ibscom/ibscom.C @@ -41,6 +41,9 @@ #include <limits.h> #include <errl/errludtarget.H> #include <xscom/piberror.H> +#include <diag/attn/attn.H> +#include <ibscom/ibscomif.H> +#include <targeting/common/utilFilter.H> // Easy macro replace for unit testing //#define TRACUCOMP(args...) TRACFCOMP(args) @@ -48,13 +51,16 @@ // Trace definition trace_desc_t* g_trac_ibscom = NULL; -TRAC_INIT(&g_trac_ibscom, "IBSCOM", KILOBYTE); +TRAC_INIT(&g_trac_ibscom, IBSCOM_COMP_NAME, KILOBYTE); using namespace ERRORLOG; using namespace TARGETING; namespace IBSCOM { +// SCOM Register addresses +const uint32_t MBS_FIR = 0x02011400; +const uint32_t MBSIBERR0 = 0x0201141B; // Register XSCcom access functions to DD framework DEVICE_REGISTER_ROUTE(DeviceFW::WILDCARD, @@ -296,6 +302,109 @@ errlHndl_t getTargetVirtualAddress(Target* i_target, return l_err; } +/////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +void err_cleanup(Target* i_target, + uint64_t i_addr) +{ + //Going to commit at most 1 informational error here + errlHndl_t l_err = NULL; + errlHndl_t tmp_err = NULL; + ERRORLOG::ErrlUserDetailsLogRegister l_logReg(i_target); + + uint64_t zeroData = 0x0; + size_t op_size = sizeof(uint64_t); + + // Clear our the status reg + op_size = sizeof(uint64_t); + tmp_err = deviceOp( DeviceFW::WRITE, + i_target, + &zeroData, + op_size, + DEVICE_FSISCOM_ADDRESS(MBSIBERR0) ); + if(tmp_err) + { + if( l_err ) + { + delete tmp_err; + } + else + { + l_err = tmp_err; + } + + //Really just want to save the address, so stick in some + //obvious dummy data + uint64_t dummyData = 0x00000000DEADBEEF; + l_logReg.addDataBuffer(&dummyData, sizeof(dummyData), + DEVICE_IBSCOM_ADDRESS(MBSIBERR0)); + } + + // Clear out the FIR bits we might trigger + uint64_t mbs_fir = 0; + op_size = sizeof(uint64_t); + tmp_err = deviceOp( DeviceFW::READ, + i_target, + &mbs_fir, + op_size, + DEVICE_FSISCOM_ADDRESS(MBS_FIR) ); + if(tmp_err) + { + if( l_err ) + { + delete tmp_err; + } + else + { + l_err = tmp_err; + } + + //Really just want to save the address, so stick in some + //obvious dummy data + uint64_t dummyData = 0x10000000DEADBEEF; + l_logReg.addDataBuffer(&dummyData, sizeof(dummyData), + DEVICE_IBSCOM_ADDRESS(MBS_FIR)); + } + + //22=MBS_FIR_MASK_REG_HOST_INBAND_READ_ERROR + //23=MBS_FIR_MASK_REG_HOST_INBAND_WRITE_ERROR + mbs_fir &= 0xFFFFFCFFFFFFFFFF; + op_size = sizeof(uint64_t); + l_err = deviceOp( DeviceFW::WRITE, + i_target, + &mbs_fir, + op_size, + DEVICE_FSISCOM_ADDRESS(MBS_FIR) ); + if(tmp_err) + { + if( l_err ) + { + delete tmp_err; + } + else + { + l_err = tmp_err; + } + + //Really just want to save the address, so stick in some + //obvious dummy data + uint64_t dummyData = 0x20000000DEADBEEF; + l_logReg.addDataBuffer(&dummyData, sizeof(dummyData), + DEVICE_IBSCOM_ADDRESS(MBS_FIR)); + } + + if( l_err ) + { + l_logReg.addToLog(l_err); + + //force to informational so we don't log extra errors + //inside of possible error collection paths + l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); + errlCommit(l_err,IBSCOM_COMP_ID); + l_err = NULL; + } +} + /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// @@ -351,6 +460,35 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, l_mutex = i_target->getHbMutexAttr<TARGETING::ATTR_IBSCOM_MUTEX>(); mutex_lock(l_mutex); need_unlock = true; + + //Need to check if ibscom is still enabled before moving on in + //case we flipped the switch due to an error + ScomSwitches l_switches = i_target->getAttr<ATTR_SCOM_SWITCHES>(); + if( !l_switches.useInbandScom ) + { + TRACFCOMP(g_trac_ibscom, ERR_MRK"doIBScom> IBSCOM longer enabled on %.8X, error must have occurred", get_huid(i_target)); + /*@ + * @errortype + * @moduleid IBSCOM_DO_IBSCOM + * @reasoncode IBSCOM_RETRY_DUE_TO_ERROR + * @userdata1[0:31] HUID of Centaur Target + * @userdata1[32:64] SCOM Address + * @userdata2 Not Used + * @devdesc Previous error disabled ibscom, so forcing + * a retry via FSI + */ + l_err = + new ErrlEntry(ERRL_SEV_UNRECOVERABLE, + IBSCOM_DO_IBSCOM, + IBSCOM_RETRY_DUE_TO_ERROR, + get_huid(i_target), + i_addr); + //This error should NEVER get returned to caller, so it's a + //FW bug if it actually gets comitted. + l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH); + break; + } } if (i_opType == DeviceFW::READ) @@ -391,6 +529,8 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, //FW bug if it actually gets comitted. l_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_HIGH); + ERRORLOG::ErrlUserDetailsTarget(i_target,"IBSCOM Target") + .addToLog(l_err); break; } else @@ -416,20 +556,19 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, l_virtAddr[i_addr] = l_data; eieio(); + //Workaround for HW264203 + //A read of MBSIBWRSTAT will not trigger a SUE so we need to + //read the MBS_FIR instead. TRACDCOMP(g_trac_ibscom, - "doIBScom: Read MBSIBWRSTAT to check for error"); - //Read MBSIBWRSTAT to check for errors - //If an error occured on last write, reading MBSIBWRSTAT will - //trigger a SUE. - const uint32_t MBSIBWRSTAT = 0x201141D; - uint64_t statData = 0; + "doIBScom: Read MBS_FIR to check for error"); + uint64_t fir_data = 0; size_t readSize = sizeof(uint64_t); l_err = doIBScom(DeviceFW::READ, - i_target, - &statData, - readSize, - MBSIBWRSTAT, - true); + i_target, + &fir_data, + readSize, + MBS_FIR, + true); if(l_err != NULL) { if( IBSCOM_SUE_IN_ERR_PATH == l_err->reasonCode() ) @@ -446,103 +585,122 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, break; } } + else + { + TRACUCOMP(g_trac_ibscom, "doIBScom: MBS_FIR=%.16X",fir_data); + //check the FIR bits specifically + //23 = MBS_FIR_MASK_REG_HOST_INBAND_WRITE_ERROR: A PIB error + // or inband buffer error was detected on a host inband + // write operation. + if( fir_data & 0x0000010000000000 ) + { + TRACFCOMP(g_trac_ibscom, ERR_MRK" doIBScom: MBS_FIR[23] detected after write : %.16X", fir_data); + rw_error = true; + } + } } + // Common error checking for both read and write if(rw_error) { bool busDown = false; TRACUCOMP(g_trac_ibscom, "doIBScom: Get Error data, read MBSIBERR0"); - const uint32_t MBSIBERR0 = 0x201141B; - const uint64_t HOST_ERROR_VALID = 0x0000000080000000; - const uint64_t PIB_ERROR_STATUS_MASK = 0x0000000070000000; - const uint64_t PIB_ERROR_SHIFT = 28; - size_t readSize = sizeof(uint64_t); - uint64_t mbsiberr0_data = 0; + size_t op_size = sizeof(uint64_t); + + // Note: Using FSISCOM path to read the errors even though + // we could use IBSCOM in DD2 because it makes code simpler + + MBSIBERRO_Reg_t mbsiberr0; + op_size = sizeof(uint64_t); + l_err = deviceOp( DeviceFW::READ, + i_target, + &(mbsiberr0.data), + op_size, + DEVICE_FSISCOM_ADDRESS(MBSIBERR0) ); + if(l_err) + { + TRACFCOMP(g_trac_ibscom, ERR_MRK + "doIBScom: Error reading MBSIBERR0 over FSI"); + //Save away the IBSCOM address + ERRORLOG::ErrlUserDetailsLogRegister l_logReg(i_target); + //Really just want to save the address, so stick in some + //obvious dummy data + uint64_t dummyData = 0x30000000DEADBEEF; + l_logReg.addDataBuffer(&dummyData, sizeof(dummyData), + DEVICE_IBSCOM_ADDRESS(i_addr)); + l_logReg.addToLog(l_err); + + //force to informational so we don't log extra errors + //inside of possible error collection paths + l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); + errlCommit(l_err,IBSCOM_COMP_ID); + l_err = NULL; + + //fabricate some error data + mbsiberr0.addr = i_addr; + mbsiberr0.errvalid = 1; + mbsiberr0.piberr = 0; + mbsiberr0.iswrite = (i_opType == DeviceFW::READ) ? 0 : 1; + mbsiberr0.reserved = 0xBADBAD; + } + + TRACUCOMP(g_trac_ibscom, + "doIBScom: MBSIBERR0(0x%.16x) = 0x%.16X", + MBSIBERR0, mbsiberr0.data); - //Use FSISCOM as workaround for DD1.x centaur chips (HW246298) - if(i_target->getAttr<TARGETING::ATTR_EC>() < 0x20) + //if the MBSIBERR0Q_IB_HOST_ERROR_VALID bit is not set + // then we have a bus failure + if( !(mbsiberr0.errvalid) ) { - //Need to explicitly use FSI SCOM in DD1X chips - l_err = deviceOp( DeviceFW::READ, - i_target, - &mbsiberr0_data, - readSize, - DEVICE_FSISCOM_ADDRESS(MBSIBERR0) ); - if(l_err) - { - TRACFCOMP(g_trac_ibscom, ERR_MRK - "doIBScom: Error reading MBSIBERR0 over FSI"); - //Save away the IBSCOM address - ERRORLOG::ErrlUserDetailsLogRegister - l_logReg(i_target); - //Really just want to save the addres, so stick in some - //obvious dummy data - uint64_t dummyData = 0x00000000DEADBEEF; - l_logReg.addDataBuffer(&dummyData, sizeof(dummyData), - DEVICE_IBSCOM_ADDRESS(i_addr)); - l_logReg.addToLog(l_err); - break; - } - TRACUCOMP(g_trac_ibscom, - "doIBScom: MBSIBERR0(0x%.16x) = 0x%.16X", - MBSIBERR0, mbsiberr0_data); + //Bus is down + busDown = true; + } + //confirm that we are looking at error data for the scom we did + //0:31 = MBSIBERR0Q_IB_HOST_ADDRESS: This is the 32 bit scom + // address that was being accessed when the error was detected. + else if( mbsiberr0.addr != i_addr ) + { + TRACFCOMP( g_trac_ibscom, "doIBScom> The address in MBSIBERR0 (0x%.8X) doesn't match what we were scomming (0x%.8X)", mbsiberr0.addr, i_addr ); + /*@ + * @errortype + * @moduleid IBSCOM_DO_IBSCOM + * @reasoncode IBSCOM_WRONG_ERROR + * @userdata1[0:31] HUID of Centaur Target + * @userdata1[32:64] SCOM Address + * @userdata2 Contents of MBSIBERR0 register + * @devdesc Detected error doesn't match the address + * we failed on + */ + l_err = new ErrlEntry(ERRL_SEV_UNRECOVERABLE, + IBSCOM_DO_IBSCOM, + IBSCOM_WRONG_ERROR, + TWO_UINT32_TO_UINT64( + get_huid(i_target), + i_addr), + mbsiberr0.data); + // this would be a code bug because we got out of sync somehow + l_err->addProcedureCallout( HWAS::EPUB_PRC_HB_CODE, + HWAS::SRCI_PRIORITY_HIGH ); + ERRORLOG::ErrlUserDetailsTarget(i_target,"IBSCOM Target") + .addToLog(l_err); + ERRORLOG::ErrlUserDetailsLogRegister ffdc(i_target); + ffdc.addData(DEVICE_FSISCOM_ADDRESS(MBS_FIR)); + ffdc.addData(DEVICE_FSISCOM_ADDRESS(MBSIBERR0)); + ffdc.addToLog(l_err); + l_err->collectTrace(IBSCOM_COMP_NAME); //attempt to clear the error register so future accesses //will work - uint64_t zeroData = 0x0; - readSize = sizeof(uint64_t); - l_err = deviceOp( DeviceFW::WRITE, - i_target, - &zeroData, - readSize, - DEVICE_FSISCOM_ADDRESS(MBSIBERR0) ); - if(l_err ) - { - errlCommit(l_err,IBSCOM_COMP_ID); - l_err = NULL; - } + err_cleanup(i_target,i_addr); - //if the MBSIBERR0Q_IB_HOST_ERROR_VALID bit is not set - // then we have a bus failure - if( !(mbsiberr0_data & HOST_ERROR_VALID) ) - { - //Bus is down - busDown = true; - } + break; } - else // >= DD20 - { - //TODO RTC: 68984: Validate error path on DD2.0 Centaurs - l_err = doIBScom(DeviceFW::READ, - i_target, - &mbsiberr0_data, - readSize, - MBSIBERR0, - true); - if(l_err != NULL) - { - if( IBSCOM_SUE_IN_ERR_PATH == l_err->reasonCode() ) - { - TRACFCOMP(g_trac_ibscom, ERR_MRK - "doIBScom: SUE on write detected"); - delete l_err; - l_err = NULL; - busDown = true; - } - else - { - TRACFCOMP(g_trac_ibscom, ERR_MRK"doIBScom: Unexpected error when checking for SUE"); - break; - } - } - } // >= DD20 + if(busDown) { - //TODO RTC: 69115 - call PRD to do FIR analysis, return PRD - //error instead. /*@ * @errortype * @moduleid IBSCOM_DO_IBSCOM @@ -553,19 +711,25 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, * @devdesc Bus failure when attempting to perform * IBSCOM operation. IBSCOM disabled. */ - l_err = + errlHndl_t ib_err = new ErrlEntry(ERRL_SEV_UNRECOVERABLE, IBSCOM_DO_IBSCOM, IBSCOM_BUS_FAILURE, TWO_UINT32_TO_UINT64( get_huid(i_target), i_addr), - mbsiberr0_data); + mbsiberr0.data); + + ib_err->addHwCallout(i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL); - l_err->addHwCallout(i_target, - HWAS::SRCI_PRIORITY_HIGH, - HWAS::NO_DECONFIG, - HWAS::GARD_NULL); + //grab some HW regs via FSISCOM + ERRORLOG::ErrlUserDetailsLogRegister ffdc(i_target); + ffdc.addData(DEVICE_FSISCOM_ADDRESS(MBS_FIR)); + ffdc.addData(DEVICE_FSISCOM_ADDRESS(MBSIBERR0)); + ffdc.addToLog(l_err); //disable IBSCOM ScomSwitches l_switches = @@ -581,6 +745,33 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, // Turn off IBSCOM and turn on FSI SCOM. i_target->setAttr<ATTR_SCOM_SWITCHES>(l_switches); } + + //@todo: RTC:92971 + //There is a potential deadlock if we call PRD here + //Look for a better PRD error + //errlHndl_t prd_err = ATTN::checkForIplAttentions(); + errlHndl_t prd_err = NULL; + if( prd_err ) + { + TRACFCOMP( g_trac_ibscom, ERR_MRK"Error from checkForIplAttentions : PLID=%X", prd_err->plid() ); + //connect up the plids + ib_err->plid(prd_err->plid()); + //commit my log as info because PRD's log is better + ib_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); + errlCommit(ib_err,IBSCOM_COMP_ID); + l_err = prd_err; + } + else + { + //my log is the only one + l_err = ib_err; + } + + l_err->collectTrace(IBSCOM_COMP_NAME); + + //Note-not cleaning up the error status here since + // we will not be using IBSCOM again + break; } else // bus isn't down, some other kind of error @@ -597,23 +788,33 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, */ l_err = new ErrlEntry(ERRL_SEV_UNRECOVERABLE, IBSCOM_DO_IBSCOM, - IBSCOM_BUS_FAILURE, + IBSCOM_PIB_FAILURE, TWO_UINT32_TO_UINT64( get_huid(i_target), i_addr), - mbsiberr0_data); + mbsiberr0.data); //Add this target to the FFDC - ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog(l_err); - - uint64_t pib_code = - (mbsiberr0_data & PIB_ERROR_STATUS_MASK) >> PIB_ERROR_SHIFT; + ERRORLOG::ErrlUserDetailsTarget(i_target,"IBSCOM Target") + .addToLog(l_err); //add callouts based on the PIB error PIB::addFruCallouts( i_target, - pib_code, + mbsiberr0.piberr, l_err ); + //grab some HW regs via FSISCOM + ERRORLOG::ErrlUserDetailsLogRegister ffdc(i_target); + ffdc.addData(DEVICE_FSISCOM_ADDRESS(MBS_FIR)); + ffdc.addData(DEVICE_FSISCOM_ADDRESS(MBSIBERR0)); + ffdc.addToLog(l_err); + + l_err->collectTrace(IBSCOM_COMP_NAME); + + //attempt to clear the error register so future accesses + //will work + err_cleanup(i_target,i_addr); + break; } } @@ -659,4 +860,70 @@ errlHndl_t ibscomPerformOp(DeviceFW::OperationType i_opType, return l_err; } + +/** + * @brief Enable or disable Inband SCOMs on all capable chips + */ +void enableInbandScoms( bool i_disable ) +{ + TARGETING::TargetHandleList membufChips; + TARGETING::getAllChips(membufChips, TYPE_MEMBUF, true); + + mutex_t* l_mutex = NULL; + + TARGETING::Target * sys = NULL; + TARGETING::targetService().getTopLevelTarget(sys); + + uint8_t l_override = + sys->getAttr<TARGETING::ATTR_IBSCOM_ENABLE_OVERRIDE>(); + TRACFCOMP(g_trac_ibscom,"IBSCOM_ENABLE_OVERRIDE=%d",l_override); + + for(uint32_t i=0; i<membufChips.size(); i++) + { + TARGETING::Target* mb = membufChips[i]; + + // If the membuf chip supports IBSCOM AND.. + // (Chip is >=DD20 OR IBSCOM Override is set) + if( (mb->getAttr<ATTR_PRIMARY_CAPABILITIES>().supportsInbandScom) + && + ( (mb->getAttr<TARGETING::ATTR_EC>() >= 0x20) || + (l_override != 0) ) + ) + { + //don't mess with attributes without the mutex (just to be safe) + l_mutex = mb->getHbMutexAttr<TARGETING::ATTR_IBSCOM_MUTEX>(); + mutex_lock(l_mutex); + + ScomSwitches l_switches = mb->getAttr<ATTR_SCOM_SWITCHES>(); + + uint8_t ib_new = 1; + uint8_t fsi_new = 0; + if( i_disable == IBSCOM_DISABLE ) + { + ib_new = 0; + fsi_new = 1; + } + + // If Inband Scom enablement changed + if ((l_switches.useInbandScom != ib_new) || + (l_switches.useFsiScom != fsi_new)) + { + l_switches.useFsiScom = fsi_new; + l_switches.useInbandScom = ib_new; + + // Modify attribute + membufChips[i]->setAttr<ATTR_SCOM_SWITCHES>(l_switches); + + TRACFCOMP(g_trac_ibscom, + "IBSCOM=%d on target HUID %.8X", + ib_new, + TARGETING::get_huid(mb)); + } + + mutex_unlock(l_mutex); + } + } +} + + } // end namespace |