diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2014-04-10 09:16:03 -0500 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2014-06-23 15:24:15 -0500 |
commit | 02c81ca3e1e06b910c58905bcab913d30b0a5337 (patch) | |
tree | d301ae5056413e37daf15c3dc8cbae8ac1b74c45 /src | |
parent | 0ea3be7d289e8e19aae6a426b65ec91a925ea11a (diff) | |
download | talos-hostboot-02c81ca3e1e06b910c58905bcab913d30b0a5337.tar.gz talos-hostboot-02c81ca3e1e06b910c58905bcab913d30b0a5337.zip |
More FFDC for SCOM Fails
Added more address-specific FFDC collection logic to handle
additional error scenarios we've seen or discussed.
Change-Id: Ifcc2b98b9c55ed5e6a35d1556cd530438ec120c2
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/11257
Tested-by: Jenkins Server
Reviewed-by: Michael Baiocchi <baiocchi@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/include/usr/xscom/piberror.H | 5 | ||||
-rw-r--r-- | src/usr/fsiscom/fsiscom.C | 17 | ||||
-rw-r--r-- | src/usr/ibscom/ibscom.C | 1 | ||||
-rw-r--r-- | src/usr/scom/scom.C | 71 | ||||
-rw-r--r-- | src/usr/xscom/piberror.C | 44 | ||||
-rw-r--r-- | src/usr/xscom/xscom.C | 1 |
6 files changed, 108 insertions, 31 deletions
diff --git a/src/include/usr/xscom/piberror.H b/src/include/usr/xscom/piberror.H index 9836d9acd..5640c6ce1 100644 --- a/src/include/usr/xscom/piberror.H +++ b/src/include/usr/xscom/piberror.H @@ -50,12 +50,15 @@ namespace PIB * * @param[in] i_target Operation target * @param[in] i_pibErrStatus Error Status bits retrieved + * @param[in] i_scomAddr Address of SCOM operation that led to the fail, + * if operation was not a scom, set to UINT64_MAX * @param[in/out] io_errl Originating errorlog that we will add Fru * Callouts to. * @return none */ void addFruCallouts(TARGETING::Target* i_target, - uint32_t i_pibErrStatus, + uint32_t i_pibErrStatus, + uint64_t i_scomAddr, errlHndl_t& io_errl); diff --git a/src/usr/fsiscom/fsiscom.C b/src/usr/fsiscom/fsiscom.C index 962cfd5ab..aade95479 100644 --- a/src/usr/fsiscom/fsiscom.C +++ b/src/usr/fsiscom/fsiscom.C @@ -60,9 +60,19 @@ union ioData6432 /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// +/** + * @brief Common function to add callouts and FFDC and recover + * from PIB errors + * + * @param[in] i_target SCom target + * @param[in] i_errlog Error log to append to + * @param[in] i_status FSI2PIB status register + * @param[in] i_scomAddr Address that we failed on + */ void pib_error_handler( TARGETING::Target* i_target, errlHndl_t i_errlog, - uint32_t i_status ) + uint32_t i_status, + uint32_t i_scomAddr ) { //Add this target to the FFDC ERRORLOG::ErrlUserDetailsTarget(i_target,"SCOM Target").addToLog(i_errlog); @@ -100,6 +110,7 @@ void pib_error_handler( TARGETING::Target* i_target, uint32_t pib_error = i_status >> 12; PIB::addFruCallouts( i_target, pib_error, + i_scomAddr, i_errlog ); //Grab the PIB2OPB Status reg for a Resource Occupied error @@ -294,7 +305,7 @@ errlHndl_t fsiScomPerformOp(DeviceFW::OperationType i_opType, l_status)); // call common error handler to do callouts and recovery - pib_error_handler( 
i_target, l_err, l_status ); + pib_error_handler( i_target, l_err, l_status, l_scomAddr ); //Grab the PIB2OPB Status reg for a XSCOM Block error if( (l_status & 0x00007000) == 0x00001000 ) //piberr=001 @@ -381,7 +392,7 @@ errlHndl_t fsiScomPerformOp(DeviceFW::OperationType i_opType, l_status)); // call common error handler to do callouts and recovery - pib_error_handler( i_target, l_err, l_status ); + pib_error_handler( i_target, l_err, l_status, l_scomAddr ); break; } diff --git a/src/usr/ibscom/ibscom.C b/src/usr/ibscom/ibscom.C index 25b4492d2..405708ee2 100644 --- a/src/usr/ibscom/ibscom.C +++ b/src/usr/ibscom/ibscom.C @@ -811,6 +811,7 @@ errlHndl_t doIBScom(DeviceFW::OperationType i_opType, //add callouts based on the PIB error PIB::addFruCallouts( i_target, mbsiberr0.piberr, + i_addr, l_err ); //grab some HW regs via FSISCOM diff --git a/src/usr/scom/scom.C b/src/usr/scom/scom.C index e11736f62..6cded14e5 100644 --- a/src/usr/scom/scom.C +++ b/src/usr/scom/scom.C @@ -57,8 +57,8 @@ namespace SCOM * @param[in] i_target Target of SCOM operation * @param[in] i_addr SCOM address */ -void addScomFailFFFDC( errlHndl_t i_err, - TARGETING::Target* i_target, +void addScomFailFFDC( errlHndl_t i_err, + TARGETING::Target* i_target, uint64_t i_addr ); @@ -310,6 +310,7 @@ errlHndl_t checkIndirectAndDoScom(DeviceFW::OperationType i_opType, //Add the callouts for the specific PCB/PIB error PIB::addFruCallouts( i_target, scomout.piberr, + i_addr, l_err ); //Add this target to the FFDC @@ -426,6 +427,7 @@ errlHndl_t checkIndirectAndDoScom(DeviceFW::OperationType i_opType, //Add the callouts for the specific PCB/PIB error PIB::addFruCallouts( i_target, scomout.piberr, + i_addr, l_err ); //Add this target to the FFDC @@ -553,7 +555,7 @@ errlHndl_t doScomOp(DeviceFW::OperationType i_opType, //Add some additional FFDC based on the specific operation if( l_err ) { - addScomFailFFFDC( l_err, i_target, i_addr ); + addScomFailFFDC( l_err, i_target, i_addr ); } return l_err; @@ -562,18 
+564,18 @@ errlHndl_t doScomOp(DeviceFW::OperationType i_opType, /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// -void addScomFailFFFDC( errlHndl_t i_err, - TARGETING::Target* i_target, - uint64_t i_addr ) +void addScomFailFFDC( errlHndl_t i_err, + TARGETING::Target* i_target, + uint64_t i_addr ) { // Read some error regs from scom ERRORLOG::ErrlUserDetailsLogRegister l_scom_data(i_target); bool addit = false; + TARGETING::TYPE l_type = i_target->getAttr<TARGETING::ATTR_TYPE>(); //PBA scoms on the processor if( ((i_addr & 0xFFFFF000) == 0x00064000) - && (TARGETING::TYPE_PROC - == i_target->getAttr<TARGETING::ATTR_TYPE>()) ) + && (TARGETING::TYPE_PROC == l_type) ) { addit = true; //look for hung operations on the PBA @@ -592,6 +594,59 @@ void addScomFailFFFDC( errlHndl_t i_err, l_scom_data.addData(DEVICE_SCOM_ADDRESS(ffdc_regs[x])); } } + //EX scoms on the processor (not including PCB slave regs) + else if( ((i_addr & 0xF0000000) == 0x10000000) + && ((i_addr & 0x00FF0000) != 0x000F0000) + && (TARGETING::TYPE_PROC == l_type) ) + { + addit = true; + uint64_t ex_offset = 0xFF000000 & i_addr; + //grab some data related to the PCB slave state + uint64_t ffdc_regs[] = { + 0x0F010B, //Special Wakeup + 0x0F0012, //GP3 + 0x0F0100, //PowerManagement GP0 + 0x0F0106, //PFET Status Core + 0x0F010E, //PFET Status ECO + 0x0F0111, //PM State History + }; + for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) + { + l_scom_data.addData(DEVICE_SCOM_ADDRESS(ex_offset|ffdc_regs[x])); + } + } + + //Any non-PCB Slave and non TP reg on the processor + if( ((i_addr & 0x00FF0000) != 0x000F0000) + && ((i_addr & 0xFF000000) != 0x00000000) + && (TARGETING::TYPE_PROC == l_type) ) + { + addit = true; + uint64_t chiplet_offset = 0xFF000000 & i_addr; + //grab some data related to the PCB slave state + uint64_t ffdc_regs[] = { + 0x0F0012, //GP3 + 0x0F001F, //Error capture 
reg + }; + for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) + { + l_scom_data.addData( DEVICE_SCOM_ADDRESS( + chiplet_offset|ffdc_regs[x]) ); + } + + //grab the clock/osc regs + l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x00050019)); + l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x0005001A)); + //grab the clock regs via FSI too, just in case + TARGETING::Target* mproc = NULL; + TARGETING::targetService().masterProcChipTargetHandle(mproc); + if( (i_target != TARGETING::MASTER_PROCESSOR_CHIP_TARGET_SENTINEL) + && (i_target != mproc) ) + { + l_scom_data.addData(DEVICE_FSI_ADDRESS(0x2864));//==2819 + l_scom_data.addData(DEVICE_FSI_ADDRESS(0x2868));//==281A + } + } if( addit ) { diff --git a/src/usr/xscom/piberror.C b/src/usr/xscom/piberror.C index aa769aea5..e879cdc29 100644 --- a/src/usr/xscom/piberror.C +++ b/src/usr/xscom/piberror.C @@ -39,20 +39,15 @@ namespace PIB /** * @brief Add callouts to an errorlog based on the type of PIB error could be a * hardware or procedure callout - * - * @param[in] i_target Operation target - * @param[in] i_pibErrStatus Error Status bits retrieved - * @param[in/out] io_errl Originating errorlog that we will add Fru - * Callouts to. 
- * @return none */ void addFruCallouts(TARGETING::Target* i_target, uint32_t i_pibErrStatus, + uint64_t i_scomAddr, errlHndl_t& io_errl) { switch (i_pibErrStatus) { - case PIB::PIB_CHIPLET_OFFLINE: + case PIB::PIB_CHIPLET_OFFLINE: //b010 //Offline should just be a code bug, but it seems that there are // cases where bad hardware can also cause this problem //Since we assume code is good before going out, make the @@ -65,12 +60,12 @@ void addFruCallouts(TARGETING::Target* i_target, HWAS::SRCI_PRIORITY_MED); break; - case PIB::PIB_PARTIAL_GOOD: + case PIB::PIB_PARTIAL_GOOD: //b011 io_errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_HIGH); break; - case PIB::PIB_INVALID_ADDRESS: + case PIB::PIB_INVALID_ADDRESS: //b100 //Invalid Address should just be a code bug, but it seems that there // are cases where bad hardware can also cause this problem //Since we assume code is good before going out, make the @@ -83,28 +78,40 @@ void addFruCallouts(TARGETING::Target* i_target, HWAS::SRCI_PRIORITY_MED); break; - case PIB::PIB_PARITY_ERROR: - case PIB::PIB_TIMEOUT: + case PIB::PIB_PARITY_ERROR: //b110 + case PIB::PIB_TIMEOUT: //b111 io_errl->addHwCallout( i_target, HWAS::SRCI_PRIORITY_LOW, HWAS::NO_DECONFIG, HWAS::GARD_NULL ); break; - case PIB::PIB_CLOCK_ERROR: + case PIB::PIB_CLOCK_ERROR: //b101 if (i_target->getAttr<TARGETING::ATTR_TYPE>() == TARGETING::TYPE_PROC) { - io_errl->addClockCallout(i_target, - HWAS::OSCREFCLK_TYPE, - HWAS::SRCI_PRIORITY_LOW); + //check for PCI range + if( ((i_scomAddr & 0xFF000000) == 0x09000000) + && ((i_scomAddr & 0x00FF0000) != 0x000F0000) ) + { + io_errl->addClockCallout(i_target, + HWAS::OSCPCICLK_TYPE, + HWAS::SRCI_PRIORITY_LOW); + } + //for everything else blame the ref clock + else + { + io_errl->addClockCallout(i_target, + HWAS::OSCREFCLK_TYPE, + HWAS::SRCI_PRIORITY_LOW); + } } else if (i_target->getAttr<TARGETING::ATTR_TYPE>() == TARGETING::TYPE_MEMBUF) { io_errl->addClockCallout(i_target, - HWAS::MEMCLK_TYPE, - 
HWAS::SRCI_PRIORITY_LOW); + HWAS::MEMCLK_TYPE, + HWAS::SRCI_PRIORITY_LOW); } else // for anything else, just blame the refclock { @@ -115,8 +122,7 @@ void addFruCallouts(TARGETING::Target* i_target, break; default: - // should never commit a log that gets here so that is a - // code bug + // Anything else would most likely be a code bug io_errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_HIGH); break; diff --git a/src/usr/xscom/xscom.C b/src/usr/xscom/xscom.C index 1b5b06558..30df917da 100644 --- a/src/usr/xscom/xscom.C +++ b/src/usr/xscom/xscom.C @@ -794,6 +794,7 @@ errlHndl_t xscomPerformOp(DeviceFW::OperationType i_opType, // Add Callouts to the errorlog PIB::addFruCallouts(i_target, l_hmer.mXSComStatus, + l_addr, l_err); // Call XscomCollectFFDC.. |