diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2017-04-11 15:46:40 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2017-04-14 14:10:27 -0400 |
commit | 669fa93550c86b4c37d4fd1e61234dcdb189aabb (patch) | |
tree | f99d98088f370b36467f331a8a4c214640789fa5 | |
parent | d379388635df2705525dc1664f33ade86665874d (diff) | |
download | talos-hostboot-669fa93550c86b4c37d4fd1e61234dcdb189aabb.tar.gz talos-hostboot-669fa93550c86b4c37d4fd1e61234dcdb189aabb.zip |
PRD: runtime ECC analysis for command complete attn
Change-Id: Ic5565154c8b6ccd0425fc7835772d693e9a065d7
RTC: 171915
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39128
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C | 13 |
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C | 13 |
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C | 176 |
3 files changed, 188 insertions, 14 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C index 3e93152b4..ea7250918 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C @@ -29,6 +29,7 @@ #include <prdfMemAddress.H> #include <prdfMemCaptureData.H> #include <prdfMemScrubUtils.H> +#include <prdfP9McaDataBundle.H> #include <prdfP9McbistExtraSig.H> #include <prdfParserEnums.H> @@ -162,7 +163,7 @@ uint32_t MemTdCtlr<T>::handleCmdComplete( STEP_CODE_DATA_STRUCT & io_sc ) // some way to change the template to use the MCA. It is also a local function // because this is only for MemTdCtlr internal use and it didn't make much sense // to create a public function. -template<TARGETING::TYPE T> +template<TARGETING::TYPE T, typename D> uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); @@ -248,8 +249,11 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip, for ( auto & mcaChip : portList ) { bool errorsFound; - uint32_t l_rc = __checkEcc<TYPE_MCA>( mcaChip, io_queue, i_addr, - errorsFound, io_sc ); + uint32_t l_rc = __checkEcc<TYPE_MCA, McaDataBundle *>( mcaChip, + io_queue, + i_addr, + errorsFound, + io_sc ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MCA>(0x%08x) failed", @@ -282,8 +286,11 @@ uint32_t __analyzeCmdComplete<TYPE_MBA>( ExtensibleChip * i_chip, // Update iv_stoppedRank. o_stoppedRank = TdRankListEntry( i_chip, i_addr.getRank() ); + /* TODO RTC 157888 // Check the MBA for ECC errors. 
return __checkEcc<TYPE_MBA>(i_chip, io_queue, i_addr, o_errorsFound, io_sc); + */ + return SUCCESS; } //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C index 06c3f867b..25fb6ed07 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C @@ -36,6 +36,7 @@ #include <prdfMemScrubUtils.H> #include <prdfMemUtils.H> #include <prdfMemVcm.H> +#include <prdfP9McaDataBundle.H> #include <prdfP9McaExtraSig.H> #include <UtilHash.H> // for Util::hashString @@ -132,7 +133,7 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -template <TARGETING::TYPE T> +template <TARGETING::TYPE T, typename D> uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) @@ -241,14 +242,18 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, } template -uint32_t __checkEcc<TYPE_MCA>( ExtensibleChip * i_chip, TdQueue & io_queue, - const MemAddr & i_addr, bool & o_errorsFound, - STEP_CODE_DATA_STRUCT & io_sc ); +uint32_t __checkEcc<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip, + TdQueue & io_queue, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ); +/* TODO RTC 157888 template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); +*/ //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index f935c8d30..214c28e24 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -30,7 +30,13 @@ 
#include <prdfMemTdCtlr.H> // Platform includes +#include <prdfMemEccAnalysis.H> #include <prdfMemScrubUtils.H> +#include <prdfMemTps.H> +#include <prdfMemUtils.H> +#include <prdfMemVcm.H> +#include <prdfP9McaDataBundle.H> +#include <prdfP9McaExtraSig.H> #include <prdfPlatServices.H> using namespace TARGETING; @@ -191,17 +197,25 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -template <TARGETING::TYPE T> +template <TARGETING::TYPE T, typename D> uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[__checkEcc] " + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( T == i_chip->getType() ); + uint32_t o_rc = SUCCESS; o_errorsFound = false; + TargetHandle_t trgt = i_chip->getTrgt(); + HUID huid = i_chip->getHuid(); + + MemRank rank = i_addr.getRank(); + do { // Check for ECC errors. @@ -209,12 +223,156 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, o_rc = checkEccFirs<T>( i_chip, eccAttns ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "checkEccFirs<T>(0x%08x) failed", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "checkEccFirs<T>(0x%08x) failed", huid ); break; } - // TODO RTC 171915 + if ( 0 != (eccAttns & MAINT_INT_NCE_ETE) ) + { + o_errorsFound = true; + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintINTER_CTE); + + // Can't do any more isolation at this time. So add the rank to the + // callout list. + MemoryMru mm { trgt, rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); + + // Add a TPS procedure to the queue. + TdEntry * e = new TpsEvent<T>{ i_chip, rank }; + io_queue.push( e ); + } + + if ( 0 != (eccAttns & MAINT_SOFT_NCE_ETE) ) + { + o_errorsFound = true; + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintSOFT_CTE ); + + // Can't do any more isolation at this time. 
So add the rank to the + // callout list. + MemoryMru mm { trgt, rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); + + // Add a TPS procedure to the queue. + TdEntry * e = new TpsEvent<T>{ i_chip, rank }; + io_queue.push( e ); + } + + if ( 0 != (eccAttns & MAINT_HARD_NCE_ETE) ) + { + o_errorsFound = true; + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintHARD_CTE ); + + // Query the per-symbol counters for the hard CE symbol. + MemUtils::MaintSymbols symData; MemSymbol junk; + o_rc = MemUtils::collectCeStats<T>( i_chip, rank, symData, junk ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MemUtils::collectCeStats(0x%08x,m%ds%d) " + "failed", huid, rank.getMaster(), rank.getSlave() ); + break; + } + + // The command will have stopped on the first occurrence. So there + // should only be one symbol in the list. + PRDF_ASSERT( 1 == symData.size() ); + + // Add the symbol to the callout list. + MemoryMru mm { trgt, rank, symData[0].symbol }; + io_sc.service_data->SetCallout( mm ); + + // Any hard CEs in MNFG should be immediately reported. + if ( mfgMode() ) + io_sc.service_data->setServiceCall(); + + // Add a TPS procedure to the queue. + TdEntry * e = new TpsEvent<T>{ i_chip, rank }; + io_queue.push( e ); + + /* TODO RTC 136129 + // Dynamically deallocation the page. + o_rc = MemDealloc::page<T>( i_chip, i_addr ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MemDealloc::page(0x%08x) failed", huid ); + break; + } + */ + } + + if ( 0 != (eccAttns & MAINT_MPE) ) + { + o_errorsFound = true; + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintMPE ); + + // Add entry to UE table. + D db = static_cast<D>(i_chip->getDataBundle()); + db->iv_ueTable.addEntry( UE_TABLE::SCRUB_MPE, i_addr ); + + // Read the chip mark from markstore. 
+ MemMark chipMark; + o_rc = MarkStore::readChipMark<T>( i_chip, rank, chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x,%d) failed", + huid, rank.getMaster() ); + break; + } + + // If the chip mark is not valid, then somehow the chip mark was + // placed on a rank other than the rank in which the command + // stopped. This would most likely be a code bug. + PRDF_ASSERT( chipMark.isValid() ); + + // Add the mark to the callout list. + MemoryMru mm { trgt, rank, chipMark.getSymbol() }; + io_sc.service_data->SetCallout( mm ); + + // Add a VCM procedure to the queue. + TdEntry * e = new VcmEvent<T>{ i_chip, rank, chipMark }; + io_queue.push( e ); + } + + if ( 0 != (eccAttns & MAINT_RCE_ETE) ) + { + o_errorsFound = true; + + // TODO: RTC 171867 + } + + if ( 0 != (eccAttns & MAINT_UE) ) + { + o_errorsFound = true; + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintUE ); + + // Since this will be a predictive callout, change the primary + // signature as well. + io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE ); + + // Add entry to UE table. + D db = static_cast<D>(i_chip->getDataBundle()); + db->iv_ueTable.addEntry( UE_TABLE::SCRUB_UE, i_addr ); + + // Add the rank to the callout list. + MemEcc::calloutMemUe<T>( i_chip, rank, io_sc ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Add a TPS procedure to the queue. + TdEntry * e = new TpsEvent<T>{ i_chip, rank }; + io_queue.push( e ); + + /* TODO RTC 136129 + // Dynamically deallocation the rank. 
+ o_rc = MemDealloc::rank<T>( i_chip, rank ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MemDealloc::rank(0x%08x, m%ds%d) failed", + huid, rank.getMaster(), rank.getSlave() ); + break; + } + */ + } } while (0); @@ -224,14 +382,18 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, } template -uint32_t __checkEcc<TYPE_MCA>( ExtensibleChip * i_chip, TdQueue & io_queue, - const MemAddr & i_addr, bool & o_errorsFound, - STEP_CODE_DATA_STRUCT & io_sc ); +uint32_t __checkEcc<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip, + TdQueue & io_queue, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ); +/* TODO RTC 157888 template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); +*/ //------------------------------------------------------------------------------ |