diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2016-09-23 10:31:12 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2016-12-06 10:01:11 -0500 |
commit | d4077821a7b026fab75450add9e5ad1302df93c2 (patch) | |
tree | 5759bc4dba899870932319cee2325262d3b58fad /src/usr/diag | |
parent | 9c8141ec796a5cc63ec70b71c5ab1d79a91ba922 (diff) | |
download | talos-hostboot-d4077821a7b026fab75450add9e5ad1302df93c2.tar.gz talos-hostboot-d4077821a7b026fab75450add9e5ad1302df93c2.zip |
PRD: Add ECC checking for maint cmd complete attentions
Change-Id: I195e96ef91f495cbbc0cef262b5a040b24179d3f
RTC: 157892
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/32509
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/33079
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag')
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C | 15 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H | 11 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C | 172 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C | 81 | ||||
-rwxr-xr-x | src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H | 5 |
5 files changed, 279 insertions, 5 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C index 5c543ad79..6823e8254 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C @@ -83,16 +83,21 @@ uint32_t MemTdCtlr<T>::handleCmdComplete( STEP_CODE_DATA_STRUCT & io_sc ) break; } - // TODO: RTC 157892 Check why the command stopped and take actions - // appropriately. Note that since nothing is happening here at - // the moment, the code will simply assume the command stopped - // at the end of memory with no errors. + // Then, check for ECC errors, if they exist. + bool errorsFound = false; + o_rc = checkEcc( errorsFound, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "checkEcc(0x%08x) failed", + iv_chip->getHuid() ); + break; + } // If the command completed successfully with no error, the error // log will not have any useful information. Therefore, do not // commit the error log. This is done to avoid useless // informational error logs. - io_sc.service_data->setDontCommitErrl(); + if ( !errorsFound ) io_sc.service_data->setDontCommitErrl(); } // Move onto the next step in the state machine. diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H index bd2f66718..c853b2031 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H @@ -172,6 +172,17 @@ class MemTdCtlr */ uint32_t initStoppedRank(); + /** + * @brief This is called when handling a command complete attention for a + * non-TD command to check for ECC errors. This must be called after + * initStoppedRank() to ensure iv_stoppedRank is initialized. + * @param o_errorsFound True if errors where found and handled. False + * otherwise. + * @param io_sc The step code data struct. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + uint32_t checkEcc( bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); + #ifdef __HOSTBOOT_RUNTIME /** diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C index a34e4f199..4e49e423e 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C @@ -30,7 +30,11 @@ #include <prdfMemTdCtlr.H> // Platform includes +#include <prdfMemMark.H> +#include <prdfMemoryMru.H> #include <prdfMemScrubUtils.H> +#include <prdfMemVcm_ipl.H> +#include <prdfP9McaExtraSig.H> using namespace TARGETING; @@ -99,6 +103,174 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ +template <TARGETING::TYPE T> +uint32_t __checkEcc( ExtensibleChip * i_chip, const MemRank & i_rank, + TdQueue & io_queue, bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[__checkEcc] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = true; // Assume true for unless nothing found. + + TargetHandle_t trgt = i_chip->getTrgt(); + HUID huid = i_chip->getHuid(); + + do + { + // Check for ECC errors. + uint32_t eccAttns = 0; + o_rc = checkEccFirs<T>( i_chip, eccAttns ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "checkEccFirs<T>(0x%08x) failed", huid ); + break; + } + + if ( 0 != (eccAttns & MAINT_UE) ) + { + // Add the signature to the multi-signature list. Also, since + // this will be a predictive callout, change the primary + // signature as well. + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintUE ); + io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE ); + + // Add the rank to the callout list. + MemoryMru mm { trgt, i_rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + else if ( 0 != (eccAttns & MAINT_MPE) ) + { + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintMPE ); + + // Read the chip mark from markstore. + MemMark chipMark; + o_rc = MarkStore::readChipMark<T>( i_chip, i_rank, chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x,%d) failed", + huid, i_rank.getMaster() ); + break; + } + + // If the chip mark is not valid, then somehow the chip mark was + // placed on a rank other than the rank in which the command + // stopped. This would most likely be a code bug. + PRDF_ASSERT( chipMark.isValid() ); + + // Add the mark to the callout list. + MemoryMru mm { trgt, i_rank, chipMark.getSymbol() }; + io_sc.service_data->SetCallout( mm ); + + // Add a new VCM procedure to the queue. + TdEntry * e = new VcmEvent<T>{ i_chip, i_rank }; + io_queue.push( e ); + } + else if ( isMfgCeCheckingEnabled() && + (0 != (eccAttns & MAINT_HARD_NCE_ETE)) ) + { + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintHARD_CTE ); + + // TODO RTC 136128 + // - Query the per-symbol counters for the hard CE symbol (there + // should be only one). + // - Add the symbol to the callout list (via MemoryMru). + // - Add a TPS procedure to the queue. + } + else // Nothing found. + { + o_errorsFound = false; + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template <> +uint32_t MemTdCtlr<TYPE_MCBIST>::checkEcc( bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::checkEcc] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = false; + + MemRank rank = iv_stoppedRank.getRank(); + + do + { + // Get all ports in which the command was run. + std::vector<ExtensibleChip *> portList; + o_rc = getMcbistMaintPort( iv_chip, portList ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + // Check each MCA for ECC errors. + for ( auto & mcaChip : portList ) + { + bool errorsFound; + uint32_t l_rc = __checkEcc<TYPE_MCA>( mcaChip, rank, iv_queue, + errorsFound, io_sc ); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MCA>(0x%08x,%d) failed", + mcaChip->getHuid(), rank.getMaster() ); + o_rc |= l_rc; continue; // Try the other MCAs. + } + + if ( errorsFound ) o_errorsFound = true; + } + if ( SUCCESS != o_rc ) break; + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template <> +uint32_t MemTdCtlr<TYPE_MBA>::checkEcc( bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::checkEcc] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = false; + + MemRank rank = iv_stoppedRank.getRank(); + + o_rc = __checkEcc<TYPE_MBA>( iv_chip, rank, iv_queue, o_errorsFound, io_sc); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MBA>(0x%08x,%d) failed", + iv_chip->getHuid(), rank.getMaster() ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + // Avoid linker errors with the template. template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index 9254d3ee2..f525acebd 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -151,6 +151,87 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ +template <> +uint32_t MemTdCtlr<TYPE_MCBIST>::checkEcc( bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::checkEcc] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = false; + + /* TODO: RTC 136126 + MemRank rank = iv_stoppedRank.getRank(); + + do + { + // Get all ports in which the command was run. + std::vector<ExtensibleChip *> portList; + o_rc = getMcbistMaintPort( iv_chip, portList ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + // Check each MCA for ECC errors. + for ( auto & mcaChip : portList ) + { + bool errorsFound; + uint32_t l_rc = __checkEcc<TYPE_MCA>( mcaChip, rank, iv_queue, + io_sc, errorsFound ); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MCA>(0x%08x,%d) failed", + mcaChip->getHuid(), rank.getMaster() ); + o_rc |= l_rc; continue; // Try the other MCAs. + } + + if ( errorsFound ) o_errorsFound = true; + } + if ( SUCCESS != o_rc ) break; + + } while (0); + */ + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template <> +uint32_t MemTdCtlr<TYPE_MBA>::checkEcc( bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::checkEcc] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = false; + + /* TODO: RTC 136126 + MemRank rank = iv_stoppedRank.getRank(); + + o_rc = __checkEcc<TYPE_MBA>( iv_chip, rank, iv_queue, io_sc, + o_errorsFound ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MBA>(0x%08x,%d) failed", + iv_chip->getHuid(), rank.getMaster() ); + } + */ + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + // Avoid linker errors with the template. template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H b/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H index c46ef715c..5f33e5f3c 100755 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H @@ -186,6 +186,11 @@ class TdQueue { iv_queue.push_back( i_e ); } + else + { + // The event is already in the queue. So free up the memory. + delete i_e; + } } /** |