summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf
diff options
context:
space:
mode:
author Zane Shelley <zshelle@us.ibm.com> 2016-09-23 10:31:12 -0500
committer Zane C. Shelley <zshelle@us.ibm.com> 2016-12-06 10:01:11 -0500
commit d4077821a7b026fab75450add9e5ad1302df93c2 (patch)
tree 5759bc4dba899870932319cee2325262d3b58fad /src/usr/diag/prdf
parent 9c8141ec796a5cc63ec70b71c5ab1d79a91ba922 (diff)
download talos-hostboot-d4077821a7b026fab75450add9e5ad1302df93c2.tar.gz
talos-hostboot-d4077821a7b026fab75450add9e5ad1302df93c2.zip
PRD: Add ECC checking for maint cmd complete attentions
Change-Id: I195e96ef91f495cbbc0cef262b5a040b24179d3f
RTC: 157892
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/32509
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/33079
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf')
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C 15
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H 11
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C 172
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C 81
-rwxr-xr-x src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H 5
5 files changed, 279 insertions, 5 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
index 5c543ad79..6823e8254 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
@@ -83,16 +83,21 @@ uint32_t MemTdCtlr<T>::handleCmdComplete( STEP_CODE_DATA_STRUCT & io_sc )
break;
}
- // TODO: RTC 157892 Check why the command stopped and take actions
- // appropriately. Note that since nothing is happening here at
- // the moment, the code will simply assume the command stopped
- // at the end of memory with no errors.
+ // Then, check for ECC errors, if they exist.
+ bool errorsFound = false;
+ o_rc = checkEcc( errorsFound, io_sc );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "checkEcc(0x%08x) failed",
+ iv_chip->getHuid() );
+ break;
+ }
// If the command completed successfully with no error, the error
// log will not have any useful information. Therefore, do not
// commit the error log. This is done to avoid useless
// informational error logs.
- io_sc.service_data->setDontCommitErrl();
+ if ( !errorsFound ) io_sc.service_data->setDontCommitErrl();
}
// Move onto the next step in the state machine.
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
index bd2f66718..c853b2031 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
@@ -172,6 +172,17 @@ class MemTdCtlr
*/
uint32_t initStoppedRank();
+ /**
+ * @brief This is called when handling a command complete attention for a
+ * non-TD command to check for ECC errors. This must be called after
+ * initStoppedRank() to ensure iv_stoppedRank is initialized.
+ * @param o_errorsFound True if errors were found and handled. False
+ * otherwise.
+ * @param io_sc The step code data struct.
+ * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
+ */
+ uint32_t checkEcc( bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc );
+
#ifdef __HOSTBOOT_RUNTIME
/**
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
index a34e4f199..4e49e423e 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
@@ -30,7 +30,11 @@
#include <prdfMemTdCtlr.H>
// Platform includes
+#include <prdfMemMark.H>
+#include <prdfMemoryMru.H>
#include <prdfMemScrubUtils.H>
+#include <prdfMemVcm_ipl.H>
+#include <prdfP9McaExtraSig.H>
using namespace TARGETING;
@@ -99,6 +103,174 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
+template <TARGETING::TYPE T>
+uint32_t __checkEcc( ExtensibleChip * i_chip, const MemRank & i_rank,
+ TdQueue & io_queue, bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[__checkEcc] "
+
+ uint32_t o_rc = SUCCESS;
+
+ o_errorsFound = true; // Assume true unless nothing is found.
+
+ TargetHandle_t trgt = i_chip->getTrgt();
+ HUID huid = i_chip->getHuid();
+
+ do
+ {
+ // Check for ECC errors.
+ uint32_t eccAttns = 0;
+ o_rc = checkEccFirs<T>( i_chip, eccAttns );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "checkEccFirs<T>(0x%08x) failed", huid );
+ break;
+ }
+
+ if ( 0 != (eccAttns & MAINT_UE) )
+ {
+ // Add the signature to the multi-signature list. Also, since
+ // this will be a predictive callout, change the primary
+ // signature as well.
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintUE );
+ io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE );
+
+ // Add the rank to the callout list.
+ MemoryMru mm { trgt, i_rank, MemoryMruData::CALLOUT_RANK };
+ io_sc.service_data->SetCallout( mm );
+
+ // Make the error log predictive.
+ io_sc.service_data->setServiceCall();
+ }
+ else if ( 0 != (eccAttns & MAINT_MPE) )
+ {
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintMPE );
+
+ // Read the chip mark from markstore.
+ MemMark chipMark;
+ o_rc = MarkStore::readChipMark<T>( i_chip, i_rank, chipMark );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x,%d) failed",
+ huid, i_rank.getMaster() );
+ break;
+ }
+
+ // If the chip mark is not valid, then somehow the chip mark was
+ // placed on a rank other than the rank in which the command
+ // stopped. This would most likely be a code bug.
+ PRDF_ASSERT( chipMark.isValid() );
+
+ // Add the mark to the callout list.
+ MemoryMru mm { trgt, i_rank, chipMark.getSymbol() };
+ io_sc.service_data->SetCallout( mm );
+
+ // Add a new VCM procedure to the queue.
+ TdEntry * e = new VcmEvent<T>{ i_chip, i_rank };
+ io_queue.push( e );
+ }
+ else if ( isMfgCeCheckingEnabled() &&
+ (0 != (eccAttns & MAINT_HARD_NCE_ETE)) )
+ {
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintHARD_CTE );
+
+ // TODO RTC 136128
+ // - Query the per-symbol counters for the hard CE symbol (there
+ // should be only one).
+ // - Add the symbol to the callout list (via MemoryMru).
+ // - Add a TPS procedure to the queue.
+ }
+ else // Nothing found.
+ {
+ o_errorsFound = false;
+ }
+
+ } while (0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template <>
+uint32_t MemTdCtlr<TYPE_MCBIST>::checkEcc( bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::checkEcc] "
+
+ uint32_t o_rc = SUCCESS;
+
+ o_errorsFound = false;
+
+ MemRank rank = iv_stoppedRank.getRank();
+
+ do
+ {
+ // Get all ports in which the command was run.
+ std::vector<ExtensibleChip *> portList;
+ o_rc = getMcbistMaintPort( iv_chip, portList );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed",
+ iv_chip->getHuid() );
+ break;
+ }
+
+ // Check each MCA for ECC errors.
+ for ( auto & mcaChip : portList )
+ {
+ bool errorsFound;
+ uint32_t l_rc = __checkEcc<TYPE_MCA>( mcaChip, rank, iv_queue,
+ errorsFound, io_sc );
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MCA>(0x%08x,%d) failed",
+ mcaChip->getHuid(), rank.getMaster() );
+ o_rc |= l_rc; continue; // Try the other MCAs.
+ }
+
+ if ( errorsFound ) o_errorsFound = true;
+ }
+ if ( SUCCESS != o_rc ) break;
+
+ } while (0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template <>
+uint32_t MemTdCtlr<TYPE_MBA>::checkEcc( bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::checkEcc] "
+
+ uint32_t o_rc = SUCCESS;
+
+ o_errorsFound = false;
+
+ MemRank rank = iv_stoppedRank.getRank();
+
+ o_rc = __checkEcc<TYPE_MBA>( iv_chip, rank, iv_queue, o_errorsFound, io_sc);
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MBA>(0x%08x,%d) failed",
+ iv_chip->getHuid(), rank.getMaster() );
+ }
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
// Avoid linker errors with the template.
template class MemTdCtlr<TYPE_MCBIST>;
template class MemTdCtlr<TYPE_MBA>;
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
index 9254d3ee2..f525acebd 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
@@ -151,6 +151,87 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
+template <>
+uint32_t MemTdCtlr<TYPE_MCBIST>::checkEcc( bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::checkEcc] "
+
+ uint32_t o_rc = SUCCESS;
+
+ o_errorsFound = false;
+
+ /* TODO: RTC 136126
+ MemRank rank = iv_stoppedRank.getRank();
+
+ do
+ {
+ // Get all ports in which the command was run.
+ std::vector<ExtensibleChip *> portList;
+ o_rc = getMcbistMaintPort( iv_chip, portList );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed",
+ iv_chip->getHuid() );
+ break;
+ }
+
+ // Check each MCA for ECC errors.
+ for ( auto & mcaChip : portList )
+ {
+ bool errorsFound;
+ uint32_t l_rc = __checkEcc<TYPE_MCA>( mcaChip, rank, iv_queue,
+ io_sc, errorsFound );
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MCA>(0x%08x,%d) failed",
+ mcaChip->getHuid(), rank.getMaster() );
+ o_rc |= l_rc; continue; // Try the other MCAs.
+ }
+
+ if ( errorsFound ) o_errorsFound = true;
+ }
+ if ( SUCCESS != o_rc ) break;
+
+ } while (0);
+ */
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template <>
+uint32_t MemTdCtlr<TYPE_MBA>::checkEcc( bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::checkEcc] "
+
+ uint32_t o_rc = SUCCESS;
+
+ o_errorsFound = false;
+
+ /* TODO: RTC 136126
+ MemRank rank = iv_stoppedRank.getRank();
+
+ o_rc = __checkEcc<TYPE_MBA>( iv_chip, rank, iv_queue, io_sc,
+ o_errorsFound );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MBA>(0x%08x,%d) failed",
+ iv_chip->getHuid(), rank.getMaster() );
+ }
+ */
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
// Avoid linker errors with the template.
template class MemTdCtlr<TYPE_MCBIST>;
template class MemTdCtlr<TYPE_MBA>;
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H b/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H
index c46ef715c..5f33e5f3c 100755
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdQueue.H
@@ -186,6 +186,11 @@ class TdQueue
{
iv_queue.push_back( i_e );
}
+ else
+ {
+ // The event is already in the queue. So free up the memory.
+ delete i_e;
+ }
}
/**
OpenPOWER on IntegriCloud