summary | refs | log | tree | commit | diff | stats
path: root/src/usr
diff options
context:
space:
mode:
author: Zane Shelley <zshelle@us.ibm.com> 2017-04-11 15:46:40 -0500
committer: Zane C. Shelley <zshelle@us.ibm.com> 2017-04-14 14:10:27 -0400
commit: 669fa93550c86b4c37d4fd1e61234dcdb189aabb (patch)
tree: f99d98088f370b36467f331a8a4c214640789fa5 /src/usr
parent: d379388635df2705525dc1664f33ade86665874d (diff)
download: talos-hostboot-669fa93550c86b4c37d4fd1e61234dcdb189aabb.tar.gz
talos-hostboot-669fa93550c86b4c37d4fd1e61234dcdb189aabb.zip
PRD: runtime ECC analysis for command complete attn
Change-Id: Ic5565154c8b6ccd0425fc7835772d693e9a065d7
RTC: 171915
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39128
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Diffstat (limited to 'src/usr')
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C 13
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C 13
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C 176
3 files changed, 188 insertions, 14 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
index 3e93152b4..ea7250918 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
@@ -29,6 +29,7 @@
#include <prdfMemAddress.H>
#include <prdfMemCaptureData.H>
#include <prdfMemScrubUtils.H>
+#include <prdfP9McaDataBundle.H>
#include <prdfP9McbistExtraSig.H>
#include <prdfParserEnums.H>
@@ -162,7 +163,7 @@ uint32_t MemTdCtlr<T>::handleCmdComplete( STEP_CODE_DATA_STRUCT & io_sc )
// some way to change the template to use the MCA. It is also a local function
// because this is only for MemTdCtlr internal use and it didn't make much sense
// to create a public function.
-template<TARGETING::TYPE T>
+template<TARGETING::TYPE T, typename D>
uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc );
@@ -248,8 +249,11 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip,
for ( auto & mcaChip : portList )
{
bool errorsFound;
- uint32_t l_rc = __checkEcc<TYPE_MCA>( mcaChip, io_queue, i_addr,
- errorsFound, io_sc );
+ uint32_t l_rc = __checkEcc<TYPE_MCA, McaDataBundle *>( mcaChip,
+ io_queue,
+ i_addr,
+ errorsFound,
+ io_sc );
if ( SUCCESS != l_rc )
{
PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_MCA>(0x%08x) failed",
@@ -282,8 +286,11 @@ uint32_t __analyzeCmdComplete<TYPE_MBA>( ExtensibleChip * i_chip,
// Update iv_stoppedRank.
o_stoppedRank = TdRankListEntry( i_chip, i_addr.getRank() );
+ /* TODO RTC 157888
// Check the MBA for ECC errors.
return __checkEcc<TYPE_MBA>(i_chip, io_queue, i_addr, o_errorsFound, io_sc);
+ */
+ return SUCCESS;
}
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
index 06c3f867b..25fb6ed07 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
@@ -36,6 +36,7 @@
#include <prdfMemScrubUtils.H>
#include <prdfMemUtils.H>
#include <prdfMemVcm.H>
+#include <prdfP9McaDataBundle.H>
#include <prdfP9McaExtraSig.H>
#include <UtilHash.H> // for Util::hashString
@@ -132,7 +133,7 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
-template <TARGETING::TYPE T>
+template <TARGETING::TYPE T, typename D>
uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc )
@@ -241,14 +242,18 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
}
template
-uint32_t __checkEcc<TYPE_MCA>( ExtensibleChip * i_chip, TdQueue & io_queue,
- const MemAddr & i_addr, bool & o_errorsFound,
- STEP_CODE_DATA_STRUCT & io_sc );
+uint32_t __checkEcc<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
+ TdQueue & io_queue,
+ const MemAddr & i_addr,
+ bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc );
+/* TODO RTC 157888
template
uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc );
+*/
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
index f935c8d30..214c28e24 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
@@ -30,7 +30,13 @@
#include <prdfMemTdCtlr.H>
// Platform includes
+#include <prdfMemEccAnalysis.H>
#include <prdfMemScrubUtils.H>
+#include <prdfMemTps.H>
+#include <prdfMemUtils.H>
+#include <prdfMemVcm.H>
+#include <prdfP9McaDataBundle.H>
+#include <prdfP9McaExtraSig.H>
#include <prdfPlatServices.H>
using namespace TARGETING;
@@ -191,17 +197,25 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
-template <TARGETING::TYPE T>
+template <TARGETING::TYPE T, typename D>
uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[__checkEcc] "
+ PRDF_ASSERT( nullptr != i_chip );
+ PRDF_ASSERT( T == i_chip->getType() );
+
uint32_t o_rc = SUCCESS;
o_errorsFound = false;
+ TargetHandle_t trgt = i_chip->getTrgt();
+ HUID huid = i_chip->getHuid();
+
+ MemRank rank = i_addr.getRank();
+
do
{
// Check for ECC errors.
@@ -209,12 +223,156 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
o_rc = checkEccFirs<T>( i_chip, eccAttns );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "checkEccFirs<T>(0x%08x) failed",
- i_chip->getHuid() );
+ PRDF_ERR( PRDF_FUNC "checkEccFirs<T>(0x%08x) failed", huid );
break;
}
- // TODO RTC 171915
+ if ( 0 != (eccAttns & MAINT_INT_NCE_ETE) )
+ {
+ o_errorsFound = true;
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintINTER_CTE);
+
+ // Can't do any more isolation at this time. So add the rank to the
+ // callout list.
+ MemoryMru mm { trgt, rank, MemoryMruData::CALLOUT_RANK };
+ io_sc.service_data->SetCallout( mm );
+
+ // Add a TPS procedure to the queue.
+ TdEntry * e = new TpsEvent<T>{ i_chip, rank };
+ io_queue.push( e );
+ }
+
+ if ( 0 != (eccAttns & MAINT_SOFT_NCE_ETE) )
+ {
+ o_errorsFound = true;
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintSOFT_CTE );
+
+ // Can't do any more isolation at this time. So add the rank to the
+ // callout list.
+ MemoryMru mm { trgt, rank, MemoryMruData::CALLOUT_RANK };
+ io_sc.service_data->SetCallout( mm );
+
+ // Add a TPS procedure to the queue.
+ TdEntry * e = new TpsEvent<T>{ i_chip, rank };
+ io_queue.push( e );
+ }
+
+ if ( 0 != (eccAttns & MAINT_HARD_NCE_ETE) )
+ {
+ o_errorsFound = true;
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintHARD_CTE );
+
+ // Query the per-symbol counters for the hard CE symbol.
+ MemUtils::MaintSymbols symData; MemSymbol junk;
+ o_rc = MemUtils::collectCeStats<T>( i_chip, rank, symData, junk );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MemUtils::collectCeStats(0x%08x,m%ds%d) "
+ "failed", huid, rank.getMaster(), rank.getSlave() );
+ break;
+ }
+
+ // The command will have stopped on the first occurrence. So there
+ // should only be one symbol in the list.
+ PRDF_ASSERT( 1 == symData.size() );
+
+ // Add the symbol to the callout list.
+ MemoryMru mm { trgt, rank, symData[0].symbol };
+ io_sc.service_data->SetCallout( mm );
+
+ // Any hard CEs in MNFG should be immediately reported.
+ if ( mfgMode() )
+ io_sc.service_data->setServiceCall();
+
+ // Add a TPS procedure to the queue.
+ TdEntry * e = new TpsEvent<T>{ i_chip, rank };
+ io_queue.push( e );
+
+ /* TODO RTC 136129
+ // Dynamically deallocate the page.
+ o_rc = MemDealloc::page<T>( i_chip, i_addr );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MemDealloc::page(0x%08x) failed", huid );
+ break;
+ }
+ */
+ }
+
+ if ( 0 != (eccAttns & MAINT_MPE) )
+ {
+ o_errorsFound = true;
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintMPE );
+
+ // Add entry to UE table.
+ D db = static_cast<D>(i_chip->getDataBundle());
+ db->iv_ueTable.addEntry( UE_TABLE::SCRUB_MPE, i_addr );
+
+ // Read the chip mark from markstore.
+ MemMark chipMark;
+ o_rc = MarkStore::readChipMark<T>( i_chip, rank, chipMark );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x,%d) failed",
+ huid, rank.getMaster() );
+ break;
+ }
+
+ // If the chip mark is not valid, then somehow the chip mark was
+ // placed on a rank other than the rank in which the command
+ // stopped. This would most likely be a code bug.
+ PRDF_ASSERT( chipMark.isValid() );
+
+ // Add the mark to the callout list.
+ MemoryMru mm { trgt, rank, chipMark.getSymbol() };
+ io_sc.service_data->SetCallout( mm );
+
+ // Add a VCM procedure to the queue.
+ TdEntry * e = new VcmEvent<T>{ i_chip, rank, chipMark };
+ io_queue.push( e );
+ }
+
+ if ( 0 != (eccAttns & MAINT_RCE_ETE) )
+ {
+ o_errorsFound = true;
+
+ // TODO: RTC 171867
+ }
+
+ if ( 0 != (eccAttns & MAINT_UE) )
+ {
+ o_errorsFound = true;
+ io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintUE );
+
+ // Since this will be a predictive callout, change the primary
+ // signature as well.
+ io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE );
+
+ // Add entry to UE table.
+ D db = static_cast<D>(i_chip->getDataBundle());
+ db->iv_ueTable.addEntry( UE_TABLE::SCRUB_UE, i_addr );
+
+ // Add the rank to the callout list.
+ MemEcc::calloutMemUe<T>( i_chip, rank, io_sc );
+
+ // Make the error log predictive.
+ io_sc.service_data->setServiceCall();
+
+ // Add a TPS procedure to the queue.
+ TdEntry * e = new TpsEvent<T>{ i_chip, rank };
+ io_queue.push( e );
+
+ /* TODO RTC 136129
+ // Dynamically deallocate the rank.
+ o_rc = MemDealloc::rank<T>( i_chip, rank );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MemDealloc::rank(0x%08x, m%ds%d) failed",
+ huid, rank.getMaster(), rank.getSlave() );
+ break;
+ }
+ */
+ }
} while (0);
@@ -224,14 +382,18 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue,
}
template
-uint32_t __checkEcc<TYPE_MCA>( ExtensibleChip * i_chip, TdQueue & io_queue,
- const MemAddr & i_addr, bool & o_errorsFound,
- STEP_CODE_DATA_STRUCT & io_sc );
+uint32_t __checkEcc<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip,
+ TdQueue & io_queue,
+ const MemAddr & i_addr,
+ bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc );
+/* TODO RTC 157888
template
uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, TdQueue & io_queue,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc );
+*/
//------------------------------------------------------------------------------
OpenPOWER on IntegriCloud