summary | refs | log | tree | commit | diff | stats
path: root/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
diff options
context:
space:
mode:
Diffstat (limited to 'src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C')
-rw-r--r-- src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C 452
1 files changed, 440 insertions, 12 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
index 9869a8c08..f206a074e 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
@@ -127,6 +127,87 @@ uint32_t handleMemUe<TYPE_MCA>( ExtensibleChip * i_chip, const MemAddr & i_addr,
i_chip->getHuid(), i_type );
break;
}
+
+ #ifdef __HOSTBOOT_RUNTIME
+ // Increment the UE counter and store the rank we're on, resetting
+ // the UE and CE counts if we have stopped on a new rank.
+ ExtensibleChip * mcb = getConnectedParent( i_chip, TYPE_MCBIST );
+ McbistDataBundle * mcbdb = getMcbistDataBundle(mcb);
+ if ( mcbdb->iv_ceUeRank != i_addr.getRank() )
+ {
+ mcbdb->iv_ceStopCounter.reset();
+ mcbdb->iv_ueStopCounter.reset();
+ }
+ mcbdb->iv_ueStopCounter.inc( io_sc );
+ mcbdb->iv_ceUeRank = i_addr.getRank();
+ #endif
+ }
+
+ } while (0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+template<>
+uint32_t handleMemUe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ const MemAddr & i_addr,
+ UE_TABLE::Type i_type,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[MemEcc::handleMemUe<TYPE_OCMB_CHIP>] "
+
+ PRDF_ASSERT( nullptr != i_chip );
+ PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
+
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ // First check to see if this is a side-effect UE.
+ SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister("OCMB_LFIR");
+ o_rc = fir->Read();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "Read() failed on OCMB_LFIR: i_chip=0x%08x",
+ i_chip->getHuid() );
+ break;
+ }
+
+ // Check OCMB_LFIR[38] to determine if this is a side-effect.
+ if ( fir->IsBitSet(38) )
+ {
+ // This is a side-effect. Callout the OCMB.
+ PRDF_TRAC( PRDF_FUNC "Memory UE is side-effect of DDRPHY error" );
+ io_sc.service_data->SetCallout( i_chip->getTrgt() );
+ io_sc.service_data->setServiceCall();
+ }
+ else
+ {
+ // Handle the memory UE.
+ o_rc = __handleMemUe<TYPE_OCMB_CHIP>( i_chip, i_addr, i_type,
+ io_sc );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "__handleMemUe(0x%08x,%d) failed",
+ i_chip->getHuid(), i_type );
+ break;
+ }
+
+ #ifdef __HOSTBOOT_RUNTIME
+ // Increment the UE counter and store the rank we're on, resetting
+ // the UE and CE counts if we have stopped on a new rank.
+ OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip);
+ if ( ocmbdb->iv_ceUeRank != i_addr.getRank() )
+ {
+ ocmbdb->iv_ceStopCounter.reset();
+ ocmbdb->iv_ueStopCounter.reset();
+ }
+ ocmbdb->iv_ueStopCounter.inc( io_sc );
+ ocmbdb->iv_ceUeRank = i_addr.getRank();
+ #endif
+
}
} while (0);
@@ -328,6 +409,52 @@ uint32_t maskMemPort<TYPE_MCA>( ExtensibleChip * i_chip )
#undef PRDF_FUNC
}
+// Masks all FIRs on an OCMB by setting every bit in both of its chiplet FIR
+// mask registers and, at Hostboot runtime, dynamically deallocates the port.
+// @param  i_chip The target chip; must be non-null and of TYPE_OCMB_CHIP.
+// @return SUCCESS when both mask registers were written, otherwise the
+//         bitwise OR of the two Write() return codes. The deallocation is
+//         only attempted after both writes succeed; a failure from
+//         MemDealloc::port() is traced but does not change the return code.
+template<>
+uint32_t maskMemPort<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
+{
+    #define PRDF_FUNC "[MemEcc::maskMemPort<TYPE_OCMB_CHIP>] "
+
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
+
+    uint32_t o_rc = SUCCESS;
+
+    do
+    {
+        // Mask all FIRs on the OCMB in the chiplet FIRs.
+        SCAN_COMM_REGISTER_CLASS * chipletMask =
+            i_chip->getRegister("OCMB_CHIPLET_FIR_MASK");
+        SCAN_COMM_REGISTER_CLASS * chipletSpaMask =
+            i_chip->getRegister("OCMB_CHIPLET_SPA_FIR_MASK");
+
+        chipletMask->setAllBits();
+        chipletSpaMask->setAllBits();
+
+        // Both writes are always attempted, even if the first one fails.
+        // They are issued as separate statements because the order in which
+        // the operands of '|' are evaluated is unspecified, which would
+        // leave the order of the two hardware writes up to the compiler.
+        const uint32_t maskRc    = chipletMask->Write();
+        const uint32_t spaMaskRc = chipletSpaMask->Write();
+
+        o_rc = maskRc | spaMaskRc;
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() );
+            break;
+        }
+
+        #ifdef __HOSTBOOT_RUNTIME
+
+        // Dynamically deallocate the port. A failure here is only traced.
+        if ( SUCCESS != MemDealloc::port<TYPE_OCMB_CHIP>( i_chip ) )
+        {
+            PRDF_ERR( PRDF_FUNC "MemDealloc::port<TYPE_OCMB_CHIP>(0x%08x) "
+                      "failed", i_chip->getHuid() );
+        }
+
+        #endif
+
+    } while (0);
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+
#endif // __HOSTBOOT_MODULE
//------------------------------------------------------------------------------
@@ -390,6 +517,62 @@ uint32_t triggerPortFail<TYPE_MCA>( ExtensibleChip * i_chip )
#undef PRDF_FUNC
}
+// Triggers a port fail on an OCMB by setting two inject bits in FARB0, then
+// resets every IUE threshold in the chip's data bundle and records that a
+// port fail has been issued.
+// @param  i_chip The target chip; must be non-null and of TYPE_OCMB_CHIP.
+// @return SUCCESS, or the return code of the failing FARB0 Read()/Write().
+//         On failure the thresholds and the port-fail flag are left alone.
+template<>
+uint32_t triggerPortFail<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
+{
+    #define PRDF_FUNC "[MemEcc::triggerPortFail<TYPE_OCMB_CHIP>] "
+
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
+
+    OcmbDataBundle * bundle = getOcmbDataBundle( i_chip );
+
+    // The port fail is triggered through these FARB0 bits:
+    //   FARB0[59] - MBA_FARB0Q_CFG_INJECT_PARITY_ERR_CONSTANT
+    //   FARB0[40] - MBA_FARB0Q_CFG_INJECT_PARITY_ERR_ADDR5
+    SCAN_COMM_REGISTER_CLASS * farb0Reg = i_chip->getRegister("FARB0");
+
+    // Read-modify-write so that only the two inject bits change.
+    uint32_t o_rc = farb0Reg->Read();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Read() FARB0 failed: i_chip=0x%08x",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    farb0Reg->SetBit(59);
+    farb0Reg->SetBit(40);
+
+    o_rc = farb0Reg->Write();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Write() FARB0 failed: i_chip=0x%08x",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    // Reset the thresholds to prevent issuing multiple port failures on the
+    // same port.
+    for ( auto & entry : bundle->iv_iueTh )
+    {
+        entry.second.reset();
+    }
+
+    bundle->iv_iuePortFail = true;
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+
#endif // __HOSTBOOT_RUNTIME
//------------------------------------------------------------------------------
@@ -420,6 +603,30 @@ bool queryIueTh<TYPE_MCA>( ExtensibleChip * i_chip,
return iueAtTh;
}
+// Reports whether any IUE threshold stored in the OCMB's data bundle has
+// been reached.
+// @param  i_chip The target chip; must be non-null and of TYPE_OCMB_CHIP.
+// @param  io_sc  The step code data struct, passed to each thReached() call.
+// @return True if at least one threshold reports thReached(), else false.
+template<>
+bool queryIueTh<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                 STEP_CODE_DATA_STRUCT & io_sc )
+{
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
+
+    OcmbDataBundle * bundle = getOcmbDataBundle( i_chip );
+
+    // No early exit: thReached() is called on every entry in the map, even
+    // after one of them has already reported that it is at threshold.
+    bool anyAtTh = false;
+    for ( auto & entry : bundle->iv_iueTh )
+    {
+        if ( entry.second.thReached(io_sc) ) anyAtTh = true;
+    }
+
+    return anyAtTh;
+}
+
+
#endif
//------------------------------------------------------------------------------
@@ -493,6 +700,11 @@ template
uint32_t handleMpe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr,
UE_TABLE::Type i_type,
STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t handleMpe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ const MemAddr & i_addr,
+ UE_TABLE::Type i_type,
+ STEP_CODE_DATA_STRUCT & io_sc );
//------------------------------------------------------------------------------
@@ -581,6 +793,10 @@ template
uint32_t analyzeFetchMpe<TYPE_MBA>( ExtensibleChip * i_chip,
const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t analyzeFetchMpe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc );
//------------------------------------------------------------------------------
@@ -794,6 +1010,9 @@ uint32_t analyzeFetchNceTce<TYPE_MCA>( ExtensibleChip * i_chip,
template
uint32_t analyzeFetchNceTce<TYPE_MBA>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t analyzeFetchNceTce<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
//------------------------------------------------------------------------------
@@ -871,6 +1090,9 @@ uint32_t analyzeFetchUe<TYPE_MCA>( ExtensibleChip * i_chip,
template
uint32_t analyzeFetchUe<TYPE_MBA>( ExtensibleChip * i_chip,
STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t analyzeFetchUe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
//------------------------------------------------------------------------------
@@ -955,16 +1177,97 @@ uint32_t handleMemIue<TYPE_MCA>( ExtensibleChip * i_chip,
#undef PRDF_FUNC
}
-//------------------------------------------------------------------------------
-
template<>
-uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip,
+uint32_t handleMemIue<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                       const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc )
{
+    // Calls out the given rank and, in Hostboot builds, counts the IUE
+    // against a per-DIMM-select threshold. Returns SUCCESS or the return
+    // code of the failing register write / maskMemPort() call.
+    #define PRDF_FUNC "[MemEcc::handleMemIue] "
+
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
+
+    uint32_t o_rc = SUCCESS;
+
+    // The rank is always added to the callout list, whatever happens below.
+    MemoryMru rankMru { i_chip->getTrgt(), i_rank,
+                        MemoryMruData::CALLOUT_RANK };
+    io_sc.service_data->SetCallout( rankMru );
+
+    #ifdef __HOSTBOOT_MODULE
+
+    do
+    {
+        // A system checkstop needs nothing beyond the callout.
+        if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break;
+
+        OcmbDataBundle * bundle = getOcmbDataBundle( i_chip );
+
+        // Once a port fail has been caused, mask the IUE bits.
+        if ( true == bundle->iv_iuePortFail )
+        {
+            SCAN_COMM_REGISTER_CLASS * iueMask =
+                i_chip->getRegister("RDFFIR_MASK_OR");
+
+            iueMask->SetBit(17);
+            iueMask->SetBit(37);
+
+            o_rc = iueMask->Write();
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x",
+                          i_chip->getHuid() );
+                break;
+            }
+        }
+
+        // Thresholds are keyed by DIMM select and created on first use.
+        const uint8_t dimmSlct = i_rank.getDimmSlct();
+        if ( 0 == bundle->iv_iueTh.count(dimmSlct) )
+        {
+            bundle->iv_iueTh[dimmSlct] = TimeBasedThreshold( getIueTh() );
+        }
+
+        // Count this IUE; nothing more to do while below threshold.
+        if ( !bundle->iv_iueTh[dimmSlct].inc(io_sc) ) break;
+
+        // At threshold: make the error log predictive.
+        io_sc.service_data->setServiceCall();
+
+        // The port fail will be triggered in the PostAnalysis plugin after
+        // the error log has been committed. Here, only mask off the entire
+        // port to avoid collateral.
+        o_rc = MemEcc::maskMemPort<TYPE_OCMB_CHIP>( i_chip );
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort(0x%08x) failed",
+                      i_chip->getHuid() );
+        }
+
+    } while (0);
+
+    #endif // __HOSTBOOT_MODULE
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template<TARGETING::TYPE T>
+uint32_t analyzeMainlineIue( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
#define PRDF_FUNC "[MemEcc::analyzeMainlineIue] "
PRDF_ASSERT( nullptr != i_chip );
- PRDF_ASSERT( TYPE_MCA == i_chip->getType() );
+ PRDF_ASSERT( T == i_chip->getType() );
uint32_t o_rc = SUCCESS;
@@ -974,7 +1277,7 @@ uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip,
// not likely that we will have two independent failure modes at the
// same time. So we just assume the address is correct.
MemAddr addr;
- o_rc = getMemReadAddr<TYPE_MCA>( i_chip, MemAddr::READ_RCE_ADDR, addr );
+ o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_RCE_ADDR, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed",
@@ -983,7 +1286,7 @@ uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip,
}
MemRank rank = addr.getRank();
- o_rc = handleMemIue<TYPE_MCA>( i_chip, rank, io_sc );
+ o_rc = handleMemIue<T>( i_chip, rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed",
@@ -998,16 +1301,23 @@ uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip,
#undef PRDF_FUNC
}
+template
+uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t analyzeMainlineIue<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
+
//------------------------------------------------------------------------------
-template<>
-uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip,
- STEP_CODE_DATA_STRUCT & io_sc )
+template<TARGETING::TYPE T>
+uint32_t analyzeMaintIue( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemEcc::analyzeMaintIue] "
PRDF_ASSERT( nullptr != i_chip );
- PRDF_ASSERT( TYPE_MCA == i_chip->getType() );
+ PRDF_ASSERT( T == i_chip->getType() );
uint32_t o_rc = SUCCESS;
@@ -1015,7 +1325,7 @@ uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip,
{
// Use the current address in the MCBMCAT.
MemAddr addr;
- o_rc = getMemMaintAddr<TYPE_MCA>( i_chip, addr );
+ o_rc = getMemMaintAddr<T>( i_chip, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
@@ -1024,7 +1334,7 @@ uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip,
}
MemRank rank = addr.getRank();
- o_rc = handleMemIue<TYPE_MCA>( i_chip, rank, io_sc );
+ o_rc = handleMemIue<T>( i_chip, rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed",
@@ -1039,6 +1349,13 @@ uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip,
#undef PRDF_FUNC
}
+template
+uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t analyzeMaintIue<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
+
//------------------------------------------------------------------------------
template<>
@@ -1152,6 +1469,117 @@ uint32_t analyzeImpe<TYPE_MCA>( ExtensibleChip * i_chip,
#undef PRDF_FUNC
}
+// Analyzes an IMPE attention on an OCMB. The galois field code and rank are
+// taken from the mark shadow register (EXP_MSR), the rank is called out, and
+// in Hostboot builds the event is counted against the IMPE threshold, which
+// may result in a predictive callout or a chip mark on the failing DRAM.
+// @param  i_chip The target chip; must be non-null and of TYPE_OCMB_CHIP.
+// @param  io_sc  The step code data struct; receives the callout and, when
+//                applicable, the service call flag.
+// @return SUCCESS, FAIL if the galois code does not map to a valid symbol,
+//         or the return code of the failing register/mark store operation.
+template<>
+uint32_t analyzeImpe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                      STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[MemEcc::analyzeImpe] "
+
+    // Check for null before i_chip is dereferenced by the type check below.
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
+
+    uint32_t o_rc = SUCCESS;
+
+    do
+    {
+        // get the mark shadow register
+        SCAN_COMM_REGISTER_CLASS * msr = i_chip->getRegister("EXP_MSR");
+
+        o_rc = msr->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "Read() failed on EXP_MSR: i_chip=0x%08x",
+                      i_chip->getHuid() );
+            break;
+        }
+
+        TargetHandle_t trgt = i_chip->getTrgt();
+
+        // get galois field code - bits 8:15 of MSR
+        uint8_t galois = msr->GetBitFieldJustified( 8, 8 );
+
+        // get rank - bits 16:18 of MSR
+        uint8_t mrnk = msr->GetBitFieldJustified( 16, 3 );
+        MemRank rank( mrnk );
+
+        // get symbol and DRAM
+        MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois );
+        if ( !symbol.isValid() )
+        {
+            PRDF_ERR( PRDF_FUNC "Galois 0x%02x from EXP_MSR is invalid: 0x%08x,"
+                      "0x%02x", galois, i_chip->getHuid(), rank.getKey() );
+            o_rc = FAIL;
+            break;
+        }
+
+        // Add the DIMM to the callout list
+        MemoryMru memmru( trgt, rank, MemoryMruData::CALLOUT_RANK );
+        io_sc.service_data->SetCallout( memmru );
+
+        #ifdef __HOSTBOOT_MODULE
+        // get data bundle from chip
+        OcmbDataBundle * db = getOcmbDataBundle( i_chip );
+        uint8_t dram = symbol.getDram();
+
+        // Increment the count and check threshold.
+        if ( db->getImpeThresholdCounter()->inc(rank, dram, io_sc) )
+        {
+            // Make the error log predictive if DRAM Repairs are disabled or if
+            // the number of DRAMs on this rank with IMPEs has reached threshold
+            if ( areDramRepairsDisabled() ||
+                 db->getImpeThresholdCounter()->queryDrams(rank, dram, io_sc) )
+            {
+                io_sc.service_data->setServiceCall();
+            }
+            else // Otherwise, place a chip mark on the failing DRAM.
+            {
+                MemMark chipMark( trgt, rank, galois );
+                o_rc = MarkStore::writeChipMark<TYPE_OCMB_CHIP>( i_chip, rank,
+                                                                 chipMark );
+                if ( SUCCESS != o_rc )
+                {
+                    PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) failed",
+                              i_chip->getHuid(), rank.getKey() );
+                    break;
+                }
+
+                o_rc = MarkStore::chipMarkCleanup<TYPE_OCMB_CHIP>( i_chip, rank,
+                                                                   io_sc );
+                if ( SUCCESS != o_rc )
+                {
+                    PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed",
+                              i_chip->getHuid(), rank.getKey() );
+                    break;
+                }
+            }
+        }
+
+        // If a predictive callout is made, mask both mainline and maintenance
+        // attentions.
+        if ( io_sc.service_data->queryServiceCall() )
+        {
+            SCAN_COMM_REGISTER_CLASS * mask
+                = i_chip->getRegister( "RDFFIR_MASK_OR" );
+            mask->SetBit(19); // mainline
+            mask->SetBit(39); // maintenance
+            o_rc = mask->Write();
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_OR: "
+                          "0x%08x", i_chip->getHuid() );
+                break;
+            }
+        }
+        #endif // __HOSTBOOT_MODULE
+
+    } while (0);
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
//------------------------------------------------------------------------------
template<>
OpenPOWER on IntegriCloud