summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf/plat/mem
diff options
context:
space:
mode:
Diffstat (limited to 'src/usr/diag/prdf/plat/mem')
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemDsd.H4
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C93
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C81
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C506
-rwxr-xr-xsrc/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C10
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C76
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C62
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H16
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C16
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C500
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H22
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C71
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C399
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemVcm.C118
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemVcm.H5
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C9
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C72
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfP9Mca.C835
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C6
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H21
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C147
21 files changed, 2637 insertions, 432 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd.H b/src/usr/diag/prdf/plat/mem/prdfMemDsd.H
index 5990a902e..063e92775 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemDsd.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2018 */
+/* Contributors Listed Below - COPYRIGHT 2018,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -47,7 +47,7 @@ class DsdEvent : public TdEntry
/**
* @brief Constructor
- * @param i_chip MCA or MBA.
+ * @param i_chip MCA, MBA, or OCMB.
* @param i_rank Rank reporting chip mark.
*/
DsdEvent<T>( ExtensibleChip * i_chip, const MemRank & i_rank,
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C
index 70a6be7f2..9dbaeeb3c 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2018 */
+/* Contributors Listed Below - COPYRIGHT 2018,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -30,6 +30,8 @@
#include <prdfMemDqBitmap.H>
#include <prdfMemDsd.H>
+#include <hwp_wrappers.H>
+
using namespace TARGETING;
namespace PRDF
@@ -37,18 +39,12 @@ namespace PRDF
using namespace PlatServices;
-//##############################################################################
-//
-// Specializations for MBA
-//
-//##############################################################################
-
-template<>
-uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<TARGETING::TYPE T>
+uint32_t DsdEvent<T>::checkEcc( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::checkEcc] "
+ #define PRDF_FUNC "[DsdEvent<T>::checkEcc] "
uint32_t o_rc = SUCCESS;
@@ -71,7 +67,7 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
// At this point we don't actually have an address for the UE. The
// best we can do is get the address in which the command stopped.
MemAddr addr;
- o_rc = getMemMaintAddr<TYPE_MBA>( iv_chip, addr );
+ o_rc = getMemMaintAddr<T>( iv_chip, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
@@ -79,8 +75,8 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
break;
}
- o_rc = MemEcc::handleMemUe<TYPE_MBA>( iv_chip, addr,
- UE_TABLE::SCRUB_UE, io_sc );
+ o_rc = MemEcc::handleMemUe<T>( iv_chip, addr,
+ UE_TABLE::SCRUB_UE, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed",
@@ -101,12 +97,12 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
//------------------------------------------------------------------------------
-template<>
-uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<TARGETING::TYPE T>
+uint32_t DsdEvent<T>::verifySpare( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::verifySpare] "
+ #define PRDF_FUNC "[DsdEvent<T>::verifySpare] "
uint32_t o_rc = SUCCESS;
@@ -166,7 +162,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
PRDFSIG_DsdDramSpared );
// Remove the chip mark.
- o_rc = MarkStore::clearChipMark<TYPE_MBA>( iv_chip, iv_rank );
+ o_rc = MarkStore::clearChipMark<T>( iv_chip, iv_rank );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "clearChipMark(0x%08x,0x%02x) failed",
@@ -190,7 +186,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
template<>
uint32_t DsdEvent<TYPE_MBA>::startCmd()
{
- #define PRDF_FUNC "[DsdEvent::startCmd] "
+ #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::startCmd] "
uint32_t o_rc = SUCCESS;
@@ -231,7 +227,54 @@ uint32_t DsdEvent<TYPE_MBA>::startCmd()
//------------------------------------------------------------------------------
template<>
-uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
+uint32_t DsdEvent<TYPE_OCMB_CHIP>::startCmd()
+{
+    // Starts the maintenance command for the current phase of this DSD
+    // procedure on master rank iv_rank of iv_chip. Returns SUCCESS, or the
+    // non-SUCCESS code of the command that failed to start. When CONFIG_AXONE
+    // is not defined, no command is started and SUCCESS is returned.
+
+    #define PRDF_FUNC "[DsdEvent<TYPE_OCMB_CHIP>::startCmd] "
+
+    uint32_t o_rc = SUCCESS;
+
+    #ifdef CONFIG_AXONE
+
+    // The stop conditions are left default-constructed for both phases.
+    mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond;
+
+    switch ( iv_phase )
+    {
+        case TD_PHASE_1:
+            // Start the steer cleanup procedure on this master rank.
+            o_rc = startTdSteerCleanup<TYPE_OCMB_CHIP>( iv_chip, iv_rank,
+                                                        MASTER_RANK, stopCond );
+            if ( SUCCESS != o_rc )
+            {
+                // The key is zero padded (%02x) like the other DsdEvent traces.
+                PRDF_ERR( PRDF_FUNC "startTdSteerCleanup(0x%08x,0x%02x) failed",
+                          iv_chip->getHuid(), getKey() );
+            }
+            break;
+
+        case TD_PHASE_2:
+            // Start the superfast read procedure on this master rank.
+            o_rc = startTdSfRead<TYPE_OCMB_CHIP>( iv_chip, iv_rank, MASTER_RANK,
+                                                  stopCond );
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "startTdSfRead(0x%08x,0x%02x) failed",
+                          iv_chip->getHuid(), getKey() );
+            }
+            break;
+
+        default: PRDF_ASSERT( false ); // invalid phase
+    }
+
+    #endif
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template<TARGETING::TYPE T>
+uint32_t DsdEvent<T>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
{
uint32_t signature = 0;
@@ -260,5 +303,9 @@ uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
+// Avoid linker errors with the template.
+template class DsdEvent<TYPE_MBA>;
+template class DsdEvent<TYPE_OCMB_CHIP>;
+
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C
index 42b7eb9fc..1478a666d 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2018 */
+/* Contributors Listed Below - COPYRIGHT 2018,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -29,6 +29,8 @@
#include <prdfCenMbaExtraSig.H>
#include <prdfMemDsd.H>
+#include <hwp_wrappers.H>
+
using namespace TARGETING;
namespace PRDF
@@ -36,18 +38,12 @@ namespace PRDF
using namespace PlatServices;
-//##############################################################################
-//
-// Specializations for MBA
-//
-//##############################################################################
-
-template<>
-uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<TARGETING::TYPE T>
+uint32_t DsdEvent<T>::checkEcc( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::checkEcc] "
+ #define PRDF_FUNC "[DsdEvent<T>::checkEcc] "
uint32_t o_rc = SUCCESS;
@@ -64,7 +60,7 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
// At this point we don't actually have an address for the UE. The
// best we can do is get the address in which the command stopped.
MemAddr addr;
- o_rc = getMemMaintAddr<TYPE_MBA>( iv_chip, addr );
+ o_rc = getMemMaintAddr<T>( iv_chip, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
@@ -72,8 +68,8 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
break;
}
- o_rc = MemEcc::handleMemUe<TYPE_MBA>( iv_chip, addr,
- UE_TABLE::SCRUB_UE, io_sc );
+ o_rc = MemEcc::handleMemUe<T>( iv_chip, addr,
+ UE_TABLE::SCRUB_UE, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed",
@@ -83,7 +79,7 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
// Because of the UE, any further TPS requests will likely have no
// effect. So ban all subsequent requests.
- MemDbUtils::banTps<TYPE_MBA>( iv_chip, addr.getRank() );
+ MemDbUtils::banTps<T>( iv_chip, addr.getRank() );
// Leave the mark in place and abort this procedure.
o_done = true; break;
@@ -114,12 +110,12 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
//------------------------------------------------------------------------------
-template<>
-uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<TARGETING::TYPE T>
+uint32_t DsdEvent<T>::verifySpare( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::verifySpare] "
+ #define PRDF_FUNC "[DsdEvent<T>::verifySpare] "
uint32_t o_rc = SUCCESS;
@@ -134,7 +130,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
// error (i.e. a UE).
bool lastAddr = false;
- o_rc = didCmdStopOnLastAddr<TYPE_MBA>( iv_chip, MASTER_RANK, lastAddr );
+ o_rc = didCmdStopOnLastAddr<T>( iv_chip, MASTER_RANK, lastAddr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "didCmdStopOnLastAddr(0x%08x) failed",
@@ -155,7 +151,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_DsdDramSpared );
// Remove the chip mark.
- o_rc = MarkStore::clearChipMark<TYPE_MBA>( iv_chip, iv_rank );
+ o_rc = MarkStore::clearChipMark<T>( iv_chip, iv_rank );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "clearChipMark(0x%08x,0x%02x) failed",
@@ -179,7 +175,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns,
template<>
uint32_t DsdEvent<TYPE_MBA>::startCmd()
{
- #define PRDF_FUNC "[DsdEvent::startCmd] "
+ #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::startCmd] "
uint32_t o_rc = SUCCESS;
@@ -224,7 +220,38 @@ uint32_t DsdEvent<TYPE_MBA>::startCmd()
//------------------------------------------------------------------------------
template<>
-uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
+uint32_t DsdEvent<TYPE_OCMB_CHIP>::startCmd()
+{
+    // Starts the time based scrub command for this DSD procedure on master
+    // rank iv_rank of iv_chip. Returns SUCCESS, or the non-SUCCESS code from
+    // startTdScrub(). When CONFIG_AXONE is not defined, no command is started
+    // and SUCCESS is returned.
+
+    #define PRDF_FUNC "[DsdEvent<TYPE_OCMB_CHIP>::startCmd] "
+
+    uint32_t o_rc = SUCCESS;
+
+    #ifdef CONFIG_AXONE
+
+    mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond;
+
+    // Only the pause-on-UE stop condition is enabled; the rest stay at their
+    // defaults.
+    stopCond.set_pause_on_ue(mss::ON);
+
+    // Start the time based scrub procedure on this master rank.
+    o_rc = startTdScrub<TYPE_OCMB_CHIP>( iv_chip, iv_rank, MASTER_RANK,
+                                         stopCond );
+    if ( SUCCESS != o_rc )
+    {
+        // The key is zero padded (%02x) like the other DsdEvent traces.
+        PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%02x) failed",
+                  iv_chip->getHuid(), getKey() );
+    }
+
+    #endif
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template<TARGETING::TYPE T>
+uint32_t DsdEvent<T>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
{
uint32_t signature = 0;
@@ -258,5 +285,9 @@ uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
+// Avoid linker errors with the template.
+template class DsdEvent<TYPE_MBA>;
+template class DsdEvent<TYPE_OCMB_CHIP>;
+
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C
index 41b0de3ea..40653ee09 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2017,2018 */
+/* Contributors Listed Below - COPYRIGHT 2017,2020 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -64,7 +64,7 @@ bool isEnabled()
!isMfgAvpEnabled() && !isMfgHdatAvpEnabled() );
}
-int32_t __getAddrConfig( ExtensibleChip * i_mcaChip, uint8_t i_dslct,
+int32_t __getAddrConfig( ExtensibleChip * i_chip, uint8_t i_dslct,
bool & o_twoDimmConfig, uint8_t & o_mrnkBits,
uint8_t & o_srnkBits, uint8_t & o_extraRowBits )
{
@@ -72,12 +72,12 @@ int32_t __getAddrConfig( ExtensibleChip * i_mcaChip, uint8_t i_dslct,
int32_t o_rc = SUCCESS;
- SCAN_COMM_REGISTER_CLASS * reg = i_mcaChip->getRegister( "MC_ADDR_TRANS" );
+ SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( "MC_ADDR_TRANS" );
o_rc = reg->Read();
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS: i_mcaChip=0x%08x",
- i_mcaChip->getHuid() );
+ PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS: i_chip=0x%08x",
+ i_chip->getHuid() );
return o_rc;
}
@@ -98,8 +98,8 @@ int32_t __getAddrConfig( ExtensibleChip * i_mcaChip, uint8_t i_dslct,
// for some reason B2 is valid, there is definitely a bug.
if ( reg->IsBitSet(i_dslct ? 28:12) )
{
- PRDF_ERR( PRDF_FUNC "B2 enabled in MC_ADDR_TRANS: i_mcaChip=0x%08x "
- "i_dslct=%d", i_mcaChip->getHuid(), i_dslct );
+ PRDF_ERR( PRDF_FUNC "B2 enabled in MC_ADDR_TRANS: i_chip=0x%08x "
+ "i_dslct=%d", i_chip->getHuid(), i_dslct );
return FAIL;
}
@@ -386,7 +386,7 @@ int32_t __getPortAddr<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr,
// Local vars for address fields
uint64_t col = reverseBits(i_addr.getCol(), 7); // C9 C8 C7 C6 C5 C4 C3
uint64_t row = reverseBits(i_addr.getRow(), 18); // R17 R16 R15 .. R1 R0
- uint64_t bnk = i_addr.getBank(); // BG0 BG1 B0 B1 B2
+ uint64_t bnk = i_addr.getBank(); // B0 B1 B2 BG0 BG1
uint64_t srnk = i_addr.getRank().getSlave(); // S0 S1 S2
uint64_t mrnk = i_addr.getRank().getRankSlct(); // M0 M1
uint64_t dslct = i_addr.getRank().getDimmSlct(); // D
@@ -473,6 +473,266 @@ int32_t __getPortAddr<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr,
return o_rc;
}
+void __adjustCapiAddrBitPos( uint8_t & io_bitPos )
+{
+    // Converts a 5-bit translation bitmap value, in place, into the CAPI
+    // address bit position it encodes. The encoding is the same for every
+    // bitmap:
+    //      00000 = CAPI_Address(5)
+    //      00001 = CAPI_Address(6)
+    //      ...
+    //      01010 = CAPI_Address(15)
+    //      01011 = CAPI_Address(31)
+    //      01100 = CAPI_Address(32)
+    //      ...
+    //      10011 = CAPI_Address(39)
+    // Values up to and including 10 are therefore offset by 5, and anything
+    // above 10 is offset by 20.
+
+    const uint8_t offset = ( io_bitPos > 10 ) ? 20 : 5;
+
+    io_bitPos += offset;
+}
+
+template <>
+int32_t __getPortAddr<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, MemAddr i_addr,
+                                       uint64_t & o_addr )
+{
+    // Translates the rank/bank/row/column fields of i_addr into a port address
+    // for an OCMB, using the bit positions configured in the MC_ADDR_TRANS,
+    // MC_ADDR_TRANS1, and MC_ADDR_TRANS2 registers of i_chip.
+    //   i_chip: chip on which the address translation registers are read.
+    //   i_addr: memory address to translate.
+    //   o_addr: resulting port address. It is reset to 0 on entry and may be
+    //           only partially filled in when a non-SUCCESS code is returned.
+    // Returns SUCCESS, or the non-SUCCESS code from __getAddrConfig() or from
+    // a failed register read.
+
+    #define PRDF_FUNC "[MemDealloc::__getPortAddr<TYPE_OCMB_CHIP>] "
+
+    int32_t o_rc = SUCCESS;
+
+    o_addr = 0;
+
+    // Local vars for address fields
+    uint64_t col   = reverseBits(i_addr.getCol(),  7); // C9 C8 C7 C6 C5 C4 C3
+    uint64_t row   = reverseBits(i_addr.getRow(), 18); // R17 R16 R15 .. R1 R0
+    uint64_t bnk   = i_addr.getBank();                 // B0 B1 B2 BG0 BG1
+    uint64_t srnk  = i_addr.getRank().getSlave();      // S0 S1 S2
+    uint64_t mrnk  = i_addr.getRank().getRankSlct();   // M0 M1
+    uint64_t dslct = i_addr.getRank().getDimmSlct();   // D
+
+    // Determine if a two DIMM config is used. Also, determine how many
+    // mrank (M0-M1), srnk (S0-S2), or extra row (R17-R15) bits are used.
+    bool twoDimmConfig;
+    uint8_t mrnkBits, srnkBits, extraRowBits;
+    o_rc = __getAddrConfig( i_chip, dslct, twoDimmConfig, mrnkBits, srnkBits,
+                            extraRowBits );
+    if ( SUCCESS != o_rc ) return o_rc;
+
+    // Mask off the non-configured bits. If this address came from hardware,
+    // this would not be a problem. However, the get_mrank_range() and
+    // get_srank_range() HWPs got lazy and just set the entire fields and did
+    // not take into account the actual bit ranges.
+    mrnk = __maskBits( mrnk, mrnkBits );
+    srnk = __maskBits( srnk, srnkBits );
+    row  = __maskBits( row,  15 + extraRowBits );
+
+    // Every component below is kept in a 64-bit variable on purpose. The
+    // target bit position returned by __adjustCapiAddrBitPos() can be 31 or
+    // higher, and shifting a narrower value (which is promoted to a 32-bit
+    // int) that far is undefined and would lose the bit.
+
+    // Split the row into its components.
+    const uint64_t r17    = (row & 0x20000) >> 17;
+    const uint64_t r16    = (row & 0x10000) >> 16;
+    const uint64_t r15    = (row & 0x08000) >> 15;
+    const uint64_t r14_r0 = (row & 0x07fff);
+
+    // Split the master rank and slave rank into their components
+    const uint64_t m0 = (mrnk & 0x2) >> 1;
+    const uint64_t m1 = (mrnk & 0x1);
+
+    const uint64_t s0 = (srnk & 0x4) >> 2;
+    const uint64_t s1 = (srnk & 0x2) >> 1;
+    const uint64_t s2 = (srnk & 0x1);
+
+    // Split the column into its components
+    const uint64_t c9 = (col & 0x40) >> 6;
+    const uint64_t c8 = (col & 0x20) >> 5;
+    const uint64_t c7 = (col & 0x10) >> 4;
+    const uint64_t c6 = (col & 0x08) >> 3;
+    const uint64_t c5 = (col & 0x04) >> 2;
+    const uint64_t c4 = (col & 0x02) >> 1;
+    const uint64_t c3 = (col & 0x01);
+
+    // Split the bank and bank group into their components
+    // Note: B2 is not used for OCMB
+    const uint64_t b0 = (bnk & 0x10) >> 4;
+    const uint64_t b1 = (bnk & 0x08) >> 3;
+
+    const uint64_t bg0 = (bnk & 0x2) >> 1;
+    const uint64_t bg1 = (bnk & 0x1);
+
+    // Reads the 5-bit bitmap that starts at i_fieldPos in i_reg, converts it
+    // to a CAPI address bit position, and ORs i_bit into o_addr at that
+    // position. The shift is done on a 64-bit operand.
+    auto insertBit = [&o_addr]( SCAN_COMM_REGISTER_CLASS * i_reg,
+                                uint32_t i_fieldPos, uint64_t i_bit )
+    {
+        uint8_t bitPos = i_reg->GetBitFieldJustified( i_fieldPos, 5 );
+        __adjustCapiAddrBitPos( bitPos );
+        o_addr |= ( i_bit << bitPos );
+    };
+
+    // Row bits 14:0 are always at CAPI addr position 30:16
+    o_addr |= (r14_r0 << 16);
+
+    // Check MC_ADDR_TRANS0 register for bit positions
+    SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( "MC_ADDR_TRANS" );
+    o_rc = reg->Read();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS: i_chip=0x%08x",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    // If the DIMM select is valid, insert that bit
+    if ( twoDimmConfig )
+    {
+        insertBit( reg, 33, dslct ); // DIMM bitmap: MC_ADDR_TRANS0[33:37]
+    }
+
+    // Insert any of the master rank bits that are valid
+    switch ( mrnkBits )
+    {
+        case 2:
+            // Master rank 0 bitmap: MC_ADDR_TRANS0[38:42]
+            insertBit( reg, 38, m0 );
+            // intentional fall through
+        case 1:
+            // Master rank 1 bitmap: MC_ADDR_TRANS0[43:47]
+            insertBit( reg, 43, m1 );
+            break;
+    }
+
+    // Insert any extra row bits (17:15) that are valid
+    switch ( extraRowBits )
+    {
+        case 3:
+            // Row 17 bitmap: MC_ADDR_TRANS0[49:53]
+            insertBit( reg, 49, r17 );
+            // intentional fall through
+        case 2:
+            // Row 16 bitmap: MC_ADDR_TRANS0[54:58]
+            insertBit( reg, 54, r16 );
+            // intentional fall through
+        case 1:
+            // Row 15 bitmap: MC_ADDR_TRANS0[59:63]
+            insertBit( reg, 59, r15 );
+            break;
+    }
+
+    // Check MC_ADDR_TRANS1 register for bit positions
+    reg = i_chip->getRegister( "MC_ADDR_TRANS1" );
+    o_rc = reg->Read();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS1: i_chip=0x%08x",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    // Insert any of the slave rank bits that are valid
+    switch ( srnkBits )
+    {
+        case 3:
+            // Slave rank 0 bitmap: MC_ADDR_TRANS1[3:7]
+            insertBit( reg, 3, s0 );
+            // intentional fall through
+        case 2:
+            // Slave rank 1 bitmap: MC_ADDR_TRANS1[11:15]
+            insertBit( reg, 11, s1 );
+            // intentional fall through
+        case 1:
+            // Slave rank 2 bitmap: MC_ADDR_TRANS1[19:23]
+            insertBit( reg, 19, s2 );
+            break;
+    }
+
+    insertBit( reg, 30, c3 ); // Column 3 bitmap: MC_ADDR_TRANS1[30:34]
+    insertBit( reg, 35, c4 ); // Column 4 bitmap: MC_ADDR_TRANS1[35:39]
+    insertBit( reg, 43, c5 ); // Column 5 bitmap: MC_ADDR_TRANS1[43:47]
+    insertBit( reg, 51, c6 ); // Column 6 bitmap: MC_ADDR_TRANS1[51:55]
+    insertBit( reg, 59, c7 ); // Column 7 bitmap: MC_ADDR_TRANS1[59:63]
+
+    // Check MC_ADDR_TRANS2 register for bit positions
+    reg = i_chip->getRegister( "MC_ADDR_TRANS2" );
+    o_rc = reg->Read();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS2: i_chip=0x%08x",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    insertBit( reg,  3, c8 ); // Column 8 bitmap: MC_ADDR_TRANS2[3:7]
+    insertBit( reg, 11, c9 ); // Column 9 bitmap: MC_ADDR_TRANS2[11:15]
+    insertBit( reg, 19, b0 ); // Bank 0 bitmap: MC_ADDR_TRANS2[19:23]
+    insertBit( reg, 27, b1 ); // Bank 1 bitmap: MC_ADDR_TRANS2[27:31]
+
+    // Bank 2 bitmap: MC_ADDR_TRANS2[35:39]
+    // Note: Bank2 not used for OCMB
+
+    insertBit( reg, 43, bg0 ); // Bank group 0 bitmap: MC_ADDR_TRANS2[43:47]
+    insertBit( reg, 51, bg1 ); // Bank group 1 bitmap: MC_ADDR_TRANS2[51:55]
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
template <>
int32_t __getPortAddr<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr,
uint64_t & o_addr )
@@ -566,12 +826,12 @@ int32_t __getPortAddr<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr,
//------------------------------------------------------------------------------
template<TYPE T>
-void __getGrpPrms( ExtensibleChip * i_chip, uint8_t o_portPos,
+void __getGrpPrms( ExtensibleChip * i_chip, uint8_t & o_portPos,
SCAN_COMM_REGISTER_CLASS * &o_mcfgp,
SCAN_COMM_REGISTER_CLASS * &o_mcfgpm );
template<>
-void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t o_portPos,
+void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t & o_portPos,
SCAN_COMM_REGISTER_CLASS * &o_mcfgp,
SCAN_COMM_REGISTER_CLASS * &o_mcfgpm )
{
@@ -585,7 +845,33 @@ void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t o_portPos,
}
template<>
-void __getGrpPrms<TYPE_MBA>( ExtensibleChip * i_chip, uint8_t o_portPos,
+void __getGrpPrms<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, uint8_t & o_portPos,
+                                   SCAN_COMM_REGISTER_CLASS * &o_mcfgp,
+                                   SCAN_COMM_REGISTER_CLASS * &o_mcfgpm )
+{
+    // Returns the port position and the MCFGP/MCFGPM registers that apply to
+    // the given OCMB. The registers are looked up on the MI above the OCMB's
+    // parent MCC and are named after that MCC's position within the MI.
+
+    // Walk up the hierarchy: OCMB -> MCC -> MI.
+    ExtensibleChip * parentMcc = getConnectedParent( i_chip,    TYPE_MCC );
+    ExtensibleChip * parentMi  = getConnectedParent( parentMcc, TYPE_MI  );
+
+    // TODO RTC 210072 - support for multiple ports
+    o_portPos = 0;
+
+    // Position of the MCC relative to the MI (0:1). It selects which of the
+    // MI's MCFGP/MCFGPM register pairs is used.
+    const uint8_t mccPosInMi = parentMcc->getPos() % MAX_MCC_PER_MI;
+
+    char nameMcfgp[64];
+    char nameMcfgpm[64];
+    sprintf( nameMcfgp,  "MCFGP%d",  mccPosInMi );
+    sprintf( nameMcfgpm, "MCFGPM%d", mccPosInMi );
+
+    o_mcfgp  = parentMi->getRegister( nameMcfgp  );
+    o_mcfgpm = parentMi->getRegister( nameMcfgpm );
+}
+
+template<>
+void __getGrpPrms<TYPE_MBA>( ExtensibleChip * i_chip, uint8_t & o_portPos,
SCAN_COMM_REGISTER_CLASS * &o_mcfgp,
SCAN_COMM_REGISTER_CLASS * &o_mcfgpm )
{
@@ -686,12 +972,67 @@ uint32_t __getGrpInfo( ExtensibleChip * i_chip, uint64_t & o_grpChnls,
#undef PRDF_FUNC
}
+template<>
+uint32_t __getGrpInfo<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                       uint64_t & o_grpChnls,
+                                       uint64_t & o_grpId, uint64_t & o_grpSize,
+                                       uint64_t & o_grpBar )
+{
+    // Reads the MCFGP register that applies to this OCMB and extracts the
+    // group configuration from it: channels per group, group ID, group size,
+    // and base address (BAR). Returns SUCCESS, the non-SUCCESS code of a
+    // failed register read, or FAIL on an invalid channels-per-group
+    // encoding. The outputs are only fully set when SUCCESS is returned.
+
+    #define PRDF_FUNC "[MemDealloc::__getGrpInfo] "
+
+    // Get portPos and MCFGP/M registers. Only the MCFGP is used below.
+    uint8_t portPos = 0xFF;
+    SCAN_COMM_REGISTER_CLASS * mcfgpReg  = nullptr;
+    SCAN_COMM_REGISTER_CLASS * mcfgpmReg = nullptr;
+    __getGrpPrms<TYPE_OCMB_CHIP>( i_chip, portPos, mcfgpReg, mcfgpmReg );
+
+    const uint32_t readRc = mcfgpReg->Read();
+    if ( SUCCESS != readRc ) return readRc;
+
+    // Number of channels in this group, indexed by the MCFGP[40:42] encoding:
+    //   0 = 8MCS, 1 = 1MCS, 2 = 2MCS, 3 = 3MCS, 4 = 4MCS, 5 = 6MCS
+    static const uint64_t chnlsPerEncoding[] = { 8, 1, 2, 3, 4, 6 };
+
+    const uint8_t grpCnfg = mcfgpReg->GetBitFieldJustified( 40, 3 );
+    if ( grpCnfg >= sizeof(chnlsPerEncoding) / sizeof(chnlsPerEncoding[0]) )
+    {
+        PRDF_ERR( PRDF_FUNC "Invalid MC channels per group value: 0x%x "
+                  "on 0x%08x", grpCnfg, i_chip->getHuid() );
+        return FAIL;
+    }
+    o_grpChnls = chnlsPerEncoding[grpCnfg];
+
+    // Get the group ID and group size.
+    o_grpId   = mcfgpReg->GetBitFieldJustified( 43,  3 ); // MCFGP[43:45]
+    o_grpSize = mcfgpReg->GetBitFieldJustified( 25, 15 ); // MCFGP[25:39]
+
+    // TODO RTC 210072 - support for multiple ports, see generic handling
+
+    // Get the base address (BAR).
+    // Channel 0 is always from the MCFGP.
+    o_grpBar = mcfgpReg->GetBitFieldJustified( 1, 24 );   // MCFGP[1:24]
+
+    return SUCCESS;
+
+    #undef PRDF_FUNC
+}
+
//------------------------------------------------------------------------------
-uint32_t __insertGrpId( uint64_t & io_addr, uint64_t i_grpChnls,
- uint64_t i_grpId )
+template <TYPE T>
+uint32_t __insertGrpId( ExtensibleChip * i_chip, uint64_t & io_addr,
+ uint64_t i_grpChnls, uint64_t i_grpId )
{
- #define PRDF_FUNC "[MemDealloc::__insertGrpId] "
+ #define PRDF_FUNC "[MemDealloc::__insertGrpId<T>] "
uint32_t o_rc = SUCCESS;
@@ -742,6 +1083,108 @@ uint32_t __insertGrpId( uint64_t & io_addr, uint64_t i_grpChnls,
#undef PRDF_FUNC
}
+template<>
+uint32_t __insertGrpId<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                        uint64_t & io_addr, uint64_t i_grpChnls,
+                                        uint64_t i_grpId )
+{
+    // Inserts the group ID bits, and when both subchannels of the parent MCC
+    // are in use the subchannel select bit, into io_addr directly above its
+    // low 7 bits. The bits that were above the low 7 are shifted up to make
+    // room. Returns SUCCESS, or FAIL (leaving io_addr untouched) when
+    // i_grpChnls is not a supported channels-per-group value.
+
+    #define PRDF_FUNC "[MemDealloc::__insertGrpId<TYPE_OCMB_CHIP>] "
+
+    const uint64_t upper33 = io_addr & 0xFFFFFFFF80ull;
+    const uint64_t lower7  = io_addr & 0x000000007full;
+
+    // A subchannel counts as enabled when an OCMB is connected to the parent
+    // MCC at that position (0 = subchannel A, 1 = subchannel B).
+    ExtensibleChip * parentMcc = getConnectedParent( i_chip, TYPE_MCC );
+    ExtensibleChip * ocmbA = getConnectedChild( parentMcc, TYPE_OCMB_CHIP, 0 );
+    ExtensibleChip * ocmbB = getConnectedChild( parentMcc, TYPE_OCMB_CHIP, 1 );
+    const bool useSubchanSlct = ( nullptr != ocmbA ) && ( nullptr != ocmbB );
+
+    // With both subchannels enabled, one extra bit selecting this OCMB's
+    // subchannel (0:1) is inserted below the group ID bits.
+    uint64_t subchanBit = 0;
+    if ( useSubchanSlct )
+    {
+        subchanBit = ( i_chip->getPos() % MAX_OCMB_PER_MCC ) & 0x1;
+    }
+
+    // Number of group ID bits to insert for this channels-per-group value.
+    uint8_t grpIdBits = 0;
+    switch ( i_grpChnls )
+    {
+        case 1:
+        case 3:
+        case 6: grpIdBits = 0; break; // no group ID bits
+        case 2: grpIdBits = 1; break;
+        case 4: grpIdBits = 2; break;
+        case 8: grpIdBits = 3; break;
+
+        default:
+            PRDF_ERR( PRDF_FUNC "Invalid MC channels per group value %d",
+                      i_grpChnls );
+            return FAIL;
+    }
+
+    const uint64_t grpIdField = i_grpId & ( (1ull << grpIdBits) - 1 );
+
+    if ( useSubchanSlct )
+    {
+        // Group ID bits (if any) followed by the subchannel select bit.
+        const uint64_t inserted = ( grpIdField << 1 ) | subchanBit;
+        io_addr = ( upper33 << (grpIdBits + 1) ) | ( inserted << 7 ) | lower7;
+    }
+    else if ( 0 != grpIdBits )
+    {
+        // Group ID bits only. With no bits to insert the address is left
+        // exactly as it was passed in.
+        io_addr = ( upper33 << grpIdBits ) | ( grpIdField << 7 ) | lower7;
+    }
+
+    return SUCCESS;
+
+    #undef PRDF_FUNC
+
+}
+
//------------------------------------------------------------------------------
// The hardware uses a mod3 hashing algorithm to calculate which memory channel
@@ -849,7 +1292,7 @@ void __addBar( uint64_t & io_addr, uint64_t i_grpBar )
template<TYPE T>
uint32_t getSystemAddr( ExtensibleChip * i_chip, MemAddr i_addr,
- uint64_t & o_addr )
+ uint64_t & o_addr )
{
#define PRDF_FUNC "[MemDealloc::getSystemAddr] "
@@ -867,7 +1310,7 @@ uint32_t getSystemAddr( ExtensibleChip * i_chip, MemAddr i_addr,
if ( SUCCESS != o_rc ) break;
// Insert the group ID.
- o_rc = __insertGrpId( o_addr, grpChnls, grpId );
+ o_rc = __insertGrpId<T>( i_chip, o_addr, grpChnls, grpId );
if ( SUCCESS != o_rc ) break;
// Notes on 3 and 6 channel per group configs:
@@ -915,8 +1358,8 @@ uint32_t getSystemAddrRange( ExtensibleChip * i_chip,
if ( SUCCESS != o_rc ) break;
// Insert the group ID.
- o_rc = __insertGrpId( o_saddr, grpChnls, grpId );
- o_rc |= __insertGrpId( o_eaddr, grpChnls, grpId );
+ o_rc = __insertGrpId<T>( i_chip, o_saddr, grpChnls, grpId );
+ o_rc |= __insertGrpId<T>( i_chip, o_eaddr, grpChnls, grpId );
if ( SUCCESS != o_rc ) break;
// Notes on 3 and 6 channel per group configs:
@@ -975,6 +1418,7 @@ int32_t page( ExtensibleChip * i_chip, MemAddr i_addr )
}
template int32_t page<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr );
template int32_t page<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr );
+template int32_t page<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip, MemAddr i_addr);
//------------------------------------------------------------------------------
@@ -1025,6 +1469,7 @@ int32_t rank( ExtensibleChip * i_chip, MemRank i_rank )
}
template int32_t rank<TYPE_MCA>( ExtensibleChip * i_chip, MemRank i_rank );
template int32_t rank<TYPE_MBA>( ExtensibleChip * i_chip, MemRank i_rank );
+template int32_t rank<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip, MemRank i_rank);
//------------------------------------------------------------------------------
@@ -1074,6 +1519,7 @@ int32_t port( ExtensibleChip * i_chip )
}
template int32_t port<TYPE_MCA>( ExtensibleChip * i_chip );
template int32_t port<TYPE_MBA>( ExtensibleChip * i_chip );
+template int32_t port<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip );
//------------------------------------------------------------------------------
@@ -1236,6 +1682,22 @@ int32_t dimmList( TargetHandleList & i_dimmList )
sendPredDeallocRequest( ssAddr, seAddr );
PRDF_TRAC( PRDF_FUNC "Predictive dealloc for start addr: 0x%016llx "
"end addr: 0x%016llx", ssAddr, seAddr );
+
+ #ifdef CONFIG_NVDIMM
+ // If the DIMM is an NVDIMM, send a message to PHYP that a save/restore
+ // may work.
+ if ( isNVDIMM(*it) )
+ {
+ uint32_t l_rc = PlatServices::nvdimmNotifyProtChange( *it,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_TRAC( PRDF_FUNC "nvdimmNotifyProtChange(0x%08x) "
+ "failed.", getHuid(*it) );
+ continue;
+ }
+ }
+ #endif
}
return o_rc;
@@ -1278,6 +1740,14 @@ int32_t dimmList( TargetHandleList & i_dimmList )
break;
}
+ // Third, check for OCMBs.
+ list = getConnected( dimmTrgt, TYPE_OCMB_CHIP );
+ if ( !list.empty() )
+ {
+ o_rc = dimmList<TYPE_OCMB_CHIP>( i_dimmList );
+ break;
+ }
+
// If we get here we did not find a supported target.
PRDF_ERR( PRDF_FUNC "Unsupported connected parent to dimm 0x%08x",
getHuid(dimmTrgt) );
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C b/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C
index 869aa92e8..b257d0874 100755
--- a/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C
@@ -83,8 +83,8 @@ void MemIplCeStats<TYPE_MCA>::banAnalysis( uint8_t i_dimmSlct,
//------------------------------------------------------------------------------
template<>
-void MemIplCeStats<TYPE_MEM_PORT>::banAnalysis( uint8_t i_dimmSlct,
- uint8_t i_portSlct )
+void MemIplCeStats<TYPE_OCMB_CHIP>::banAnalysis( uint8_t i_dimmSlct,
+ uint8_t i_portSlct )
{
PRDF_ASSERT( i_dimmSlct < MAX_DIMM_PER_PORT );
PRDF_ASSERT( 0 == i_portSlct );
@@ -117,9 +117,9 @@ void MemIplCeStats<TYPE_MCA>::banAnalysis( uint8_t i_dimmSlct )
//------------------------------------------------------------------------------
template<>
-void MemIplCeStats<TYPE_MEM_PORT>::banAnalysis( uint8_t i_dimmSlct )
+void MemIplCeStats<TYPE_OCMB_CHIP>::banAnalysis( uint8_t i_dimmSlct )
{
- // Only one DIMM per DIMM select on MEM_PORT.
+ // Only one DIMM per DIMM select on OCMB_CHIP.
banAnalysis( i_dimmSlct, 0 );
}
@@ -481,6 +481,6 @@ void MemIplCeStats<T>::addMruAndCommitErrl( const MemoryMru & i_memmru,
// need these templates to avoid linker errors
template class MemIplCeStats<TYPE_MCA>;
template class MemIplCeStats<TYPE_MBA>;
-template class MemIplCeStats<TYPE_MEM_PORT>;
+template class MemIplCeStats<TYPE_OCMB_CHIP>;
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C
index 5351b842a..bececfa21 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2019 */
+/* Contributors Listed Below - COPYRIGHT 2016,2020 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -106,17 +106,6 @@ uint32_t clearCmdCompleteAttn<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
}
template<>
-uint32_t clearCmdCompleteAttn<TYPE_MEM_PORT>( ExtensibleChip * i_chip )
-{
- PRDF_ASSERT( nullptr != i_chip );
- PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() );
-
- ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP );
-
- return clearCmdCompleteAttn<TYPE_OCMB_CHIP>( ocmbChip );
-}
-
-template<>
uint32_t clearCmdCompleteAttn<TYPE_MBA>( ExtensibleChip * i_chip )
{
// Clear MBASPA[0,8].
@@ -194,17 +183,6 @@ uint32_t clearEccCounters<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
}
template<>
-uint32_t clearEccCounters<TYPE_MEM_PORT>( ExtensibleChip * i_chip )
-{
- PRDF_ASSERT( nullptr != i_chip );
- PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() );
-
- ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP );
-
- return clearEccCounters<TYPE_OCMB_CHIP>( ocmbChip );
-}
-
-template<>
uint32_t clearEccCounters<TYPE_MBA>( ExtensibleChip * i_chip )
{
PRDF_ASSERT( nullptr != i_chip );
@@ -306,17 +284,6 @@ uint32_t clearEccFirs<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
}
template<>
-uint32_t clearEccFirs<TYPE_MEM_PORT>( ExtensibleChip * i_chip )
-{
- PRDF_ASSERT( nullptr != i_chip );
- PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() );
-
- ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP );
-
- return clearEccFirs<TYPE_OCMB_CHIP>( ocmbChip );
-}
-
-template<>
uint32_t clearEccFirs<TYPE_MBA>( ExtensibleChip * i_chip )
{
uint32_t o_rc = SUCCESS;
@@ -409,22 +376,20 @@ uint32_t checkEccFirs<TYPE_MCA>( ExtensibleChip * i_chip,
//------------------------------------------------------------------------------
template<>
-uint32_t checkEccFirs<TYPE_MEM_PORT>( ExtensibleChip * i_chip,
- uint32_t & o_eccAttns )
+uint32_t checkEccFirs<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ uint32_t & o_eccAttns )
{
- #define PRDF_FUNC "[checkEccFirs<TYPE_MEM_PORT>] "
+ #define PRDF_FUNC "[checkEccFirs<TYPE_OCMB_CHIP>] "
uint32_t o_rc = SUCCESS;
o_eccAttns = MAINT_NO_ERROR;
PRDF_ASSERT( nullptr != i_chip );
- PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() );
-
- ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP );
+ PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
- SCAN_COMM_REGISTER_CLASS * rdffir = ocmbChip->getRegister( "RDFFIR" );
- SCAN_COMM_REGISTER_CLASS * mcbistfir = ocmbChip->getRegister( "MCBISTFIR" );
+ SCAN_COMM_REGISTER_CLASS * rdffir = i_chip->getRegister( "RDFFIR" );
+ SCAN_COMM_REGISTER_CLASS * mcbistfir = i_chip->getRegister( "MCBISTFIR" );
do
{
@@ -453,7 +418,7 @@ uint32_t checkEccFirs<TYPE_MEM_PORT>( ExtensibleChip * i_chip,
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "Read() failed on MCBISTFIR: mcbChip=0x%08x",
- ocmbChip->getHuid() );
+ i_chip->getHuid() );
break;
}
@@ -733,11 +698,11 @@ uint32_t setBgScrubThresholds<TYPE_MBA>( ExtensibleChip * i_chip,
//------------------------------------------------------------------------------
-template<>
-uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip,
- AddrRangeType i_rangeType,
- bool & o_stoppedOnLastAddr,
- bool i_rowRepair )
+template<TARGETING::TYPE T>
+uint32_t didCmdStopOnLastAddr( ExtensibleChip * i_chip,
+ AddrRangeType i_rangeType,
+ bool & o_stoppedOnLastAddr,
+ bool i_rowRepair )
{
#define PRDF_FUNC "[didCmdStopOnLastAddr] "
@@ -749,7 +714,7 @@ uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip,
{
// Get the current address.
MemAddr curAddr;
- o_rc = getMemMaintAddr<TYPE_MBA>( i_chip, curAddr );
+ o_rc = getMemMaintAddr<T>( i_chip, curAddr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
@@ -759,7 +724,7 @@ uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip,
// Get the end address of the current rank.
MemAddr junk, endAddr;
- o_rc = getMemAddrRange<TYPE_MBA>( i_chip, curAddr.getRank(), junk,
+ o_rc = getMemAddrRange<T>( i_chip, curAddr.getRank(), junk,
endAddr, i_rangeType );
if ( SUCCESS != o_rc )
{
@@ -784,7 +749,16 @@ uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip,
#undef PRDF_FUNC
}
-
+template
+uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip,
+ AddrRangeType i_rangeType,
+ bool & o_stoppedOnLastAddr,
+ bool i_rowRepair );
+template
+uint32_t didCmdStopOnLastAddr<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ AddrRangeType i_rangeType,
+ bool & o_stoppedOnLastAddr,
+ bool i_rowRepair );
//------------------------------------------------------------------------------
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
index f86110458..5d310c51b 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2020 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -248,8 +248,8 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip,
do
{
// Get all ports in which the command was run.
- std::vector<ExtensibleChip *> portList;
- o_rc = getMcbistMaintPort( i_chip, portList );
+ ExtensibleChipList portList;
+ o_rc = getMcbistMaintPort<TYPE_MCBIST>( i_chip, portList );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed",
@@ -291,6 +291,43 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip,
}
template<>
+uint32_t __analyzeCmdComplete<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                    TdRankListEntry & o_stoppedRank,
+                                    const MemAddr & i_addr,
+                                    bool & o_errorsFound,
+                                    STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[__analyzeCmdComplete] "
+
+    // OCMB specialization. Sets o_stoppedRank from the given address, runs
+    // the ECC check on the OCMB, and reports through o_errorsFound whether
+    // that check found anything. Returns the return code of __checkEcc().
+
+    // No errors are reported unless the ECC check below says otherwise.
+    o_errorsFound = false;
+
+    // Update the stopped rank output from the address that was passed in.
+    o_stoppedRank = __getStopRank<TYPE_OCMB_CHIP>( i_chip, i_addr );
+
+    // Check the OCMB for ECC errors.
+    bool eccErrors;
+    const uint32_t o_rc = __checkEcc<TYPE_OCMB_CHIP>( i_chip, i_addr,
+                                                      eccErrors, io_sc );
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_OCMB_CHIP>(0x%08x) failed",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    // Only propagate the result of a successful check.
+    if ( eccErrors ) o_errorsFound = true;
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+template<>
uint32_t __analyzeCmdComplete<TYPE_MBA>( ExtensibleChip * i_chip,
TdRankListEntry & o_stoppedRank,
const MemAddr & i_addr,
@@ -346,7 +383,7 @@ uint32_t MemTdCtlr<T>::analyzeCmdComplete( bool & o_errorsFound,
// of in defaultStep() because a TD procedure could have been run
// before defaultStep() and it is possible that canResumeBgScrub()
// could give as a false positive in that case.
- o_rc = canResumeBgScrub( iv_resumeBgScrub );
+ o_rc = canResumeBgScrub( iv_resumeBgScrub, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "canResumeBgScrub(0x%08x) failed",
@@ -397,9 +434,15 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc,
// Get the version to use.
uint8_t version = TD_CTLR_DATA::VERSION_1;
+ bool isNimbus = false;
if ( MODEL_NIMBUS == getChipModel(getMasterProc()) )
{
version = TD_CTLR_DATA::VERSION_2;
+ isNimbus = true;
+ }
+ else if ( MODEL_AXONE == getChipModel(getMasterProc()) )
+ {
+ version = TD_CTLR_DATA::VERSION_2;
}
// Get the IPL state.
@@ -443,6 +486,11 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc,
if ( TD_CTLR_DATA::VERSION_2 == version )
{
curPort = iv_curProcedure->getChip()->getPos() % MAX_MCA_PER_MCBIST;
+ if ( !isNimbus )
+ {
+ TargetHandle_t portTrgt = iv_curProcedure->getChip()->getTrgt();
+ curPort = portTrgt->getAttr<ATTR_REL_POS>();
+ }
}
}
@@ -475,6 +523,11 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc,
if ( TD_CTLR_DATA::VERSION_2 == version )
{
itPort = queue[n]->getChip()->getPos() % MAX_MCA_PER_MCBIST;
+ if ( !isNimbus )
+ {
+ TargetHandle_t portTrgt = queue[n]->getChip()->getTrgt();
+ itPort = portTrgt->getAttr<ATTR_REL_POS>();
+ }
}
bsb.setFieldJustify( pos, 3, itMrnk ); pos+=3;
@@ -502,6 +555,7 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc,
// Avoid linker errors with the template.
template class MemTdCtlr<TYPE_MCBIST>;
template class MemTdCtlr<TYPE_MBA>;
+template class MemTdCtlr<TYPE_OCMB_CHIP>;
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
index 332109b48..da969e2c1 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -54,14 +54,14 @@ class MemTdCtlr
/**
* @brief Constructor
*
- * This contructor will only be called in the MCBIST or MBA data bundle,
- * which already checks for a valid type.
+ * This constructor will only be called in the MCBIST, MBA, or OCMB data
+ * bundle, which already checks for a valid type.
*
* Need to initialize iv_stoppedRank to a valid entry in iv_rankList. Use
* the last entry in the list so that the 'next' rank is the first entry
* in the list.
*
- * @param i_chip An MCBIST or MBA chip.
+ * @param i_chip An MCBIST, MBA, or OCMB chip.
*/
explicit MemTdCtlr( ExtensibleChip * i_chip ) :
iv_chip( i_chip ), iv_rankList( i_chip ),
@@ -122,7 +122,7 @@ class MemTdCtlr
/**
* @brief Bans TPS on the given rank. Any attempts to add a TPS procedure
* to the queue for this rank will be ignored.
- * @param i_chip MCA or MBA chip.
+ * @param i_chip MCA, MBA, or OCMB chip.
* @param i_rank The target slave rank.
*/
void banTps( ExtensibleChip * i_chip, const MemRank & i_rank )
@@ -294,15 +294,17 @@ class MemTdCtlr
/**
* @param o_canResume True, if background scrubbing can be resumed. False,
* if a new background scrub command must be started.
+ * @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
*/
- uint32_t canResumeBgScrub( bool & o_canResume );
+ uint32_t canResumeBgScrub( bool & o_canResume,
+ STEP_CODE_DATA_STRUCT & io_sc );
#endif
private: // instance variables
- /** An MCBIST or MBA chip associated with this TD controller. */
+ /** An MCBIST, MBA, or OCMB chip associated with this TD controller. */
ExtensibleChip * const iv_chip;
/** The TD queue that contains all of the pending TD procedures. */
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
index ea04d2964..401a48042 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -160,6 +160,14 @@ bool __mnfgCeCheck<TYPE_MCA>( uint32_t i_eccAttns )
}
template<> inline
+bool __mnfgCeCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns )
+{
+    // True only when the hard NCE ETE attention is present together with at
+    // least one of the NCE or TCE attentions.
+    const bool hardNceEte = ( 0 != (i_eccAttns & MAINT_HARD_NCE_ETE) );
+    const bool nce        = ( 0 != (i_eccAttns & MAINT_NCE) );
+    const bool tce        = ( 0 != (i_eccAttns & MAINT_TCE) );
+
+    return hardNceEte && ( nce || tce );
+}
+
+template<> inline
bool __mnfgCeCheck<TYPE_MBA>( uint32_t i_eccAttns )
{
return ( 0 != (i_eccAttns & MAINT_HARD_NCE_ETE) );
@@ -251,12 +259,18 @@ template
uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t __checkEcc<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ const MemAddr & i_addr,
+ bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc );
//------------------------------------------------------------------------------
// Avoid linker errors with the template.
template class MemTdCtlr<TYPE_MCBIST>;
template class MemTdCtlr<TYPE_MBA>;
+template class MemTdCtlr<TYPE_OCMB_CHIP>;
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
index d52ef2d1d..5565e217f 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C
@@ -107,6 +107,36 @@ void __recaptureRegs<TYPE_MCBIST>( STEP_CODE_DATA_STRUCT & io_sc,
}
template<>
+void __recaptureRegs<TYPE_OCMB_CHIP>( STEP_CODE_DATA_STRUCT & io_sc,
+                                      ExtensibleChip * i_chip )
+{
+    #define PRDF_FUNC "[__recaptureRegs<TYPE_OCMB_CHIP>] "
+
+    // OCMB specialization. Flushes the listed OCMB registers from the
+    // register data cache and then captures the "MaintCmdRegs_ocmb" group
+    // into the capture data of io_sc.
+
+    RegDataCache & regCache = RegDataCache::getCachedRegisters();
+    CaptureData & captureData = io_sc.service_data->GetCaptureData();
+
+    // Registers whose cached values must be dropped before recapturing.
+    const char * regNames[] =
+    {
+        "MCBISTFIR", "RDFFIR", "MBSEC0", "MBSEC1",
+        "OCMB_MBSSYMEC0", "OCMB_MBSSYMEC1", "OCMB_MBSSYMEC2",
+        "OCMB_MBSSYMEC3", "OCMB_MBSSYMEC4", "OCMB_MBSSYMEC5",
+        "OCMB_MBSSYMEC6", "OCMB_MBSSYMEC7", "OCMB_MBSSYMEC8",
+        "MBSMSEC", "MCBMCAT",
+    };
+
+    for ( const char * name : regNames )
+    {
+        SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( name );
+        regCache.flush( i_chip, reg );
+    }
+
+    i_chip->CaptureErrorData( captureData,
+                              Util::hashString("MaintCmdRegs_ocmb") );
+
+    #undef PRDF_FUNC
+}
+
+template<>
void __recaptureRegs<TYPE_MBA>( STEP_CODE_DATA_STRUCT & io_sc,
ExtensibleChip * i_chip )
{
@@ -283,7 +313,7 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc )
PRDF_TRAC( PRDF_FUNC "Calling resumeBgScrub<T>(0x%08x)",
iv_chip->getHuid() );
- o_rc = resumeBgScrub<T>( iv_chip );
+ o_rc = resumeBgScrub<T>( iv_chip, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "resumeBgScrub<T>(0x%08x) failed",
@@ -358,9 +388,48 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip,
uint32_t count = symData.size();
switch ( T )
{
- case TYPE_MCA: PRDF_ASSERT( 1 <= count && count <= 2 ); break;
- case TYPE_MBA: PRDF_ASSERT( 1 == count ); break;
- default: PRDF_ASSERT( false );
+ case TYPE_MCA:
+ {
+ PRDF_ASSERT( 1 <= count && count <= 2 );
+ // Increment the CE counter and store the rank we're on,
+ // reset the UE and CE counts if we have stopped on a new rank.
+ ExtensibleChip * mcb = getConnectedParent(i_chip, TYPE_MCBIST);
+ McbistDataBundle * mcbdb = getMcbistDataBundle(mcb);
+ if ( mcbdb->iv_ceUeRank != i_addr.getRank() )
+ {
+ mcbdb->iv_ceStopCounter.reset();
+ mcbdb->iv_ueStopCounter.reset();
+ }
+ mcbdb->iv_ceStopCounter.inc( io_sc );
+ mcbdb->iv_ceUeRank = i_addr.getRank();
+
+ break;
+ }
+ case TYPE_MBA:
+ {
+ PRDF_ASSERT( 1 == count );
+ break;
+ }
+ case TYPE_OCMB_CHIP:
+ {
+ PRDF_ASSERT( 1 <= count && count <= 2 );
+ // Increment the CE counter and store the rank we're on,
+ // reset the UE and CE counts if we have stopped on a new rank.
+ OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip);
+ if ( ocmbdb->iv_ceUeRank != i_addr.getRank() )
+ {
+ ocmbdb->iv_ceStopCounter.reset();
+ ocmbdb->iv_ueStopCounter.reset();
+ }
+ ocmbdb->iv_ceStopCounter.inc( io_sc );
+ ocmbdb->iv_ceUeRank = i_addr.getRank();
+
+ break;
+ }
+ default:
+ {
+ PRDF_ASSERT( false );
+ }
}
for ( auto & d : symData )
@@ -408,6 +477,14 @@ uint32_t __handleSoftInterCeEte<TYPE_MCA>( ExtensibleChip * i_chip,
}
template<>
+uint32_t __handleSoftInterCeEte<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                        const MemAddr & i_addr,
+                                        STEP_CODE_DATA_STRUCT & io_sc )
+{
+    // The OCMB specialization simply forwards to the NCE ETE handler and
+    // returns its return code unchanged.
+    const uint32_t o_rc = __handleNceEte<TYPE_OCMB_CHIP>( i_chip, i_addr,
+                                                          io_sc );
+    return o_rc;
+}
+
+template<>
uint32_t __handleSoftInterCeEte<TYPE_MBA>( ExtensibleChip * i_chip,
const MemAddr & i_addr,
STEP_CODE_DATA_STRUCT & io_sc )
@@ -480,6 +557,52 @@ uint32_t __handleRceEte<TYPE_MCA>( ExtensibleChip * i_chip,
}
template<>
+uint32_t __handleRceEte<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+                                         const MemRank & i_rank,
+                                         bool & o_errorsFound,
+                                         STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[__handleRceEte] "
+
+    // OCMB specialization. o_errorsFound is set to true only when RDFFIR[37]
+    // is on; on every other path it is left untouched. Returns non-SUCCESS
+    // if the RDFFIR read or the IUE handling fails.
+
+    // Should only get this attention in MNFG mode.
+    PRDF_ASSERT( mfgMode() );
+
+    // The RCE ETE attention could be from IUE, IMPE, or IRCD. Need to check
+    // RDFFIR[37] to determine if there was at least one IUE.
+    SCAN_COMM_REGISTER_CLASS * rdffir = i_chip->getRegister( "RDFFIR" );
+
+    uint32_t o_rc = rdffir->Read();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Read() failed on RDFFIR: i_chip=0x%08x",
+                  i_chip->getHuid() );
+        return o_rc;
+    }
+
+    // Bit 37 is off: nothing else to do.
+    if ( !rdffir->IsBitSet(37) ) return o_rc;
+
+    // Handle the IUE.
+    o_errorsFound = true;
+    io_sc.service_data->AddSignatureList( i_chip->getTrgt(),
+                                          PRDFSIG_MaintIUE );
+
+    o_rc = MemEcc::handleMemIue<TYPE_OCMB_CHIP>( i_chip, i_rank, io_sc );
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "analyzeMaintIue(0x%08x) failed",
+                  i_chip->getHuid() );
+    }
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+template<>
uint32_t __handleRceEte<TYPE_MBA>( ExtensibleChip * i_chip,
const MemRank & i_rank, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc )
@@ -698,6 +821,11 @@ template
uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip,
const MemAddr & i_addr, bool & o_errorsFound,
STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t __checkEcc<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip,
+ const MemAddr & i_addr,
+ bool & o_errorsFound,
+ STEP_CODE_DATA_STRUCT & io_sc );
//------------------------------------------------------------------------------
@@ -786,6 +914,76 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::unmaskEccAttns()
//------------------------------------------------------------------------------
template<>
+uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::maskEccAttns()
+{
+    #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::maskEccAttns] "
+
+    // Writes RDFFIR_MASK_OR with only bits 8 and 9 set, which masks the
+    // mainline read NCE and TCE attentions. Returns the return code of the
+    // register write.
+    SCAN_COMM_REGISTER_CLASS * maskOr =
+                                iv_chip->getRegister( "RDFFIR_MASK_OR" );
+
+    // Start from an all-zero value so no other bits are affected.
+    maskOr->clearAllBits();
+    maskOr->SetBit(8); // Mainline read NCE
+    maskOr->SetBit(9); // Mainline read TCE
+
+    const uint32_t o_rc = maskOr->Write();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_OR" );
+    }
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template<>
+uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::unmaskEccAttns()
+{
+    #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::unmaskEccAttns] "
+
+    // Clears and unmasks the mainline read ECC attentions in RDFFIR.
+    // Both register writes are always attempted. The return value is the
+    // first non-SUCCESS return code, or SUCCESS if both writes worked.
+
+    uint32_t o_rc = SUCCESS;
+
+    // Memory CEs were masked at the beginning of the TD procedure, so
+    // clear and unmask them. Also, it is possible that memory UEs have
+    // thresholded so clear and unmask them as well.
+
+    SCAN_COMM_REGISTER_CLASS * fir  = iv_chip->getRegister( "RDFFIR_AND" );
+    SCAN_COMM_REGISTER_CLASS * mask = iv_chip->getRegister( "RDFFIR_MASK_AND" );
+
+    // These are AND registers: start with all bits on and turn off only the
+    // bits that should be cleared/unmasked.
+    fir->setAllBits(); mask->setAllBits();
+
+    // Do not unmask NCE and TCE attentions if they have been permanently
+    // masked due to certain TPS conditions.
+    if ( !(getOcmbDataBundle(iv_chip)->iv_maskMainlineNceTce) )
+    {
+        fir->ClearBit(8); mask->ClearBit(8); // Mainline read NCE
+        fir->ClearBit(9); mask->ClearBit(9); // Mainline read TCE
+    }
+    fir->ClearBit(14); mask->ClearBit(14); // Mainline read UE
+
+    o_rc = fir->Write();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_AND" );
+    }
+
+    // Still attempt the mask write, but do not let a successful mask write
+    // hide a failure of the FIR write above.
+    uint32_t l_rc = mask->Write();
+    if ( SUCCESS != l_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_AND" );
+        if ( SUCCESS == o_rc ) o_rc = l_rc;
+    }
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template<>
uint32_t MemTdCtlr<TYPE_MBA>::maskEccAttns()
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::maskEccAttns] "
@@ -887,6 +1085,13 @@ SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MCA>( ExtensibleChip * i_chip )
}
template<>
+SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_OCMB_CHIP>(
+                                                    ExtensibleChip * i_chip )
+{
+    // For an OCMB the register is looked up on the given chip itself.
+    SCAN_COMM_REGISTER_CLASS * firAnd = i_chip->getRegister( "RDFFIR_AND" );
+    return firAnd;
+}
+
+template<>
SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MBA>( ExtensibleChip * i_chip )
{
ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF );
@@ -1009,6 +1214,45 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::initialize()
}
template<>
+uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::initialize()
+{
+    #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::initialize] "
+
+    // One-time setup of the OCMB TD controller. iv_initialized is set only
+    // after every step has succeeded, so a failed attempt is retried on the
+    // next call.
+
+    // Already initialized: nothing to do.
+    if ( iv_initialized ) return SUCCESS;
+
+    // Unmask the fetch attentions just in case they were masked during a
+    // TD procedure prior to a reset/reload.
+    uint32_t o_rc = unmaskEccAttns();
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "unmaskEccAttns() failed" );
+        return o_rc;
+    }
+
+    // Find all unverified chip marks.
+    o_rc = __findChipMarks<TYPE_OCMB_CHIP>( iv_rankList );
+    if ( SUCCESS != o_rc )
+    {
+        PRDF_ERR( PRDF_FUNC "__findChipMarks() failed on 0x%08x",
+                  iv_chip->getHuid() );
+        return o_rc;
+    }
+
+    // At this point, the TD controller is initialized.
+    iv_initialized = true;
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+template<>
uint32_t MemTdCtlr<TYPE_MBA>::initialize()
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::initialize] "
@@ -1162,6 +1406,118 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::handleRrFo()
//------------------------------------------------------------------------------
template<>
+uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::handleRrFo()
+{
+    #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::handleRrFo] "
+
+    // OCMB specialization. Works through these checks in order:
+    //  1. If MCBISTFIR[10] (command complete) is already on, do nothing.
+    //  2. If MCB_CNTLSTAT[0] shows no command in progress, set MCBISTFIR[10]
+    //     through MCBISTFIR_OR and stop.
+    //  3. Otherwise, walk the rank list and stop background scrub as soon as
+    //     an unverified chip mark is found.
+    // Returns non-SUCCESS if any register access or helper call fails.
+
+    uint32_t o_rc = SUCCESS;
+
+    do
+    {
+        // Check if maintenance command complete attention is set.
+        SCAN_COMM_REGISTER_CLASS * mcbistfir =
+                                    iv_chip->getRegister("MCBISTFIR");
+        o_rc = mcbistfir->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "Read() failed on MCBISTFIR");
+            break;
+        }
+
+        // If there is a command complete attention, nothing to do, break out.
+        if ( mcbistfir->IsBitSet(10) )
+            break;
+
+
+        // Check if a command is not running.
+        // If bit 0 of MCB_CNTLSTAT is on, a mcbist run is in progress.
+        SCAN_COMM_REGISTER_CLASS * mcb_cntlstat =
+                                    iv_chip->getRegister("MCB_CNTLSTAT");
+        o_rc = mcb_cntlstat->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "Read() failed on MCB_CNTLSTAT" );
+            break;
+        }
+
+        // If a command is not running, set command complete attn, break.
+        if ( !mcb_cntlstat->IsBitSet(0) )
+        {
+            SCAN_COMM_REGISTER_CLASS * mcbistfir_or =
+                                    iv_chip->getRegister("MCBISTFIR_OR");
+            mcbistfir_or->SetBit( 10 );
+
+            // Keep the return code of the write so that a failure is both
+            // traced and returned to the caller.
+            o_rc = mcbistfir_or->Write();
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "Write() failed on MCBISTFIR_OR" );
+            }
+            break;
+        }
+
+        // Check if there are unverified chip marks.
+        std::vector<TdRankListEntry> vectorList = iv_rankList.getList();
+
+        for ( auto & entry : vectorList )
+        {
+            ExtensibleChip * ocmbChip = entry.getChip();
+            MemRank rank = entry.getRank();
+
+            // Get the chip mark
+            MemMark chipMark;
+            o_rc = MarkStore::readChipMark<TYPE_OCMB_CHIP>( ocmbChip, rank,
+                                                            chipMark );
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_OCMB_CHIP>(0x%08x,%d) "
+                          "failed", ocmbChip->getHuid(), rank.getMaster() );
+                break;
+            }
+
+            if ( !chipMark.isValid() ) continue; // no chip mark present
+
+            // Get the DQ Bitmap data.
+            MemDqBitmap dqBitmap;
+
+            o_rc = getBadDqBitmap( ocmbChip->getTrgt(), rank, dqBitmap );
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x, %d)",
+                          ocmbChip->getHuid(), rank.getMaster() );
+                break;
+            }
+
+            // Check if the chip mark is verified or not.
+            bool cmVerified = false;
+            o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified );
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark failed." );
+                break;
+            }
+
+            // If there are any unverified chip marks, stop the command, break.
+            if ( !cmVerified )
+            {
+                o_rc = stopBgScrub<TYPE_OCMB_CHIP>( iv_chip );
+                if ( SUCCESS != o_rc )
+                {
+                    PRDF_ERR( PRDF_FUNC "stopBgScrub<TYPE_OCMB_CHIP>(0x%08x) "
+                              "failed", iv_chip->getHuid() );
+                }
+                break;
+            }
+        }
+
+    } while (0);
+
+    return o_rc;
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+template<>
uint32_t MemTdCtlr<TYPE_MBA>::handleRrFo()
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::handleRrFo] "
@@ -1289,7 +1645,8 @@ uint32_t MemTdCtlr<TYPE_MBA>::handleRrFo()
//------------------------------------------------------------------------------
template<>
-uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
+uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume,
+ STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub] "
@@ -1305,21 +1662,124 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
// can use the stop conditions, which should be unique for background scrub,
// to determine if it has been configured.
- SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
- o_rc = reg->Read();
- if ( SUCCESS != o_rc )
+ do
{
- PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
- iv_chip->getHuid() );
- }
- else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
- 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
- 0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH
- reg->IsBitSet(34) && // pause on MPE
- reg->IsBitSet(35) ) // pause on UE
+ SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
+ o_rc = reg->Read();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
+ iv_chip->getHuid() );
+ break;
+ }
+ // Note: The stop conditions for background scrubbing can now be
+ // variable depending on whether we have hit threshold for the number
+ // of UEs or CEs that we have stopped on on a rank.
+
+ // If we haven't hit CE or UE threshold, check the CE stop conditions
+ if ( !getMcbistDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) &&
+ !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
+ {
+ // If the stop conditions aren't set, just break out.
+ if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
+ 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
+ 0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH
+ {
+ break;
+ }
+
+ }
+
+ // If we haven't hit UE threshold yet, check the UE stop condition
+ if ( !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
+ {
+ // If the stop condition isn't set, just break out
+ if ( !reg->IsBitSet(35) ) // pause on UE
+ {
+ break;
+ }
+ }
+
+ // Need to check the stop on mpe stop condition regardless of whether
+ // we hit the UE or CE threshold.
+ if ( reg->IsBitSet(34) ) // pause on MPE
+ {
+ // If we reach here, all the stop conditions are set for background
+ // scrub, so we can resume.
+ o_canResume = true;
+ }
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+template<>
+uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub] "
+
+ uint32_t o_rc = SUCCESS;
+
+ o_canResume = false;
+
+ // It is possible that we were running a TD procedure and the PRD service
+ // was reset. Therefore, we must check if background scrubbing was actually
+ // configured. There really is not a good way of doing this. A scrub command
+ // is a scrub command the only difference is the speed. Unfortunately, that
+ // speed can change depending on how the hardware team tunes it. For now, we
+ // can use the stop conditions, which should be unique for background scrub,
+ // to determine if it has been configured.
+
+ do
{
- o_canResume = true;
- }
+ SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" );
+ o_rc = reg->Read();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x",
+ iv_chip->getHuid() );
+ break;
+ }
+ // Note: The stop conditions for background scrubbing can now be
+ // variable depending on whether we have hit threshold for the number
+ // of UEs or CEs that we have stopped on on a rank.
+
+ // If we haven't hit CE or UE threshold, check the CE stop conditions
+ if ( !getOcmbDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) &&
+ !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
+ {
+ // If the stop conditions aren't set, just break out.
+ if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH
+ 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH
+ 0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH
+ {
+ break;
+ }
+
+ }
+
+ // If we haven't hit UE threshold yet, check the UE stop condition
+ if ( !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) )
+ {
+ // If the stop condition isn't set, just break out
+ if ( !reg->IsBitSet(35) ) // pause on UE
+ {
+ break;
+ }
+ }
+
+ // Need to check the stop on mpe stop condition regardless of whether
+ // we hit the UE or CE threshold.
+ if ( reg->IsBitSet(34) ) // pause on MPE
+ {
+ // If we reach here, all the stop conditions are set for background
+ // scrub, so we can resume.
+ o_canResume = true;
+ }
+ }while(0);
return o_rc;
@@ -1327,7 +1787,8 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume )
}
template<>
-uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume )
+uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume,
+ STEP_CODE_DATA_STRUCT & io_sc )
{
#define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::canResumeBgScrub] "
@@ -1365,6 +1826,7 @@ uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume )
// Avoid linker errors with the template.
template class MemTdCtlr<TYPE_MCBIST>;
template class MemTdCtlr<TYPE_MBA>;
+template class MemTdCtlr<TYPE_OCMB_CHIP>;
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H b/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H
index e61389ea2..2e833a12a 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H
@@ -80,8 +80,8 @@ class TdRankListEntry
private:
- ExtensibleChip * iv_chip = nullptr; ///< MCA, MBA, or MEM_PORT chip.
- MemRank iv_rank = MemRank(0); ///< Any rank on the MCA/MBA/MEM_PORT
+ ExtensibleChip * iv_chip = nullptr; ///< MCA, MBA, or OCMB chip.
+ MemRank iv_rank = MemRank(0); ///< Any rank on the MCA/MBA/OCMB
};
/**
@@ -95,7 +95,7 @@ class TdRankList
/**
* @brief Constructor.
- * @param MCBIST or MBA chip.
+ * @param MCBIST, OCMB, or MBA chip.
*/
explicit TdRankList( ExtensibleChip * i_chip );
@@ -191,17 +191,13 @@ inline TdRankList<TARGETING::TYPE_OCMB_CHIP>::TdRankList(
PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() );
- ExtensibleChipList memPortChipList = getConnected( i_chip, TYPE_MEM_PORT );
- for ( auto & memPortChip : memPortChipList )
- {
- std::vector<MemRank> rankList;
- getSlaveRanks<TYPE_MEM_PORT>( memPortChip->getTrgt(), rankList );
- PRDF_ASSERT( !rankList.empty() ); // target configured with no ranks
+ std::vector<MemRank> rankList;
+ getSlaveRanks<TYPE_OCMB_CHIP>( i_chip->getTrgt(), rankList );
+ PRDF_ASSERT( !rankList.empty() ); // target configured with no ranks
- for ( auto & rank : rankList )
- {
- iv_list.push_back( TdRankListEntry(memPortChip, rank) );
- }
+ for ( auto & rank : rankList )
+ {
+ iv_list.push_back( TdRankListEntry(i_chip, rank) );
}
}
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C
index de3e62e23..64eb74648 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -36,6 +36,8 @@
#include <prdfP9McaExtraSig.H>
#include <prdfPlatServices.H>
+#include <hwp_wrappers.H>
+
using namespace TARGETING;
namespace PRDF
@@ -125,6 +127,12 @@ bool __iueCheck<TYPE_MCA>( uint32_t i_eccAttns )
}
template<> inline
+bool __iueCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns )
+{ // OCMB: an IUE is present when i_eccAttns has MAINT_IUE set.
+ return ( 0 != (i_eccAttns & MAINT_IUE) );
+}
+
+template<> inline
bool __iueCheck<TYPE_MBA>( uint32_t i_eccAttns )
{
// IUES are reported via RCE ETE on Centaur
@@ -252,13 +260,15 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd()
uint32_t o_rc = SUCCESS;
+ #ifndef CONFIG_AXONE
+
// We don't need to set any stop-on-error conditions or thresholds for
// soft/inter/hard CEs during Memory Diagnostics. The design is to let the
// command continue to the end of the rank and we do diagnostics on the
// CE counts found in the per-symbol counters. Therefore, all we need to do
// is tell the hardware which CE types to count.
- mss::mcbist::stop_conditions stopCond;
+ mss::mcbist::stop_conditions<mss::mc_type::NIMBUS> stopCond;
switch ( iv_phase )
{
@@ -284,6 +294,8 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd()
iv_chip->getHuid(), getKey() );
}
+ #endif
+
return o_rc;
#undef PRDF_FUNC
@@ -362,11 +374,66 @@ uint32_t TpsEvent<TYPE_MBA>::startCmd()
#undef PRDF_FUNC
}
+//##############################################################################
+//
+// Specializations for OCMB
+//
+//##############################################################################
+/**
+ * @brief Starts the TPS scrub command for the current phase on an OCMB.
+ *
+ * TD_PHASE_1 enables the soft and inter CE per-symbol counters, TD_PHASE_2
+ * enables only the hard CE per-symbol counter, and any other phase asserts.
+ * The scrub is then started on this slave rank through startTdScrub().
+ *
+ * @note The body is compiled only when CONFIG_AXONE is defined. Without it
+ *       no command is started and SUCCESS is returned.
+ * @return The return code from startTdScrub(), SUCCESS if nothing failed.
+ */
+template<>
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::startCmd()
+{
+ #define PRDF_FUNC "[TpsEvent::startCmd] "
+
+ uint32_t o_rc = SUCCESS;
+
+ #ifdef CONFIG_AXONE
+
+ // We don't need to set any stop-on-error conditions or thresholds for
+ // soft/inter/hard CEs during Memory Diagnostics. The design is to let the
+ // command continue to the end of the rank and we do diagnostics on the
+ // CE counts found in the per-symbol counters. Therefore, all we need to do
+ // is tell the hardware which CE types to count.
+
+ mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond;
+
+ switch ( iv_phase )
+ {
+ case TD_PHASE_1:
+ // Set the per symbol counters to count only soft/inter CEs.
+ stopCond.set_nce_soft_symbol_count_enable( mss::ON);
+ stopCond.set_nce_inter_symbol_count_enable(mss::ON);
+ break;
+
+ case TD_PHASE_2:
+ // Set the per symbol counters to count only hard CEs.
+ stopCond.set_nce_hard_symbol_count_enable(mss::ON);
+ break;
+
+ default: PRDF_ASSERT( false ); // invalid phase
+ }
+
+ // Start the time based scrub procedure on this slave rank.
+ o_rc = startTdScrub<TYPE_OCMB_CHIP>(iv_chip, iv_rank, SLAVE_RANK, stopCond);
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed",
+ iv_chip->getHuid(), getKey() );
+ }
+
+ #endif // CONFIG_AXONE
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
//------------------------------------------------------------------------------
// Avoid linker errors with the template.
template class TpsEvent<TYPE_MCA>;
template class TpsEvent<TYPE_MBA>;
+template class TpsEvent<TYPE_OCMB_CHIP>;
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
index 187b9b28d..8b3b220c6 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
@@ -37,6 +37,8 @@
#include <prdfP9McaExtraSig.H>
#include <prdfTargetServices.H>
+#include <hwp_wrappers.H>
+
using namespace TARGETING;
namespace PRDF
@@ -54,6 +56,13 @@ static const char *mcbCeStatReg[CE_REGS_PER_PORT] =
"MCB_MBSSYMEC6", "MCB_MBSSYMEC7", "MCB_MBSSYMEC8"
};
+static const char *ocmbCeStatReg[CE_REGS_PER_PORT] =
+ { // OCMB per-symbol CE count register names, indexed by register number.
+ "OCMB_MBSSYMEC0", "OCMB_MBSSYMEC1", "OCMB_MBSSYMEC2",
+ "OCMB_MBSSYMEC3", "OCMB_MBSSYMEC4", "OCMB_MBSSYMEC5",
+ "OCMB_MBSSYMEC6", "OCMB_MBSSYMEC7", "OCMB_MBSSYMEC8"
+ };
+
//------------------------------------------------------------------------------
template <TARGETING::TYPE T>
@@ -66,6 +75,13 @@ TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip )
}
template<>
+TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_OCMB_CHIP>(
+ ExtensibleChip * i_chip )
+{ // The counter is obtained from the OCMB data bundle of i_chip.
+ return getOcmbDataBundle(i_chip)->getTpsFalseAlarmCounter();
+}
+
+template<>
TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip )
{
return getMbaDataBundle(i_chip)->getTpsFalseAlarmCounter();
@@ -73,6 +89,23 @@ TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip )
//------------------------------------------------------------------------------
+template <TARGETING::TYPE T>
+void __maskMainlineNceTces( ExtensibleChip * i_chip );
+// Specialized here for TYPE_MCA and TYPE_OCMB_CHIP; each sets
+// iv_maskMainlineNceTce to true in the data bundle of i_chip.
+template<>
+void __maskMainlineNceTces<TYPE_MCA>( ExtensibleChip * i_chip )
+{
+ getMcaDataBundle(i_chip)->iv_maskMainlineNceTce = true;
+}
+
+template<>
+void __maskMainlineNceTces<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip )
+{
+ getOcmbDataBundle(i_chip)->iv_maskMainlineNceTce = true;
+}
+
+//------------------------------------------------------------------------------
+
template<TARGETING::TYPE T>
void __getNextPhase( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc,
@@ -98,12 +131,7 @@ void __getNextPhase( ExtensibleChip * i_chip, const MemRank & i_rank,
//------------------------------------------------------------------------------
template<TARGETING::TYPE T>
-bool __badDqCount( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_badDqCount );
-
-template<>
-bool __badDqCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_badDqCount )
+bool __badDqCount(MemUtils::MaintSymbols i_nibbleStats, CeCount & io_badDqCount)
{
bool badDqFound = false;
@@ -142,11 +170,7 @@ bool __badDqCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
template<TARGETING::TYPE T>
bool __badChipCount( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_badChipCount );
-
-template<>
-bool __badChipCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_badChipCount )
+ CeCount & io_badChipCount )
{
bool badChipFound = false;
uint8_t nonZeroCount = 0;
@@ -191,11 +215,7 @@ bool __badChipCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
template<TARGETING::TYPE T>
void __sumAboveOneCount( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_sumAboveOneCount );
-
-template<>
-void __sumAboveOneCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_sumAboveOneCount )
+ CeCount & io_sumAboveOneCount )
{
uint8_t sum = 0;
MemUtils::MaintSymbols symList;
@@ -226,11 +246,7 @@ void __sumAboveOneCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
template<TARGETING::TYPE T>
void __singleSymbolCount( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_singleSymCount );
-
-template<>
-void __singleSymbolCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats,
- CeCount & io_singleSymCount )
+ CeCount & io_singleSymCount )
{
uint8_t count = 0;
bool multNonZeroSyms = false;
@@ -315,12 +331,12 @@ uint32_t __updateVpdSumAboveOne( CeCount i_sumAboveOneCount,
//------------------------------------------------------------------------------
-template <>
-uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template <TARGETING::TYPE T>
+uint32_t TpsEvent<T>::analyzeEccErrors( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[TpsEvent<TYPE_MCA>::analyzeEccErrors] "
+ #define PRDF_FUNC "[TpsEvent<T>::analyzeEccErrors] "
uint32_t o_rc = SUCCESS;
@@ -338,7 +354,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
// At this point we don't actually have an address for the UE. The
// best we can do is get the address in which the command stopped.
MemAddr addr;
- o_rc = getMemMaintAddr<TYPE_MCA>( iv_chip, addr );
+ o_rc = getMemMaintAddr<T>( iv_chip, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
@@ -346,8 +362,8 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
break;
}
- o_rc = MemEcc::handleMemUe<TYPE_MCA>( iv_chip, addr,
- UE_TABLE::SCRUB_UE, io_sc );
+ o_rc = MemEcc::handleMemUe<T>( iv_chip, addr,
+ UE_TABLE::SCRUB_UE, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed",
@@ -357,7 +373,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
// Because of the UE, any further TPS requests will likely have no
// effect. So ban all subsequent requests.
- MemDbUtils::banTps<TYPE_MCA>( iv_chip, addr.getRank() );
+ MemDbUtils::banTps<T>( iv_chip, addr.getRank() );
// Abort this procedure because additional repairs will likely
// not help (also avoids complication of having UE and MPE at
@@ -371,7 +387,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_MaintIUE );
- o_rc = MemEcc::handleMemIue<TYPE_MCA>( iv_chip, iv_rank, io_sc );
+ o_rc = MemEcc::handleMemIue<T>( iv_chip, iv_rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,0x%02x) failed",
@@ -397,8 +413,8 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_MaintMPE );
- o_rc = MemEcc::handleMpe<TYPE_MCA>( iv_chip, iv_rank,
- UE_TABLE::SCRUB_MPE, io_sc );
+ o_rc = MemEcc::handleMpe<T>( iv_chip, iv_rank,
+ UE_TABLE::SCRUB_MPE, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMpe<T>(0x%08x, 0x%02x) failed",
@@ -419,36 +435,51 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
}
+template
+uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+template
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeEccErrors(const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done);
+
//------------------------------------------------------------------------------
-template<>
-uint32_t TpsEvent<TYPE_MCA>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
+template<TARGETING::TYPE T>
+uint32_t TpsEvent<T>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
{
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_TpsFalseAlarm );
// Increase false alarm counter and check threshold.
- if ( __getTpsFalseAlarmCounter<TYPE_MCA>(iv_chip)->inc( iv_rank, io_sc) )
+ if ( __getTpsFalseAlarmCounter<T>(iv_chip)->inc( iv_rank, io_sc) )
{
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_TpsFalseAlarmTH );
// Permanently mask mainline NCEs and TCEs
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
return SUCCESS;
}
+template
+uint32_t TpsEvent<TYPE_MCA>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::handleFalseAlarm(
+ STEP_CODE_DATA_STRUCT & io_sc );
+
//------------------------------------------------------------------------------
-template<>
-uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
+template<TARGETING::TYPE T>
+uint32_t TpsEvent<T>::analyzeCeSymbolCounts( CeCount i_badDqCount,
CeCount i_badChipCount, CeCount i_sumAboveOneCount,
CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc )
{
- #define PRDF_FUNC "[TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts] "
+ #define PRDF_FUNC "[TpsEvent<T>::analyzeCeSymbolCounts] "
uint32_t o_rc = SUCCESS;
@@ -457,33 +488,33 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
bool tpsFalseAlarm = false;
// Get the Bad DQ Bitmap.
- TargetHandle_t mcaTrgt = iv_chip->getTrgt();
+ TargetHandle_t trgt = iv_chip->getTrgt();
MemDqBitmap dqBitmap;
- o_rc = getBadDqBitmap( mcaTrgt, iv_rank, dqBitmap );
+ o_rc = getBadDqBitmap( trgt, iv_rank, dqBitmap );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x, 0x%02x) failed",
- getHuid(mcaTrgt), iv_rank.getKey() );
+ getHuid(trgt), iv_rank.getKey() );
break;
}
// Get the symbol mark.
MemMark symMark;
- o_rc = MarkStore::readSymbolMark<TYPE_MCA>( iv_chip, iv_rank, symMark );
+ o_rc = MarkStore::readSymbolMark<T>( iv_chip, iv_rank, symMark );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "readSymbolMark<TYPE_MCA>(0x%08x, 0x%02x) "
+ PRDF_ERR( PRDF_FUNC "readSymbolMark<T>(0x%08x, 0x%02x) "
"failed", iv_chip->getHuid(), iv_rank.getKey() );
break;
}
// Get the chip mark.
MemMark chipMark;
- o_rc = MarkStore::readChipMark<TYPE_MCA>( iv_chip, iv_rank, chipMark );
+ o_rc = MarkStore::readChipMark<T>( iv_chip, iv_rank, chipMark );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_MCA>(0x%08x, 0x%02x) "
+ PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x, 0x%02x) "
"failed", iv_chip->getHuid(), iv_rank.getKey() );
break;
}
@@ -512,9 +543,9 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
// TCE. Both are still correctable after a symbol mark
// is placed.
// Place a symbol mark on this bad DQ.
- MemMark newSymMark( mcaTrgt, iv_rank,
+ MemMark newSymMark( trgt, iv_rank,
i_badDqCount.symList[0].symbol );
- o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip,
+ o_rc = MarkStore::writeSymbolMark<T>( iv_chip,
iv_rank, newSymMark );
if ( SUCCESS != o_rc )
{
@@ -552,7 +583,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
io_sc.service_data->setServiceCall();
// Permanently mask mainline NCEs and TCEs.
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
}
else
@@ -566,7 +597,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
else if ( 2 == i_badDqCount.count && 0 == i_badChipCount.count )
{
// Permanently mask mainline NCEs and TCEs.
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
// If the symbol mark is available.
if ( !symMark.isValid() )
@@ -587,9 +618,9 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
highSym = sym;
}
- MemMark newSymMark( mcaTrgt, iv_rank,
+ MemMark newSymMark( trgt, iv_rank,
highSym.symbol );
- o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip,
+ o_rc = MarkStore::writeSymbolMark<T>( iv_chip,
iv_rank, newSymMark );
if ( SUCCESS != o_rc )
{
@@ -669,10 +700,10 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
// This means we have only one more potential bad DQ, which
// is still correctable after a chip mark is placed.
// Place a chip mark on this bad chip.
- MemMark newChipMark( mcaTrgt, iv_rank,
+ MemMark newChipMark( trgt, iv_rank,
i_badChipCount.symList[0].symbol );
- o_rc = MarkStore::writeChipMark<TYPE_MCA>( iv_chip, iv_rank,
- newChipMark );
+ o_rc = MarkStore::writeChipMark<T>( iv_chip, iv_rank,
+ newChipMark );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) "
@@ -708,7 +739,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
io_sc.service_data->setServiceCall();
// Permanently mask mainline NCEs and TCEs
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
}
else
@@ -731,7 +762,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
io_sc.service_data->setServiceCall();
// Permanently mask mainline NCEs and TCEs
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
// If the chip mark is available.
if ( !chipMark.isValid() )
@@ -742,10 +773,10 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
// This means we have no more potential bad DQ or bad chips
// since we can't correct those after chip mark is placed.
// Place a chip mark on the bad chip.
- MemMark newChipMark( mcaTrgt, iv_rank,
+ MemMark newChipMark( trgt, iv_rank,
i_badChipCount.symList[0].symbol );
- o_rc = MarkStore::writeChipMark<TYPE_MCA>( iv_chip, iv_rank,
- newChipMark );
+ o_rc = MarkStore::writeChipMark<T>( iv_chip, iv_rank,
+ newChipMark );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) "
@@ -763,8 +794,8 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
// this chip mark, we need to clear the symbol mark now
// instead of at the end of the function to make room
// for the additional symbol mark.
- o_rc = MarkStore::clearSymbolMark<TYPE_MCA>( iv_chip,
- iv_rank );
+ o_rc = MarkStore::clearSymbolMark<T>( iv_chip,
+ iv_rank );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MarkStore::clearSymbolMark("
@@ -810,7 +841,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
io_sc.service_data->setServiceCall();
// Permanently mask mainline NCEs and TCEs.
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
}
// If the symbol mark is available.
@@ -822,9 +853,9 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
// This means we have no more potential bad DQ or bad chips
// since we can't correct those after symbol mark is placed.
// Place a symbol mark on this bad DQ.
- MemMark newSymMark( mcaTrgt, iv_rank,
+ MemMark newSymMark( trgt, iv_rank,
i_badDqCount.symList[0].symbol );
- o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip,
+ o_rc = MarkStore::writeSymbolMark<T>( iv_chip,
iv_rank, newSymMark );
if ( SUCCESS != o_rc )
{
@@ -865,7 +896,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
io_sc.service_data->setServiceCall();
// Permanently mask mainline NCEs and TCEs.
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
}
@@ -888,7 +919,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
io_sc.service_data->setServiceCall();
// Permanently mask mainline NCEs and TCEs.
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true;
+ __maskMainlineNceTces<T>( iv_chip );
}
// If analysis resulted in a false alarm.
@@ -903,18 +934,18 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
}
// Write any updates to VPD.
- o_rc = setBadDqBitmap( mcaTrgt, iv_rank, dqBitmap );
+ o_rc = setBadDqBitmap( trgt, iv_rank, dqBitmap );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "setBadDqBitmap(0x%08x, 0x%02x) failed",
- getHuid(mcaTrgt), iv_rank.getKey() );
+ getHuid(trgt), iv_rank.getKey() );
break;
}
// We may have placed a chip mark so do any necessary cleanup. This must
// be called after writing the bad DQ bitmap because this function
// will also write it if necessary.
- o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( iv_chip, iv_rank, io_sc );
+ o_rc = MarkStore::chipMarkCleanup<T>( iv_chip, iv_rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "MarkStore::chipMarkCleanup(0x%08x,0x%02x) "
@@ -929,6 +960,15 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
#undef PRDF_FUNC
}
+template
+uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount,
+ CeCount i_badChipCount, CeCount i_sumAboveOneCount,
+ CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeCeSymbolCounts( CeCount i_badDqCount,
+ CeCount i_badChipCount, CeCount i_sumAboveOneCount,
+ CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc );
+
//------------------------------------------------------------------------------
template<>
@@ -1031,11 +1071,110 @@ uint32_t TpsEvent<TYPE_MCA>::getSymbolCeCounts( CeCount & io_badDqCount,
//------------------------------------------------------------------------------
-template <>
-uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<>
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::getSymbolCeCounts( CeCount & io_badDqCount,
+ CeCount & io_badChipCount, CeCount & io_sumAboveOneCount,
+ CeCount & io_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[TpsEvent<TYPE_OCMB_CHIP>::getSymbolCeCounts] "
+ /* Collects per-symbol CE counts for iv_rank from the registers named in
+ * ocmbCeStatReg, one nibble of symbols at a time. Each nibble is handed to
+ * __analyzeNibbleSyms() together with the four io_* counters, and every
+ * symbol with a non-zero count is added to the callout list in io_sc.
+ * Returns non-SUCCESS if getBadDqBitmap(), a register Read(), or
+ * MemSymbol::fromSymbol() fails; SUCCESS otherwise.
+ */
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ // Get the Bad DQ Bitmap.
+ TargetHandle_t ocmbTrgt = iv_chip->getTrgt();
+ MemDqBitmap dqBitmap;
+
+ o_rc = getBadDqBitmap( ocmbTrgt, iv_rank, dqBitmap );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x,%d) failed",
+ getHuid(ocmbTrgt), iv_rank.getMaster() );
+ break;
+ }
+ std::vector<MemSymbol> bmSymList = dqBitmap.getSymbolList();
+
+ const char * reg_str = nullptr;
+ SCAN_COMM_REGISTER_CLASS * reg = nullptr;
+
+ for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_PORT; regIdx++ )
+ {
+ reg_str = ocmbCeStatReg[regIdx];
+ reg = iv_chip->getRegister( reg_str );
+
+ o_rc = reg->Read();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "Read() failed on %s.", reg_str );
+ break; // leaves the register loop; o_rc is returned as-is
+ }
+ uint8_t baseSymbol = SYMBOLS_PER_CE_REG * regIdx; // first symbol in reg
+
+ for ( uint8_t i = 0; i < SYMBOLS_PER_CE_REG;
+ i += MEM_SYMBOLS_PER_NIBBLE )
+ {
+ MemUtils::MaintSymbols nibbleStats;
+
+ // Get a nibble's worth of symbols.
+ for ( uint8_t n = 0; n < MEM_SYMBOLS_PER_NIBBLE; n++ )
+ {
+ uint8_t sym = baseSymbol + (i+n);
+ PRDF_ASSERT( sym < SYMBOLS_PER_RANK );
+
+ MemUtils::SymbolData symData;
+ symData.symbol = MemSymbol::fromSymbol( ocmbTrgt, iv_rank,
+ sym, CEN_SYMBOL::ODD_SYMBOL_DQ );
+ if ( !symData.symbol.isValid() )
+ {
+ PRDF_ERR( PRDF_FUNC "MemSymbol() failed: symbol=%d",
+ sym );
+ o_rc = FAIL;
+ break; // leaves the symbol loop; o_rc is rechecked below
+ }
+
+ // Any symbol set in the DRAM repairs VPD will have an
+ // automatic CE count of 0xFF
+ if ( std::find( bmSymList.begin(), bmSymList.end(),
+ symData.symbol ) != bmSymList.end() )
+ symData.count = 0xFF;
+ else
+ symData.count = reg->GetBitFieldJustified(((i+n)*8), 8);
+
+ nibbleStats.push_back( symData );
+
+ // Add all symbols with non-zero counts to the callout list.
+ if ( symData.count != 0 )
+ {
+ MemoryMru mm { ocmbTrgt, iv_rank, symData.symbol };
+ io_sc.service_data->SetCallout( mm );
+ }
+ }
+ if ( SUCCESS != o_rc ) break;
+
+ // Analyze the nibble of symbols.
+ __analyzeNibbleSyms<TYPE_OCMB_CHIP>( nibbleStats, io_badDqCount,
+ io_badChipCount, io_sumAboveOneCount, io_singleSymCount );
+
+ }
+ if ( SUCCESS != o_rc ) break;
+ }
+
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+
+}
+
+//------------------------------------------------------------------------------
+
+template <TARGETING::TYPE T>
+uint32_t TpsEvent<T>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[TpsEvent<TYPE_MCA>::analyzeCeStats] "
+ #define PRDF_FUNC "[TpsEvent<T>::analyzeCeStats] "
uint32_t o_rc = SUCCESS;
@@ -1086,11 +1225,18 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc,
}
+template
+uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+template
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeCeStats(STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done);
+
//------------------------------------------------------------------------------
-template<>
-uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<TARGETING::TYPE T>
+uint32_t TpsEvent<T>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
#define PRDF_FUNC "[TpsEvent::analyzePhase] "
@@ -1102,11 +1248,11 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
// Analyze Ecc Attentions
uint32_t eccAttns;
- o_rc = checkEccFirs<TYPE_MCA>( iv_chip, eccAttns );
+ o_rc = checkEccFirs<T>( iv_chip, eccAttns );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "checkEccFirs(0x%08x) failed",
- iv_chip->getHuid() );
+ iv_chip->getHuid() );
break;
}
@@ -1135,7 +1281,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
if ( (SUCCESS == o_rc) && o_done )
{
// Clear the ECC FFDC for this slave rank.
- MemDbUtils::resetEccFfdc<TYPE_MCA>( iv_chip, iv_rank, SLAVE_RANK );
+ MemDbUtils::resetEccFfdc<T>( iv_chip, iv_rank, SLAVE_RANK );
}
return o_rc;
@@ -1143,6 +1289,36 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
#undef PRDF_FUNC
}
+template
+uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+template
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+
+//------------------------------------------------------------------------------
+/**
+ * @brief Moves TPS to its next phase and starts the command for it.
+ *
+ * Calls __getNextPhase() with iv_phase and a local signature, traces the phase
+ * being started, adds that signature to the signature list for this chip, and
+ * then calls startCmd().
+ * NOTE(review): __getNextPhase() presumably updates iv_phase and fills in the
+ * signature; its definition is not shown here, confirm there.
+ *
+ * @param io_sc The step code data struct.
+ * @return Whatever startCmd() returns.
+ */
+template<TARGETING::TYPE T>
+uint32_t TpsEvent<T>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
+{
+ uint32_t signature = 0;
+
+ __getNextPhase<T>( iv_chip, iv_rank, io_sc, iv_phase, signature );
+
+ PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x",
+ iv_phase, iv_chip->getHuid(), getKey() );
+
+ io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature );
+
+ return startCmd();
+}
+
+template
+uint32_t TpsEvent<TYPE_MCA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::startNextPhase(
+ STEP_CODE_DATA_STRUCT & io_sc );
+
//##############################################################################
//
// Specializations for MCA
@@ -1156,13 +1332,15 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd()
uint32_t o_rc = SUCCESS;
+ #ifndef CONFIG_AXONE
+
// We don't need to set any stop-on-error conditions or thresholds for
// soft/inter/hard CEs at runtime. The design is to let the command continue
// to the end of the rank and we do diagnostics on the CE counts found in
// the per-symbol counters. Therefore, all we need to do is tell the
// hardware which CE types to count.
- mss::mcbist::stop_conditions stopCond;
+ mss::mcbist::stop_conditions<mss::mc_type::NIMBUS> stopCond;
switch ( iv_phase )
{
@@ -1190,26 +1368,67 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd()
iv_chip->getHuid(), getKey() );
}
+ #endif
+
return o_rc;
#undef PRDF_FUNC
}
-//------------------------------------------------------------------------------
+//##############################################################################
+//
+// Specializations for OCMB
+//
+//##############################################################################
template<>
-uint32_t TpsEvent<TYPE_MCA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc )
+uint32_t TpsEvent<TYPE_OCMB_CHIP>::startCmd()
{
- uint32_t signature = 0;
+ #define PRDF_FUNC "[TpsEvent::startCmd] "
- __getNextPhase<TYPE_MCA>( iv_chip, iv_rank, io_sc, iv_phase, signature );
+ uint32_t o_rc = SUCCESS;
- PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x",
- iv_phase, iv_chip->getHuid(), getKey() );
+ #ifdef CONFIG_AXONE
- io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature );
+ // We don't need to set any stop-on-error conditions or thresholds for
+ // soft/inter/hard CEs at runtime. The design is to let the command continue
+ // to the end of the rank and we do diagnostics on the CE counts found in
+ // the per-symbol counters. Therefore, all we need to do is tell the
+ // hardware which CE types to count.
- return startCmd();
+ mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond;
+
+ switch ( iv_phase )
+ {
+ case TD_PHASE_1:
+ // Set the per symbol counters to count only hard CEs.
+ stopCond.set_nce_hard_symbol_count_enable(mss::ON);
+ break;
+
+ case TD_PHASE_2:
+ // Since there are not enough hard CEs to trigger a symbol mark, set
+ // the per symbol counters to count all CE types.
+ stopCond.set_nce_soft_symbol_count_enable( mss::ON);
+ stopCond.set_nce_inter_symbol_count_enable(mss::ON);
+ stopCond.set_nce_hard_symbol_count_enable( mss::ON);
+ break;
+
+ default: PRDF_ASSERT( false ); // invalid phase
+ }
+
+ // Start the time based scrub procedure on this slave rank.
+ o_rc = startTdScrub<TYPE_OCMB_CHIP>(iv_chip, iv_rank, SLAVE_RANK, stopCond);
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed",
+ iv_chip->getHuid(), getKey() );
+ }
+
+ #endif
+
+ return o_rc;
+
+ #undef PRDF_FUNC
}
//##############################################################################
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C
index 8c3c4480a..784306baf 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2018 */
+/* Contributors Listed Below - COPYRIGHT 2018,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -30,6 +30,8 @@
// Platform includes
#include <prdfCenMbaExtraSig.H>
+#include <hwp_wrappers.H>
+
using namespace TARGETING;
namespace PRDF
@@ -39,41 +41,16 @@ using namespace PlatServices;
//##############################################################################
//
-// Specializations for MCA
+// Generic Specializations
//
//##############################################################################
-template<>
-uint32_t VcmEvent<TYPE_MCA>::startCmd()
+template<TARGETING::TYPE T>
+uint32_t VcmEvent<T>::handlePhaseComplete( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[VcmEvent::startCmd] "
-
- uint32_t o_rc = SUCCESS;
-
- // No stop conditions.
- mss::mcbist::stop_conditions stopCond;
-
- // Start the time based scrub procedure on this master rank.
- o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, MASTER_RANK, stopCond );
- if ( SUCCESS != o_rc )
- {
- PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed",
- iv_chip->getHuid(), getKey() );
- }
-
- return o_rc;
-
- #undef PRDF_FUNC
-}
-
-//------------------------------------------------------------------------------
-
-template<>
-uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
-{
- #define PRDF_FUNC "[VcmEvent<TYPE_MCA>::handlePhaseComplete] "
+ #define PRDF_FUNC "[VcmEvent<T>::handlePhaseComplete] "
uint32_t o_rc = SUCCESS;
@@ -100,6 +77,49 @@ uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns,
#undef PRDF_FUNC
}
+template
+uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+template
+uint32_t VcmEvent<TYPE_OCMB_CHIP>::handlePhaseComplete(
+ const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+
+//##############################################################################
+//
+// Specializations for MCA
+//
+//##############################################################################
+
+template<>
+uint32_t VcmEvent<TYPE_MCA>::startCmd()
+{
+ #define PRDF_FUNC "[VcmEvent::startCmd] "
+
+ uint32_t o_rc = SUCCESS;
+
+ #ifndef CONFIG_AXONE
+
+ // No stop conditions.
+ mss::mcbist::stop_conditions<mss::mc_type::NIMBUS> stopCond;
+
+ // Start the time based scrub procedure on this master rank.
+ o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, MASTER_RANK, stopCond );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed",
+ iv_chip->getHuid(), getKey() );
+ }
+
+ #endif
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
//##############################################################################
//
// Specializations for MBA
@@ -448,6 +468,40 @@ uint32_t VcmEvent<TYPE_MBA>::handlePhaseComplete( const uint32_t & i_eccAttns,
#undef PRDF_FUNC
}
+//##############################################################################
+//
+// Specializations for OCMB
+//
+//##############################################################################
+
+template<>
+uint32_t VcmEvent<TYPE_OCMB_CHIP>::startCmd()
+{
+ #define PRDF_FUNC "[VcmEvent::startCmd] "
+
+ uint32_t o_rc = SUCCESS;
+
+ #ifdef CONFIG_AXONE
+
+ // No stop conditions.
+ mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond;
+
+ // Start the time based scrub procedure on this master rank.
+ o_rc = startTdScrub<TYPE_OCMB_CHIP>( iv_chip, iv_rank, MASTER_RANK,
+ stopCond );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed",
+ iv_chip->getHuid(), getKey() );
+ }
+
+ #endif
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
//------------------------------------------------------------------------------
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H
index b319f910b..c712d6aa3 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -342,6 +342,9 @@ class VcmEvent : public TdEntry
#ifdef __HOSTBOOT_RUNTIME
template<>
uint32_t VcmEvent<TARGETING::TYPE_MCA>::cleanup(STEP_CODE_DATA_STRUCT & io_sc);
+template<>
+uint32_t VcmEvent<TARGETING::TYPE_OCMB_CHIP>::cleanup(
+ STEP_CODE_DATA_STRUCT & io_sc);
#endif
template<>
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C
index 26ef1d727..5ffa9a84b 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -92,6 +92,12 @@ bool __iueCheck<TYPE_MCA>( uint32_t i_eccAttns )
}
template<> inline
+bool __iueCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns )
+{
+ return ( 0 != (i_eccAttns & MAINT_IUE) );
+}
+
+template<> inline
bool __iueCheck<TYPE_MBA>( uint32_t i_eccAttns )
{
// IUES are reported via RCE ETE on Centaur
@@ -218,6 +224,7 @@ uint32_t VcmEvent<TYPE_MBA>::startCmd()
// Avoid linker errors with the template.
template class VcmEvent<TYPE_MCA>;
template class VcmEvent<TYPE_MBA>;
+template class VcmEvent<TYPE_OCMB_CHIP>;
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C
index ca4de8e5a..e64227996 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -55,6 +55,12 @@ VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip )
}
template<>
+VcmFalseAlarm * __getFalseAlarmCounter<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip)
+{
+ return getOcmbDataBundle(i_chip)->getVcmFalseAlarmCounter();
+}
+
+template<>
VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip )
{
return getMbaDataBundle(i_chip)->getVcmFalseAlarmCounter();
@@ -62,16 +68,16 @@ VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip )
//##############################################################################
//
-// Specializations for MCA
+// Generic Specializations
//
//##############################################################################
-template<>
-uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
- STEP_CODE_DATA_STRUCT & io_sc,
- bool & o_done )
+template<TARGETING::TYPE T>
+uint32_t VcmEvent<T>::checkEcc( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done )
{
- #define PRDF_FUNC "[VcmEvent<TYPE_MCA>::checkEcc] "
+ #define PRDF_FUNC "[VcmEvent<T>::checkEcc] "
uint32_t o_rc = SUCCESS;
@@ -88,7 +94,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
// At this point we don't actually have an address for the UE. The
// best we can do is get the address in which the command stopped.
MemAddr addr;
- o_rc = getMemMaintAddr<TYPE_MCA>( iv_chip, addr );
+ o_rc = getMemMaintAddr<T>( iv_chip, addr );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed",
@@ -96,7 +102,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
break;
}
- o_rc = MemEcc::handleMemUe<TYPE_MCA>( iv_chip, addr,
+ o_rc = MemEcc::handleMemUe<T>( iv_chip, addr,
UE_TABLE::SCRUB_UE, io_sc );
if ( SUCCESS != o_rc )
{
@@ -107,7 +113,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
// Because of the UE, any further TPS requests will likely have no
// effect. So ban all subsequent requests.
- MemDbUtils::banTps<TYPE_MCA>( iv_chip, addr.getRank() );
+ MemDbUtils::banTps<T>( iv_chip, addr.getRank() );
// Leave the mark in place and abort this procedure.
o_done = true; break;
@@ -118,7 +124,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_MaintIUE );
- o_rc = MemEcc::handleMemIue<TYPE_MCA>( iv_chip, iv_rank, io_sc );
+ o_rc = MemEcc::handleMemIue<T>( iv_chip, iv_rank, io_sc );
if ( SUCCESS != o_rc )
{
PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,0x%02x) failed",
@@ -143,6 +149,14 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
#undef PRDF_FUNC
}
+template
+uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
+template
+uint32_t VcmEvent<TYPE_OCMB_CHIP>::checkEcc( const uint32_t & i_eccAttns,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ bool & o_done );
//------------------------------------------------------------------------------
@@ -180,6 +194,41 @@ uint32_t VcmEvent<TYPE_MCA>::cleanup( STEP_CODE_DATA_STRUCT & io_sc )
#undef PRDF_FUNC
}
+template<>
+uint32_t VcmEvent<TYPE_OCMB_CHIP>::cleanup( STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[VcmEvent::cleanup] "
+
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ o_rc = MarkStore::chipMarkCleanup<TYPE_OCMB_CHIP>( iv_chip, iv_rank,
+ io_sc );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed",
+ iv_chip->getHuid(), iv_rank.getKey() );
+ break;
+ }
+
+ // The cleanup() function is called by both verified() and falseAlarm().
+ // In either case, the error log should be predictive if there has been
+ // at least one false alarm on any DRAM on this rank other than this
+ // DRAM. This is required on Nimbus because of two symbol correction,
+ // which does not exist on Centaur.
+ VcmFalseAlarm * faCntr =__getFalseAlarmCounter<TYPE_OCMB_CHIP>(iv_chip);
+ uint8_t dram = iv_mark.getSymbol().getDram();
+ if ( faCntr->queryDrams(iv_rank, dram, io_sc) )
+ io_sc.service_data->setServiceCall();
+
+ } while (0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
//##############################################################################
//
// Specializations for MBA
@@ -386,6 +435,7 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
// Avoid linker errors with the template.
template class VcmEvent<TYPE_MCA>;
template class VcmEvent<TYPE_MBA>;
+template class VcmEvent<TYPE_OCMB_CHIP>;
//------------------------------------------------------------------------------
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
index 5f7efa274..fac29fce3 100644
--- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
+++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
@@ -27,7 +27,6 @@
#include <iipServiceDataCollector.h>
#include <prdfExtensibleChip.H>
#include <prdfPluginMap.H>
-#include <isteps/nvdimm/nvdimm.H>
// Platform includes
#include <prdfMemDbUtils.H>
@@ -38,6 +37,10 @@
#include <prdfMemTps.H>
#endif
+#ifdef CONFIG_NVDIMM
+ #include <nvdimm.H>
+#endif
+
using namespace TARGETING;
namespace PRDF
@@ -296,18 +299,9 @@ PRDF_PLUGIN_DEFINE( nimbus_mca, MemPortFailure );
//
//##############################################################################
+#ifdef CONFIG_NVDIMM
#ifdef __HOSTBOOT_RUNTIME
-enum nvdimmRegOffset
-{
- NVDIMM_MGT_CMD1 = 0x041,
- MODULE_HEALTH = 0x0A0,
- MODULE_HEALTH_STATUS0 = 0x0A1,
- MODULE_HEALTH_STATUS1 = 0x0A2,
- ERROR_THRESHOLD_STATUS = 0x0A5,
- WARNING_THRESHOLD_STATUS = 0x0A7,
-};
-
/**
* @brief Gets a map list of which bits are set from a uint8_t bit list (7:0)
* @param i_data uint8_t bit list (7:0)
@@ -349,6 +343,7 @@ uint32_t __addBpmCallout( TargetHandle_t i_dimm,
break;
}
+ // addPartCallout will default to GARD_NULL, NO_DECONFIG
mainErrl->addPartCallout( i_dimm, HWAS::BPM_PART_TYPE,
i_priority );
@@ -362,10 +357,12 @@ uint32_t __addBpmCallout( TargetHandle_t i_dimm,
/**
* @brief Adds a callout of the cable connecting an NVDIMM to its
* backup power module (BPM)
+ * @param i_dimm The target dimm.
* @param i_priority The callout priority.
* @return FAIL if unable to get the global error log, else SUCCESS
*/
-uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority )
+uint32_t __addNvdimmCableCallout( TargetHandle_t i_dimm,
+ HWAS::callOutPriority i_priority )
{
#define PRDF_FUNC "[__addNvdimmCableCallout] "
@@ -382,7 +379,9 @@ uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority )
break;
}
- mainErrl->addProcedureCallout( HWAS::EPUB_PRC_NVDIMM_ERR, i_priority );
+ // addPartCallout will default to GARD_NULL, NO_DECONFIG
+ mainErrl->addPartCallout( i_dimm, HWAS::BPM_CABLE_PART_TYPE,
+ i_priority );
}while(0);
@@ -391,21 +390,45 @@ uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority )
#undef PRDF_FUNC
}
+/**
+ * @brief If a previous error has been found, add a signature to the
+ * multi-signature list, else set the primary signature.
+ * @param io_sc The step code data struct.
+ * @param i_trgt The target.
+ * @param i_errFound Whether an error has already been found or not.
+ * @param i_sig The signature to be set.
+ */
+void __addSignature( STEP_CODE_DATA_STRUCT & io_sc, TargetHandle_t i_trgt,
+ bool i_errFound, uint32_t i_sig )
+{
+ if ( i_errFound )
+ {
+ io_sc.service_data->AddSignatureList( i_trgt, i_sig );
+ }
+ else
+ {
+ io_sc.service_data->setSignature( getHuid(i_trgt), i_sig );
+ }
+}
/**
* @brief Analyze NVDIMM Health Status0 Register for errors
- * @param io_sc The step code data struct.
- * @param i_dimm The target dimm.
+ * @param io_sc The step code data struct.
+ * @param i_dimm The target dimm.
+ * @param io_errFound Whether an error has already been found or not.
* @return FAIL if unable to read register, else SUCCESS
*/
-uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc,
- TargetHandle_t i_dimm )
+uint32_t __analyzeHealthStatus0Reg(STEP_CODE_DATA_STRUCT & io_sc,
+ TargetHandle_t i_dimm, bool & io_errFound)
{
#define PRDF_FUNC "[__analyzeHealthStatus0Reg] "
uint32_t o_rc = SUCCESS;
uint8_t data = 0;
+ // Get MCA, for signatures
+ TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA );
+
do
{
// NVDIMM health status registers size = 1 byte
@@ -413,7 +436,7 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc,
// Read the Health Status0 Register (0xA1) 7:0
errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH_STATUS0) );
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH_STATUS0) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to read Health Status0 Register. "
@@ -427,58 +450,66 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc,
// BIT 0: Voltage Regulator Fail
if ( bitList.count(0) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VoltRegFail );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_VoltRegFail );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 1: VDD Lost
if ( bitList.count(1) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VddLost );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_VddLost );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 2: VPP Lost
if ( bitList.count(2) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VppLost );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_VppLost );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 3: VTT Lost
if ( bitList.count(3) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VttLost );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_VttLost );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 4: DRAM not Self Refresh
if ( bitList.count(4) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NotSelfRefr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotSelfRefr );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 5: Controller HW Error
if ( bitList.count(5) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_CtrlHwErr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_CtrlHwErr );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 6: NVM Controller Error
if ( bitList.count(6) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NvmCtrlErr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmCtrlErr );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 7: NVM Lifetime Error
if ( bitList.count(7) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NvmLifeErr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeErr );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
}while(0);
@@ -491,18 +522,22 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc,
/**
* @brief Analyze NVDIMM Health Status1 Register for errors
- * @param io_sc The step code data struct.
- * @param i_dimm The target dimm.
+ * @param io_sc The step code data struct.
+ * @param i_dimm The target dimm.
+ * @param io_errFound Whether an error has already been found or not.
* @return FAIL if unable to read register, else SUCCESS
*/
uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc,
- TargetHandle_t i_dimm )
+ TargetHandle_t i_dimm, bool & io_errFound )
{
#define PRDF_FUNC "[__analyzeHealthStatus1Reg] "
uint32_t o_rc = SUCCESS;
uint8_t data = 0;
+ // Get MCA, for signatures
+ TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA );
+
do
{
// NVDIMM health status registers size = 1 byte
@@ -510,7 +545,7 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc,
// Read the Health Status1 Register (0xA2) 7:0
errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH_STATUS1) );
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH_STATUS1) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to read Health Status1 Register. "
@@ -524,83 +559,90 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc,
// BIT 0: Insufficient Energy
if ( bitList.count(0) )
{
- io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_InsuffEnergy);
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_InsuffEnergy );
// Callout BPM (backup power module) high, cable high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
- o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH );
+ o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ io_errFound = true;
}
// BIT 1: Invalid Firmware
if ( bitList.count(1) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_InvFwErr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_InvFwErr );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 2: Configuration Data Error
if ( bitList.count(2) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_CnfgDataErr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_CnfgDataErr );
// Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD );
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ io_errFound = true;
}
// BIT 3: No Energy Source
if ( bitList.count(3) )
{
- io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_NoEsPres);
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NoEsPres );
// Callout BPM (backup power module) high, cable high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
- o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH );
+ o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ io_errFound = true;
}
// BIT 4: Energy Policy Not Set
if ( bitList.count(4) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_EsPolNotSet );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsPolNotSet );
// Callout FW (Level2 Support) High
io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD );
// Callout NVDIMM low on 1st, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ io_errFound = true;
}
// BIT 5: Energy Source HW Error
if ( bitList.count(5) )
{
- io_sc.service_data->AddSignatureList ( i_dimm, PRDFSIG_EsHwFail );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsHwFail );
// Callout BPM (backup power module) high, cable high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
- o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH );
+ o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ io_errFound = true;
}
// BIT 6: Energy Source Health Assessment Error
if ( bitList.count(6) )
{
- io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_EsHlthAssess);
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsHlthAssess);
// Callout BPM (backup power module) high, cable high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
- o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH );
+ o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ io_errFound = true;
}
// BIT 7: Reserved
@@ -613,18 +655,105 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc,
}
/**
+ * @brief Reads and merges the data from two ES_TEMP registers to get the
+ * correct temperature format.
+ * @param i_dimm The target nvdimm.
+ * @param i_tempMsbReg The address of the register that contains the most
+ * significant byte of the temperature data.
+ * @param i_tempLsbReg The address of the register that contains the least
+ * significant byte of the temperature data.
+ * @param o_tempData The 16 bit temperature data.
+ * @return FAIL if unable to read register, else SUCCESS
+ */
+uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
+ uint16_t i_tempLsbReg, uint16_t & o_tempData )
+{
+ #define PRDF_FUNC "[__readTemp] "
+
+ /*
+ * -NOTE: Example showing how to read the temperature format:
+ * ES_TEMP1 = 0x03 (MSB: bits 15-8)
+ * ES_TEMP0 = 0x48 (LSB: bits 7-0)
+ *
+ * 0x0348 = 0000 0011 0100 1000 = 52.5 C
+ *
+ * -NOTE: bit definition:
+ * [15:13] Reserved
+ * [12] Sign: 0 = positive, 1 = negative; 0°C should be expressed as positive
+ * [11] 128°C
+ * [10] 64°C
+ * [9] 32°C
+ * [8] 16°C
+ * [7] 8°C
+ * [6] 4°C
+ * [5] 2°C
+ * [4] 1°C
+ * [3] 0.5°C
+ * [2] 0.25°C
+ * [1] 0.125°C Optional for temp fields; not used for temp th fields
+ * [0] 0.0625°C Optional for temp fields; not used for temp th fields
+ */
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ // Each NVDIMM temperature register read here is 1 byte
+ size_t NVDIMM_SIZE = 1;
+ uint8_t msbData = 0;
+ uint8_t lsbData = 0;
+
+ // Read the two inputted temperature registers.
+ errlHndl_t errl = deviceRead( i_dimm, &msbData, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(i_tempMsbReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature MSB Register. "
+ "HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ errl = deviceRead( i_dimm, &lsbData, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(i_tempLsbReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature LSB Register. "
+ "HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ o_tempData = ((uint16_t)msbData << 8) | lsbData;
+
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+/**
* @brief Analyze NVDIMM Error Threshold Status Register for errors
- * @param io_sc The step code data struct.
- * @param i_dimm The target dimm.
+ * @param io_sc The step code data struct.
+ * @param i_dimm The target dimm.
+ * @param io_errFound Whether an error has already been found or not.
+ * @param o_esTempErr A flag for whether we hit an ES TEMP error or not.
* @return FAIL if unable to read register, else SUCCESS
*/
uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
- TargetHandle_t i_dimm )
+ TargetHandle_t i_dimm, bool & io_errFound,
+ bool & o_esTempErr )
{
#define PRDF_FUNC "[__analyzeErrorThrStatusReg] "
uint32_t o_rc = SUCCESS;
uint8_t data = 0;
+ o_esTempErr = false;
+
+ // Get MCA, for signatures
+ TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA );
do
{
@@ -633,7 +762,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// Read the Error Threshold Status Register (0xA5) 7:0
errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(ERROR_THRESHOLD_STATUS) );
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::ERROR_THRESHOLD_STATUS) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Status Reg. "
@@ -648,7 +777,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// BIT 1: ES Lifetime Error
if ( bitList.count(1) )
{
- io_sc.service_data->AddSignatureList ( i_dimm, PRDFSIG_EsLifeErr );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsLifeErr );
// Callout BPM (backup power module) high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
@@ -656,11 +785,60 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ io_errFound = true;
}
// BIT 2: ES Temperature Error
if ( bitList.count(2) )
{
- io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_EsTmpErr );
+ // Sleep two seconds to avoid exiting PRD analysis faster than the
+ // ES_TEMP sample rate.
+ PlatServices::milliSleep( 2, 0 );
+
+ // Read ES_TEMP and the ES_TEMP_ERROR_{HIGH,LOW}_THRESHOLD values
+ uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1;
+ uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0;
+ uint16_t esTemp = 0;
+ o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp );
+ if ( SUCCESS != o_rc ) break;
+
+ uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD1;
+ uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD0;
+ uint16_t esTempHighTh = 0;
+ o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh );
+ if ( SUCCESS != o_rc ) break;
+
+ msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD1;
+ lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD0;
+ uint16_t esTempLowTh = 0;
+ o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh );
+ if ( SUCCESS != o_rc ) break;
+
+ // Check to see if the ES_TEMP is negative (bit 12)
+ bool esTempNeg = false;
+ if ( esTemp & 0x1000 ) esTempNeg = true;
+
+ // Check if ES_TEMP is at or above ES_TEMP_ERROR_HIGH_THRESHOLD.
+ // Just in case ES_TEMP has moved before we read it out, we'll allow
+ // a 2°C margin (0x0020) when comparing to the threshold.
+ if ( (esTemp >= (esTempHighTh - 0x0020)) && !esTempNeg )
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpErrHigh );
+ }
+ // Else check if the error hit the low threshold, again with the
+ // same 2°C margin.
+ else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg )
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpErrLow );
+ }
+ // Else the temperature must have gone back to a normal value, so
+ // we will label this as a false alarm case.
+ else
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpErrFa );
+ }
// Callout BPM (backup power module) high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
@@ -668,6 +846,9 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+
+ o_esTempErr = true;
+ io_errFound = true;
}
// BIT 3:7: Reserved
@@ -680,6 +861,419 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
}
/**
+ * @brief Adjusts the warning threshold so that future warnings are allowed
+ * to report.
+ * @param io_sc The step code data struct.
+ * @param i_dimm The target nvdimm.
+ * @param i_warnThReg The address of the relevant warning threshold register.
+ * @param i_errThReg The address of the relevant error threshold register.
+ * @param o_firstWarn Flag if this is the first warning of this type.
+ * @param o_statusErr Flag to tell if we found an error from checking the
+ * notification status register.
+ * @return FAIL if unable to read/write a register, else SUCCESS
+ */
+uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
+ TargetHandle_t i_dimm, uint16_t i_warnThReg,
+ uint16_t i_errThReg, bool & o_firstWarn,
+ bool & o_statusErr )
+{
+ #define PRDF_FUNC "[__adjustThreshold] "
+
+ uint32_t o_rc = SUCCESS;
+ uint16_t notifCmdReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD;
+ uint16_t notifStatusReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_STATUS;
+ o_firstWarn = false;
+ o_statusErr = false;
+
+ do
+ {
+ // NVDIMM health status registers size = 1 byte
+ size_t NVDIMM_SIZE = 1;
+
+ // Read the corresponding warning threshold
+ uint8_t warnTh = 0;
+ errlHndl_t errl = deviceRead( i_dimm, &warnTh, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(i_warnThReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Reg. HUID: "
+ "0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ // Read the corresponding error threshold
+ uint8_t errTh = 0;
+ errl = deviceRead( i_dimm, &errTh, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(i_errThReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Reg. HUID: "
+ "0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ // If the warning threshold is not set to the error threshold+1,
+ // move the threshold.
+ if ( warnTh != (errTh+1) )
+ {
+ o_firstWarn = true;
+
+ // SET_EVENT_NOTIFICATION_CMD is a write only register that is
+ // used to change the SET_EVENT_NOTIFICATION_STATUS register.
+ // The only bits within it that are used are bits 0 and 1, as such
+ // we can safely set the rest to 0. It is defined as:
+ // [0]: Persistency Notification
+ // [1]: Warning Threshold Notification
+ // [2]: Obsolete
+ // [3]: Firmware Activation Notification (Not Used)
+ // [4:7]: Reserved
+
+ // Clear SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set
+ uint8_t notifCmd = 0x01;
+ errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifCmdReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification "
+ "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ // Check SET_EVENT_NOTIFICATION_STATUS to ensure everything is set
+ // as we expect and we don't see any errors.
+ uint8_t notifStat = 0;
+ errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifStatusReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
+ "Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+ std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat );
+
+ // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1
+ // or Bit [2]: PERSISTENCY_ENABLED = 0
+ // or Bit [3]: WARNING_THRESHOLD_ENABLED = 1
+ if ( bitList.count(1) || !bitList.count(2) || bitList.count(3) )
+ {
+ o_statusErr = true;
+
+ // Make the log predictive and mask the fir
+ io_sc.service_data->SetThresholdMaskId(0);
+
+ // Callout the NVDIMM, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+
+ // Send message to PHYP that save/restore may work
+ o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != o_rc ) break;
+
+ break;
+ }
+
+
+ // Set the warning threshold to error threshold + 1
+ warnTh = errTh+1;
+ errl = deviceWrite( i_dimm, &warnTh, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(i_warnThReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to write Warning Threshold Reg. "
+ "HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ // Set SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set
+ notifCmd = 0x03;
+ errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifCmdReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification "
+ "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+
+ // Recheck SET_EVENT_NOTIFICATION_STATUS to ensure everything is set
+ // as we expect and we don't see any errors.
+ errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifStatusReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
+ "Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+ bitList = __nvdimmGetActiveBits( notifStat );
+
+ // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1
+ // or Bit [2]: PERSISTENCY_ENABLED = 0
+ // or Bit [3]: WARNING_THRESHOLD_ENABLED = 0
+ if ( bitList.count(1) || !bitList.count(2) || !bitList.count(3) )
+ {
+ o_statusErr = true;
+
+ // Make the log predictive and mask the fir
+ io_sc.service_data->SetThresholdMaskId(0);
+
+ // Callout the NVDIMM, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+
+ // Send message to PHYP that save/restore may work
+ o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != o_rc ) break;
+
+ break;
+ }
+ }
+ // Note: moving the threshold should clear the warning from
+ // WARNING_THRESHOLD_STATUS, which allows future warnings to report.
+
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+/**
+ * @brief Analyze NVDIMM Warning Threshold Status Register for errors
+ * @param io_sc The step code data struct.
+ * @param i_dimm The target dimm.
+ * @param io_errFound Whether an error has already been found or not.
+ * @return FAIL if unable to read register, else SUCCESS
+ */
+uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
+ TargetHandle_t i_dimm, bool & io_errFound)
+{
+ #define PRDF_FUNC "[__analyzeWarningThrStatusReg] "
+
+ uint32_t o_rc = SUCCESS;
+ uint8_t data = 0;
+
+ // Get MCA, for signatures
+ TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA );
+
+ do
+ {
+ // NVDIMM health status registers size = 1 byte
+ size_t NVDIMM_SIZE = 1;
+
+ // Read the Warning Threshold Status Register (0xA7) 7:0
+ errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::WARNING_THRESHOLD_STATUS) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Status Reg. "
+ "HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+ std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
+
+ // Analyze Bit 2 First
+ // BIT 2: ES_TEMP_WARNING
+ if ( bitList.count(2) )
+ {
+ // Sleep two seconds to avoid exiting PRD analysis faster than the
+ // ES_TEMP sample rate.
+ PlatServices::milliSleep( 2, 0 );
+
+ // Read the ES_TEMP and ES_TEMP_WARNING_HIGH_THRESHOLD values
+ uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1;
+ uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0;
+ uint16_t esTemp = 0;
+ o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp );
+ if ( SUCCESS != o_rc ) break;
+
+ uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD1;
+ uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD0;
+ uint16_t esTempHighTh = 0;
+ o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh );
+ if ( SUCCESS != o_rc ) break;
+
+ msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD1;
+ lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD0;
+ uint16_t esTempLowTh = 0;
+ o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh );
+ if ( SUCCESS != o_rc ) break;
+
+ // Check to see if the ES_TEMP is negative (bit 12)
+ bool esTempNeg = false;
+ if ( esTemp & 0x1000 ) esTempNeg = true;
+
+ // Check if ES_TEMP is equal to or above ES_TEMP_WARNING_HIGH_THRESHOLD.
+ // Just in case ES_TEMP has moved before we read it out, we'll allow
+ // a 2°C margin when comparing to the threshold.
+ if ( (esTemp >= (esTempHighTh - 0x0020)) && !esTempNeg )
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpWarnHigh );
+ }
+ // Else check if the warning hit the low threshold, again with the
+ // same 2°C margin.
+ else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg )
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpWarnLow );
+ }
+ // Else the temperature must have gone back to a normal value, so
+ // we will label this as a false alarm case.
+ else
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpWarnFa );
+ }
+
+ // Callout BPM (backup power module) high
+ o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
+ if ( SUCCESS != o_rc ) break;
+
+ // Callout NVDIMM low, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+
+ // Because of the possibility of intermittent ES temperature
+ // false alarm readings, we will keep the log hidden. If there is
+ // an actual ES temperature problem, we assume we will continue
+ // to be called to handle the temperature warning and hit threshold.
+
+ // Only send the save/restore message to PHYP if we hit threshold.
+ if ( io_sc.service_data->IsAtThreshold() )
+ {
+ // Send message to PHYP that save/restore may work
+ o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != o_rc ) break;
+ }
+
+ io_errFound = true;
+ }
+ // BIT 0: NVM_LIFETIME_WARNING
+ if ( bitList.count(0) )
+ {
+ // Adjust warning threshold.
+ uint16_t warnThReg = NVDIMM::i2cReg::NVM_LIFETIME_WARNING_THRESHOLD;
+ uint16_t errThReg = NVDIMM::i2cReg::NVM_LIFETIME_ERROR_THRESHOLD;
+ bool firstWarn = false;
+ bool statusErr = false;
+ o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg,
+ firstWarn, statusErr );
+ if ( SUCCESS != o_rc ) break;
+
+ // Make the log predictive, but do not mask the FIR
+ io_sc.service_data->setServiceCall();
+
+ // If we got a set event notification status error, add the
+ // signature for that before adding the signature for the warning.
+ // Also do not take our normal callout action since we already will
+ // have called out the NVDIMM because of the status error.
+ if ( statusErr )
+ {
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr );
+
+ // Need to set io_errFound here so the warning signature is
+ // added to the multi-signature list instead of as the primary
+ // signature.
+ io_errFound = true;
+ }
+ else
+ {
+ // Callout NVDIMM on 1st, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ }
+
+ // Update signature depending on whether this is the first or second
+ // warning of this type.
+ if ( firstWarn )
+ {
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn1 );
+ }
+ else
+ {
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn2 );
+ }
+
+
+ io_errFound = true;
+ }
+ // BIT 1: ES_LIFETIME_WARNING
+ if ( bitList.count(1) )
+ {
+ // Adjust warning threshold.
+ uint16_t warnThReg = NVDIMM::i2cReg::ES_LIFETIME_WARNING_THRESHOLD;
+ uint16_t errThReg = NVDIMM::i2cReg::ES_LIFETIME_ERROR_THRESHOLD;
+ bool firstWarn = false;
+ bool statusErr = false;
+ o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg,
+ firstWarn, statusErr );
+ if ( SUCCESS != o_rc ) break;
+
+ // Make the log predictive, but do not mask the FIR
+ io_sc.service_data->setServiceCall();
+
+ // If we got a set event notification status error, add the
+ // signature for that before adding the signature for the warning.
+ // Also do not take our normal callout action since we already will
+ // have called out the NVDIMM because of the status error.
+ if ( statusErr )
+ {
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr );
+
+ // Need to set io_errFound here so the warning signature is
+ // added to the multi-signature list instead of as the primary
+ // signature.
+ io_errFound = true;
+ }
+ else
+ {
+ // Callout BPM (backup power module) high
+ o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
+ if ( SUCCESS != o_rc ) break;
+
+ // Callout NVDIMM low, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ }
+
+ // Update signature depending on whether this is the first or second
+ // warning of this type.
+ if ( firstWarn )
+ {
+ __addSignature(io_sc, mca, io_errFound, PRDFSIG_EsLifeWarn1);
+ }
+ else
+ {
+ __addSignature(io_sc, mca, io_errFound, PRDFSIG_EsLifeWarn2);
+ }
+
+ io_errFound = true;
+ }
+
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+/**
* @brief De-assert the EVENT_N pin by setting bit 2 in NVDIMM_MGT_CMD1 (0x41)
* @param i_dimm The target dimm.
* @return FAIL if unable to read/write register, else SUCCESS
@@ -698,7 +1292,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
// Read the NVDIMM_MGT_CMD1 register (0x41) 7:0
errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(NVDIMM_MGT_CMD1) );
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::NVDIMM_MGT_CMD1) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to read NVDIMM_MGT_CMD1. "
@@ -713,7 +1307,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
// Write the updated data back to NVDIMM_MGT_CMD1
errl = deviceWrite( i_dimm, &data, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(NVDIMM_MGT_CMD1) );
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::NVDIMM_MGT_CMD1) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to write NVDIMM_MGT_CMD1. "
@@ -732,6 +1326,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
}
#endif // HOSTBOOT_RUNTIME
+#endif // CONFIG_NVDIMM
/**
* @brief MCACALFIR[8] - Error from NVDIMM health status registers
@@ -744,13 +1339,28 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
{
#define PRDF_FUNC "[nimbus_mca::AnalyzeNvdimmHealthStatRegs] "
+ #ifdef CONFIG_NVDIMM
#ifdef __HOSTBOOT_RUNTIME
uint32_t l_rc = SUCCESS;
+ bool errFound = false;
// We need to check both dimms for errors
for ( auto & dimm : getConnected(i_chip->getTrgt(), TYPE_DIMM) )
{
+ // Skip any non-NVDIMMs
+ if ( !isNVDIMM(dimm) ) continue;
+
+ // Add SMART-specific, page 4 registers to FFDC
+ errlHndl_t mainErrl = nullptr;
+ mainErrl = ServiceGeneratorClass::ThisServiceGenerator().getErrl();
+ if ( nullptr == mainErrl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to get the global error log." );
+ continue;
+ }
+ PlatServices::nvdimmAddFfdc( dimm, mainErrl );
+
// De-assert the EVENT_N pin by setting bit 2 in NVDIMM_MGT_CMD1
l_rc = __deassertEventN( dimm );
if ( SUCCESS != l_rc ) continue;
@@ -762,7 +1372,7 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
// Read the Module Health Register (0xA0) 7:0
errlHndl_t errl = deviceRead( dimm, &data, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH) );
+ DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to read Module Health Register. "
@@ -775,6 +1385,30 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
// BIT 0: Persistency Lost
if ( bitList.count(0) )
{
+ // Analyze Health Status0 Reg, Health Status1 Reg,
+ // and Error Threshold Status Reg
+ l_rc = __analyzeHealthStatus0Reg( io_sc, dimm, errFound );
+ if ( SUCCESS != l_rc ) continue;
+ l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
+ if ( SUCCESS != l_rc ) continue;
+ bool esTempErr = false;
+ l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr);
+ if ( SUCCESS != l_rc ) continue;
+
+ // If we hit an ES temperature error and have not yet hit threshold,
+ // then keep the log hidden.
+ if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue;
+
+ // If we didn't find any error, then keep the log hidden.
+ if ( !errFound )
+ {
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_FirEvntGone );
+ // Callout NVDIMM
+ io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
+ continue;
+ }
+
// EVENT_N cannot be retriggered on a new PERSISTENCY_LOST_ERROR
// if a previous PERSISTENCY_LOST_ERROR still exists. Meaning, we
// cannot detect/report multiple errors that happen at different
@@ -782,43 +1416,77 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
// and make the log predictive.
io_sc.service_data->SetThresholdMaskId(0);
- // Send persistency lost message to PHYP
- l_rc = PlatServices::nvdimmNotifyPhypProtChange( dimm,
- NVDIMM::UNPROTECTED_BECAUSE_ERROR );
+ // Send message to PHYP that save/restore may work
+ l_rc = PlatServices::nvdimmNotifyProtChange( dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
if ( SUCCESS != l_rc ) continue;
- // Analyze Health Status0 Reg, Health Status1 Reg,
- // and Error Theshold Status Reg
- l_rc = __analyzeHealthStatus0Reg( io_sc, dimm );
- if ( SUCCESS != l_rc ) continue;
- l_rc = __analyzeHealthStatus1Reg( io_sc, dimm );
- if ( SUCCESS != l_rc ) continue;
- l_rc = __analyzeErrorThrStatusReg( io_sc, dimm );
+ }
+ // BIT 1: Warning Threshold Exceeded
+ else if ( bitList.count(1) )
+ {
+ l_rc = __analyzeWarningThrStatusReg( io_sc, dimm, errFound );
if ( SUCCESS != l_rc ) continue;
+
+ if ( !errFound )
+ {
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_FirEvntGone );
+ // Callout NVDIMM
+ io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
+ continue;
+ }
}
- // BIT 1: Warning Threshold Exceeded -- ignore
// BIT 2: Persistency Restored
- if ( bitList.count(2) )
+ else if ( bitList.count(2) )
{
// It would be rare to have an intermittent error that comes and
// goes so fast we only see PERSISTENCY_RESTORED and not
// PERSISTENCY_LOST_ERROR. Set predictive on threshold of 32
// per day (rule code handles the thresholding), else just keep
// as a hidden log.
- io_sc.service_data->AddSignatureList( dimm, PRDFSIG_NvdimmPersRes );
+ __addSignature( io_sc, i_chip->getTrgt(), errFound,
+ PRDFSIG_NvdimmPersRes );
+
+ // Callout NVDIMM
+ io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
+ }
+ // BIT 3: Below Warning Threshold
+ else if ( bitList.count(3) )
+ {
+ // Much like the persistency restored bit above, we don't expect
+ // to see this, so just make a hidden log.
+ __addSignature( io_sc, i_chip->getTrgt(), errFound,
+ PRDFSIG_BelowWarnTh );
+
+ // Callout NVDIMM
+ io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
+ }
+ // BIT 4: Hardware Failure -- ignore - no logic feeding this
+ // BIT 5: EVENT_N_LOW -- ignore
+ // BIT 6:7: Unused
+
+ // If we reach a threshold on MCACALFIR[8] of 32 per day, we assume
+ // some intermittent error must be triggering the FIR that isn't a
+ // persistency lost error which would cause us to mask. The rule code
+ // handles the actual thresholding here.
+ if ( io_sc.service_data->IsAtThreshold() && !errFound )
+ {
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_IntNvdimmErr );
// callout NVDIMM high, cable high, BPM high, no gard
io_sc.service_data->SetCallout( dimm, MRU_HIGH, NO_GARD );
l_rc = __addBpmCallout( dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != l_rc ) continue;
- l_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH );
+ l_rc = __addNvdimmCableCallout( dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != l_rc ) continue;
- }
- // BIT 3: Below Warning Threshold -- ignore
- // BIT 4: Hardware Failure -- ignore
- // BIT 5: EVENT_N_LOW -- ignore
- // BIT 6:7: Unused
+ // Send message to PHYP that save/restore may work
+ l_rc = PlatServices::nvdimmNotifyProtChange( dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != l_rc ) continue;
+ }
}
#else // IPL only
@@ -826,7 +1494,14 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
PRDF_ERR( PRDF_FUNC "Unexpected call to analyze NVDIMMs at IPL." );
io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD );
- #endif
+ #endif // end runtime vs IPL check
+
+ #else // CONFIG_NVDIMM not defined
+
+ PRDF_ERR( PRDF_FUNC "CONFIG_NVDIMM not defined." );
+ io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD );
+
+ #endif // end CONFIG_NVDIMM check
return SUCCESS; // nothing to return to rule code
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C
index 4a4391c0c..0e11b1a86 100644
--- a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C
+++ b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2018 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -301,9 +301,9 @@ int32_t commandAddrTimeout( ExtensibleChip * i_chip,
// was executed. Restarting the command will likely fail with the same
// issue. Callout and gard all MCAs in which the command was executed.
- std::vector<ExtensibleChip *> mcaList;
+ ExtensibleChipList mcaList;
- if ( SUCCESS != getMcbistMaintPort(i_chip, mcaList) )
+ if ( SUCCESS != getMcbistMaintPort<TYPE_MCBIST>(i_chip, mcaList) )
{
PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed",
i_chip->getHuid() );
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H
index 4a284253a..44ef77ec7 100644
--- a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H
+++ b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016 */
+/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -36,6 +36,7 @@
// Platform includes
#include <prdfMemTdCtlr.H>
#include <prdfPlatServices.H>
+#include <prdfThresholdUtils.H>
namespace PRDF
{
@@ -81,6 +82,24 @@ class McbistDataBundle : public DataBundle
/** The Targeted Diagnostics controller. */
MemTdCtlr<TARGETING::TYPE_MCBIST> * iv_tdCtlr = nullptr;
+
+ public: // instance variables
+ #ifdef __HOSTBOOT_RUNTIME
+
+ // These are used to limit the number of times a scrub command will stop
+ // on a UE or CE on a rank. This is to prevent potential flooding of
+ // maintenance UEs or CEs. The threshold will be 16 per rank for each.
+ TimeBasedThreshold iv_ueStopCounter =
+ TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );
+ TimeBasedThreshold iv_ceStopCounter =
+ TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );
+
+ // If we stop on a UE or a CE, we will need to store the rank that the
+ // error is on so that we can clear our respective thresholds if the
+ // next error we stop on is on a different rank.
+ MemRank iv_ceUeRank;
+
+ #endif
};
/**
diff --git a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C
index ef3a143eb..fc389000a 100644
--- a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C
+++ b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C
@@ -99,7 +99,7 @@ void commitErrl( errlHndl_t i_errl, TargetHandle_t i_trgt )
template<TARGETING::TYPE T>
void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt,
- TargetHandle_t i_dimmTrgt )
+ TargetHandle_t i_dimmTrgt, bool i_nvdimmNoGard = false )
{
#define PRDF_FUNC "[RDR::__calloutDimm] "
@@ -109,9 +109,31 @@ void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt,
PRDF_ASSERT( nullptr != i_dimmTrgt );
PRDF_ASSERT( TYPE_DIMM == getTargetType(i_dimmTrgt) );
- // Callout the DIMM.
+ HWAS::DeconfigEnum deconfigPolicy = HWAS::DELAYED_DECONFIG;
+ HWAS::GARD_ErrorType gardPolicy = HWAS::GARD_Predictive;
+
+ #ifdef CONFIG_NVDIMM
+ // For the "RDR: All repairs used" case, if the DIMM is an NVDIMM, change
+ // the gard and deconfig options to no gard/deconfig and call
+ // nvdimmNotifyProtChange to indicate a save/restore may work.
+ if ( i_nvdimmNoGard )
+ {
+ deconfigPolicy = HWAS::NO_DECONFIG;
+ gardPolicy = HWAS::GARD_NULL;
+
+ uint32_t l_rc = PlatServices::nvdimmNotifyProtChange( i_dimmTrgt,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_TRAC( PRDF_FUNC "nvdimmNotifyProtChange(0x%08x) "
+ "failed.", PlatServices::getHuid(i_dimmTrgt) );
+ }
+ }
+ #endif
+
io_errl->addHwCallout( i_dimmTrgt, HWAS::SRCI_PRIORITY_HIGH,
- HWAS::DELAYED_DECONFIG, HWAS::GARD_Predictive );
+ deconfigPolicy, gardPolicy );
+
// Clear the VPD on this DIMM. The DIMM has been garded, but it is possible
// the customer will want to ungard the DIMM. Without clearing the VPD, the
@@ -120,16 +142,20 @@ void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt,
// customer takes the risk of ungarding the DIMM (that they should replace),
// the repairs will need to be rediscovered.
- std::vector<MemRank> ranks;
- getMasterRanks<T>( i_portTrgt, ranks, getDimmSlct(i_dimmTrgt) );
-
- for ( auto & rank : ranks )
+ // Do not clear the VPD if we had an NVDIMM that we avoided garding.
+ if ( !i_nvdimmNoGard )
{
- if ( SUCCESS != clearBadDqBitmap(i_portTrgt, rank) )
+ std::vector<MemRank> ranks;
+ getMasterRanks<T>( i_portTrgt, ranks, getDimmSlct(i_dimmTrgt) );
+
+ for ( auto & rank : ranks )
{
- PRDF_ERR( PRDF_FUNC "clearBadDqBitmap(0x%08x,0x%02x) failed",
- getHuid(i_portTrgt), rank.getKey() );
- continue;
+ if ( SUCCESS != clearBadDqBitmap(i_portTrgt, rank) )
+ {
+ PRDF_ERR( PRDF_FUNC "clearBadDqBitmap(0x%08x,0x%02x) failed",
+ getHuid(i_portTrgt), rank.getKey() );
+ continue;
+ }
}
}
@@ -156,11 +182,7 @@ void commitSoftError( uint32_t i_reasonCode, TargetHandle_t i_trgt,
//------------------------------------------------------------------------------
template<TARGETING::TYPE T>
-bool processRepairedRanks( TargetHandle_t i_trgt, uint8_t i_repairedRankMask );
-
-template<>
-bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
- uint8_t i_repairedRankMask )
+bool processRepairedRanks( TargetHandle_t i_trgt, uint8_t i_repairedRankMask )
{
#define PRDF_FUNC "[processRepairedRanks] "
@@ -179,7 +201,7 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
// map value has no significance.
std::map<TargetHandle_t, uint32_t> calloutList;
- ExtensibleChip * mcaChip = (ExtensibleChip *)systemPtr->GetChip(i_trgt);
+ ExtensibleChip * chip = (ExtensibleChip *)systemPtr->GetChip(i_trgt);
for ( uint8_t r = 0; r < MASTER_RANKS_PER_PORT; ++r )
{
@@ -191,20 +213,18 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
MemRank rank ( r );
MemMark cm;
- if ( SUCCESS != MarkStore::readChipMark<TYPE_MCA>( mcaChip, rank,
- cm ) )
+ if ( SUCCESS != MarkStore::readChipMark<T>( chip, rank, cm ) )
{
- PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_MCA>(0x%08x,0x%02x) "
- "failed", mcaChip->getHuid(), rank.getKey() );
+ PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x,0x%02x) "
+ "failed", chip->getHuid(), rank.getKey() );
continue; // skip this rank
}
MemMark sm;
- if ( SUCCESS != MarkStore::readSymbolMark<TYPE_MCA>( mcaChip, rank,
- sm ) )
+ if ( SUCCESS != MarkStore::readSymbolMark<T>( chip, rank, sm ) )
{
- PRDF_ERR( PRDF_FUNC "readSymbolMark<TYPE_MCA>(0x%08x,0x%02x) "
- "failed", mcaChip->getHuid(), rank.getKey() );
+ PRDF_ERR( PRDF_FUNC "readSymbolMark<T>(0x%08x,0x%02x) "
+ "failed", chip->getHuid(), rank.getKey() );
continue; // skip this rank
}
@@ -214,9 +234,8 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
if ( NULL == errl )
{
- errl = createErrl<TYPE_MCA>( PRDF_DETECTED_FAIL_HARDWARE,
- i_trgt,
- PRDFSIG_RdrRepairsUsed );
+ errl = createErrl<T>( PRDF_DETECTED_FAIL_HARDWARE,
+ i_trgt, PRDFSIG_RdrRepairsUsed );
}
std::vector<MemSymbol> symList;
@@ -246,16 +265,21 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
// Callout all DIMMs in the map.
for ( auto const & dimm : calloutList )
{
- __calloutDimm<TYPE_MCA>( errl, i_trgt, dimm.first );
+ bool nvdimmNoGard = false;
+ #ifdef CONFIG_NVDIMM
+ if ( isNVDIMM(dimm.first) ) nvdimmNoGard = true;
+ #endif
+
+ __calloutDimm<T>( errl, i_trgt, dimm.first, nvdimmNoGard );
}
// Commit the error log, if needed.
- commitErrl<TYPE_MCA>( errl, i_trgt );
+ commitErrl<T>( errl, i_trgt );
// Commit an additional error log indicating something failed in the
// analysis, if needed.
- commitSoftError<TYPE_MCA>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt,
- PRDFSIG_RdrInternalFail, analysisErrors );
+ commitSoftError<T>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt,
+ PRDFSIG_RdrInternalFail, analysisErrors );
}while(0);
return o_calloutMade;
@@ -263,6 +287,14 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
#undef PRDF_FUNC
}
+
+template
+bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt,
+ uint8_t i_repairedRankMask );
+template
+bool processRepairedRanks<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt,
+ uint8_t i_repairedRankMask );
+
//------------------------------------------------------------------------------
template<>
@@ -368,7 +400,12 @@ bool processRepairedRanks<TYPE_MBA>( TargetHandle_t i_trgt,
// Callout all DIMMs in the map.
for ( auto const & dimm : calloutList )
{
- __calloutDimm<TYPE_MBA>( errl, i_trgt, dimm.first );
+ bool nvdimmNoGard = false;
+ #ifdef CONFIG_NVDIMM
+ if ( isNVDIMM(dimm.first) ) nvdimmNoGard = true;
+ #endif
+
+ __calloutDimm<TYPE_MBA>(errl, i_trgt, dimm.first, nvdimmNoGard);
}
o_calloutMade = true;
@@ -392,10 +429,7 @@ bool processRepairedRanks<TYPE_MBA>( TargetHandle_t i_trgt,
template<TARGETING::TYPE T>
-bool processBadDimms( TargetHandle_t i_trgt, uint8_t i_badDimmMask );
-
-template<>
-bool processBadDimms<TYPE_MCA>( TargetHandle_t i_trgt, uint8_t i_badDimmMask )
+bool processBadDimms( TargetHandle_t i_trgt, uint8_t i_badDimmMask )
{
#define PRDF_FUNC "[processBadDimms] "
@@ -421,29 +455,35 @@ bool processBadDimms<TYPE_MCA>( TargetHandle_t i_trgt, uint8_t i_badDimmMask )
{
if ( NULL == errl )
{
- errl = createErrl<TYPE_MCA>( PRDF_DETECTED_FAIL_HARDWARE,
- i_trgt, PRDFSIG_RdrRepairUnavail );
+ errl = createErrl<T>( PRDF_DETECTED_FAIL_HARDWARE,
+ i_trgt, PRDFSIG_RdrRepairUnavail );
}
- __calloutDimm<TYPE_MCA>( errl, i_trgt, dimm );
+ __calloutDimm<T>( errl, i_trgt, dimm );
o_calloutMade = true;
}
}
// Commit the error log, if needed.
- commitErrl<TYPE_MCA>( errl, i_trgt );
+ commitErrl<T>( errl, i_trgt );
// Commit an additional error log indicating something failed in the
// analysis, if needed.
- commitSoftError<TYPE_MCA>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt,
- PRDFSIG_RdrInternalFail, analysisErrors );
+ commitSoftError<T>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt,
+ PRDFSIG_RdrInternalFail, analysisErrors );
return o_calloutMade;
#undef PRDF_FUNC
}
+template
+bool processBadDimms<TYPE_MCA>( TargetHandle_t i_trgt, uint8_t i_badDimmMask );
+template
+bool processBadDimms<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt,
+ uint8_t i_badDimmMask );
+
//------------------------------------------------------------------------------
template<>
@@ -580,6 +620,25 @@ void deployDramSpares<TYPE_MBA>( TargetHandle_t i_trgt,
}
}
+template<>
+void deployDramSpares<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt,
+ const std::vector<MemRank> & i_ranks )
+{
+ for ( auto & rank : i_ranks )
+ {
+ MemSymbol sym = MemSymbol::fromSymbol( i_trgt, rank, 71 );
+
+ int32_t l_rc = mssSetSteerMux<TYPE_OCMB_CHIP>(i_trgt, rank, sym, false);
+ if ( SUCCESS != l_rc )
+ {
+ // mssSetSteerMux() will print a trace and commit the error log,
+ // however, we need to handle the return code or we get a compile
+ // warning in Hostboot.
+ continue;
+ }
+ }
+}
+
} // end namespace RDR
//------------------------------------------------------------------------------
@@ -680,6 +739,8 @@ template
uint32_t restoreDramRepairs<TYPE_MCA>( TargetHandle_t i_trgt );
template
uint32_t restoreDramRepairs<TYPE_MBA>( TargetHandle_t i_trgt );
+template
+uint32_t restoreDramRepairs<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt );
//------------------------------------------------------------------------------
OpenPOWER on IntegriCloud