diff options
Diffstat (limited to 'src/usr/diag/prdf/plat/mem')
21 files changed, 2637 insertions, 432 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd.H b/src/usr/diag/prdf/plat/mem/prdfMemDsd.H index 5990a902e..063e92775 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDsd.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2018 */ +/* Contributors Listed Below - COPYRIGHT 2018,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -47,7 +47,7 @@ class DsdEvent : public TdEntry /** * @brief Constructor - * @param i_chip MCA or MBA. + * @param i_chip MCA, MBA, or OCMB. * @param i_rank Rank reporting chip mark. */ DsdEvent<T>( ExtensibleChip * i_chip, const MemRank & i_rank, diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C index 70a6be7f2..9dbaeeb3c 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd_ipl.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2018 */ +/* Contributors Listed Below - COPYRIGHT 2018,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -30,6 +30,8 @@ #include <prdfMemDqBitmap.H> #include <prdfMemDsd.H> +#include <hwp_wrappers.H> + using namespace TARGETING; namespace PRDF @@ -37,18 +39,12 @@ namespace PRDF using namespace PlatServices; -//############################################################################## -// -// Specializations for MBA -// -//############################################################################## - -template<> -uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<TARGETING::TYPE T> +uint32_t DsdEvent<T>::checkEcc( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::checkEcc] " + #define PRDF_FUNC "[DsdEvent<T>::checkEcc] " uint32_t o_rc = SUCCESS; @@ -71,7 +67,7 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, // At this point we don't actually have an address for the UE. The // best we can do is get the address in which the command stopped. 
MemAddr addr; - o_rc = getMemMaintAddr<TYPE_MBA>( iv_chip, addr ); + o_rc = getMemMaintAddr<T>( iv_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -79,8 +75,8 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, break; } - o_rc = MemEcc::handleMemUe<TYPE_MBA>( iv_chip, addr, - UE_TABLE::SCRUB_UE, io_sc ); + o_rc = MemEcc::handleMemUe<T>( iv_chip, addr, + UE_TABLE::SCRUB_UE, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed", @@ -101,12 +97,12 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, //------------------------------------------------------------------------------ -template<> -uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<TARGETING::TYPE T> +uint32_t DsdEvent<T>::verifySpare( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::verifySpare] " + #define PRDF_FUNC "[DsdEvent<T>::verifySpare] " uint32_t o_rc = SUCCESS; @@ -166,7 +162,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, PRDFSIG_DsdDramSpared ); // Remove the chip mark. 
- o_rc = MarkStore::clearChipMark<TYPE_MBA>( iv_chip, iv_rank ); + o_rc = MarkStore::clearChipMark<T>( iv_chip, iv_rank ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "clearChipMark(0x%08x,0x%02x) failed", @@ -190,7 +186,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, template<> uint32_t DsdEvent<TYPE_MBA>::startCmd() { - #define PRDF_FUNC "[DsdEvent::startCmd] " + #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::startCmd] " uint32_t o_rc = SUCCESS; @@ -231,7 +227,54 @@ uint32_t DsdEvent<TYPE_MBA>::startCmd() //------------------------------------------------------------------------------ template<> -uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) +uint32_t DsdEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[DsdEvent<TYPE_OCMB_CHIP>::startCmd] " + + uint32_t o_rc = SUCCESS; + + #ifdef CONFIG_AXONE + + mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Start the steer cleanup procedure on this master rank. + o_rc = startTdSteerCleanup<TYPE_OCMB_CHIP>( iv_chip, iv_rank, + MASTER_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdSteerCleanup(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + } + break; + + case TD_PHASE_2: + // Start the superfast read procedure on this master rank. 
+ o_rc = startTdSfRead<TYPE_OCMB_CHIP>( iv_chip, iv_rank, MASTER_RANK, + stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdSfRead(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + } + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + #endif + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<TARGETING::TYPE T> +uint32_t DsdEvent<T>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) { uint32_t signature = 0; @@ -260,5 +303,9 @@ uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ +// Avoid linker errors with the template. +template class DsdEvent<TYPE_MBA>; +template class DsdEvent<TYPE_OCMB_CHIP>; + } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C index 42b7eb9fc..1478a666d 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd_rt.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2018 */ +/* Contributors Listed Below - COPYRIGHT 2018,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -29,6 +29,8 @@ #include <prdfCenMbaExtraSig.H> #include <prdfMemDsd.H> +#include <hwp_wrappers.H> + using namespace TARGETING; namespace PRDF @@ -36,18 +38,12 @@ namespace PRDF using namespace PlatServices; -//############################################################################## -// -// Specializations for MBA -// -//############################################################################## - -template<> -uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<TARGETING::TYPE T> +uint32_t DsdEvent<T>::checkEcc( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::checkEcc] " + #define PRDF_FUNC "[DsdEvent<T>::checkEcc] " uint32_t o_rc = SUCCESS; @@ -64,7 +60,7 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, // At this point we don't actually have an address for the UE. The // best we can do is get the address in which the command stopped. MemAddr addr; - o_rc = getMemMaintAddr<TYPE_MBA>( iv_chip, addr ); + o_rc = getMemMaintAddr<T>( iv_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -72,8 +68,8 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, break; } - o_rc = MemEcc::handleMemUe<TYPE_MBA>( iv_chip, addr, - UE_TABLE::SCRUB_UE, io_sc ); + o_rc = MemEcc::handleMemUe<T>( iv_chip, addr, + UE_TABLE::SCRUB_UE, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed", @@ -83,7 +79,7 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, // Because of the UE, any further TPS requests will likely have no // effect. So ban all subsequent requests. - MemDbUtils::banTps<TYPE_MBA>( iv_chip, addr.getRank() ); + MemDbUtils::banTps<T>( iv_chip, addr.getRank() ); // Leave the mark in place and abort this procedure. 
o_done = true; break; @@ -114,12 +110,12 @@ uint32_t DsdEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, //------------------------------------------------------------------------------ -template<> -uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<TARGETING::TYPE T> +uint32_t DsdEvent<T>::verifySpare( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::verifySpare] " + #define PRDF_FUNC "[DsdEvent<T>::verifySpare] " uint32_t o_rc = SUCCESS; @@ -134,7 +130,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, // error (i.e. a UE). bool lastAddr = false; - o_rc = didCmdStopOnLastAddr<TYPE_MBA>( iv_chip, MASTER_RANK, lastAddr ); + o_rc = didCmdStopOnLastAddr<T>( iv_chip, MASTER_RANK, lastAddr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "didCmdStopOnLastAddr(0x%08x) failed", @@ -155,7 +151,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_DsdDramSpared ); // Remove the chip mark. 
- o_rc = MarkStore::clearChipMark<TYPE_MBA>( iv_chip, iv_rank ); + o_rc = MarkStore::clearChipMark<T>( iv_chip, iv_rank ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "clearChipMark(0x%08x,0x%02x) failed", @@ -179,7 +175,7 @@ uint32_t DsdEvent<TYPE_MBA>::verifySpare( const uint32_t & i_eccAttns, template<> uint32_t DsdEvent<TYPE_MBA>::startCmd() { - #define PRDF_FUNC "[DsdEvent::startCmd] " + #define PRDF_FUNC "[DsdEvent<TYPE_MBA>::startCmd] " uint32_t o_rc = SUCCESS; @@ -224,7 +220,38 @@ uint32_t DsdEvent<TYPE_MBA>::startCmd() //------------------------------------------------------------------------------ template<> -uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) +uint32_t DsdEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[DsdEvent<TYPE_OCMB_CHIP>::startCmd] " + + uint32_t o_rc = SUCCESS; + + #ifdef CONFIG_AXONE + + mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond; + + stopCond.set_pause_on_ue(mss::ON); + + // Start the time based scrub procedure on this master rank. + o_rc = startTdScrub<TYPE_OCMB_CHIP>( iv_chip, iv_rank, MASTER_RANK, + stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + } + + #endif + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<TARGETING::TYPE T> +uint32_t DsdEvent<T>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) { uint32_t signature = 0; @@ -258,5 +285,9 @@ uint32_t DsdEvent<TYPE_MBA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ +// Avoid linker errors with the template. 
+template class DsdEvent<TYPE_MBA>; +template class DsdEvent<TYPE_OCMB_CHIP>; + } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C index 41b0de3ea..40653ee09 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2017,2018 */ +/* Contributors Listed Below - COPYRIGHT 2017,2020 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -64,7 +64,7 @@ bool isEnabled() !isMfgAvpEnabled() && !isMfgHdatAvpEnabled() ); } -int32_t __getAddrConfig( ExtensibleChip * i_mcaChip, uint8_t i_dslct, +int32_t __getAddrConfig( ExtensibleChip * i_chip, uint8_t i_dslct, bool & o_twoDimmConfig, uint8_t & o_mrnkBits, uint8_t & o_srnkBits, uint8_t & o_extraRowBits ) { @@ -72,12 +72,12 @@ int32_t __getAddrConfig( ExtensibleChip * i_mcaChip, uint8_t i_dslct, int32_t o_rc = SUCCESS; - SCAN_COMM_REGISTER_CLASS * reg = i_mcaChip->getRegister( "MC_ADDR_TRANS" ); + SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( "MC_ADDR_TRANS" ); o_rc = reg->Read(); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS: i_mcaChip=0x%08x", - i_mcaChip->getHuid() ); + PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS: i_chip=0x%08x", + i_chip->getHuid() ); return o_rc; } @@ -98,8 +98,8 @@ int32_t __getAddrConfig( ExtensibleChip * i_mcaChip, uint8_t i_dslct, // for some reason B2 is valid, there is definitely a bug. if ( reg->IsBitSet(i_dslct ? 
28:12) ) { - PRDF_ERR( PRDF_FUNC "B2 enabled in MC_ADDR_TRANS: i_mcaChip=0x%08x " - "i_dslct=%d", i_mcaChip->getHuid(), i_dslct ); + PRDF_ERR( PRDF_FUNC "B2 enabled in MC_ADDR_TRANS: i_chip=0x%08x " + "i_dslct=%d", i_chip->getHuid(), i_dslct ); return FAIL; } @@ -386,7 +386,7 @@ int32_t __getPortAddr<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr, // Local vars for address fields uint64_t col = reverseBits(i_addr.getCol(), 7); // C9 C8 C7 C6 C5 C4 C3 uint64_t row = reverseBits(i_addr.getRow(), 18); // R17 R16 R15 .. R1 R0 - uint64_t bnk = i_addr.getBank(); // BG0 BG1 B0 B1 B2 + uint64_t bnk = i_addr.getBank(); // B0 B1 B2 BG0 BG1 uint64_t srnk = i_addr.getRank().getSlave(); // S0 S1 S2 uint64_t mrnk = i_addr.getRank().getRankSlct(); // M0 M1 uint64_t dslct = i_addr.getRank().getDimmSlct(); // D @@ -473,6 +473,266 @@ int32_t __getPortAddr<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr, return o_rc; } +void __adjustCapiAddrBitPos( uint8_t & io_bitPos ) +{ + // Note: the translation bitmaps are all 5 bits that are defined + // consistently as: + // 00000 = CAPI_Address(5) + // 00001 = CAPI_Address(6) + // 00010 = CAPI_Address(7) + // ... + // 01010 = CAPI_Address(15) + // 01011 = CAPI_Address(31) + // 01100 = CAPI_Address(32) + // ... + // 10011 = CAPI_Address(39) + // So the value from the regs can be converted to the CAPI address bit pos + // by adding 5 if the value is less than or equal to 10, or by adding 20 + // if it is above 10. + + if ( io_bitPos <= 10 ) + { + io_bitPos += 5; + } + else + { + io_bitPos += 20; + } +} + +template <> +int32_t __getPortAddr<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, MemAddr i_addr, + uint64_t & o_addr ) +{ + #define PRDF_FUNC "[MemDealloc::__getPortAddr<TYPE_OCMB_CHIP>] " + + int32_t o_rc = SUCCESS; + + o_addr = 0; + + // Local vars for address fields + uint64_t col = reverseBits(i_addr.getCol(), 7); // C9 C8 C7 C6 C5 C4 C3 + uint64_t row = reverseBits(i_addr.getRow(), 18); // R17 R16 R15 .. 
R1 R0 + uint64_t bnk = i_addr.getBank(); // B0 B1 B2 BG0 BG1 + uint64_t srnk = i_addr.getRank().getSlave(); // S0 S1 S2 + uint64_t mrnk = i_addr.getRank().getRankSlct(); // M0 M1 + uint64_t dslct = i_addr.getRank().getDimmSlct(); // D + + // Determine if a two DIMM config is used. Also, determine how many + // mrank (M0-M1), srnk (S0-S2), or extra row (R17-R15) bits are used. + bool twoDimmConfig; + uint8_t mrnkBits, srnkBits, extraRowBits; + o_rc = __getAddrConfig( i_chip, dslct, twoDimmConfig, mrnkBits, srnkBits, + extraRowBits ); + if ( SUCCESS != o_rc ) return o_rc; + + // Mask off the non-configured bits. If this address came from hardware, + // this would not be a problem. However, the get_mrank_range() and + // get_srank_range() HWPS got lazy just set the entire fields and did not + // take into account the actual bit ranges. + mrnk = __maskBits( mrnk, mrnkBits ); + srnk = __maskBits( srnk, srnkBits ); + row = __maskBits( row, 15 + extraRowBits ); + + // Insert the needed bits based on the config defined in the MC Address + // Translation Registers. + + uint8_t bitPos = 0; + + // Split the row into its components. 
+ uint64_t r17 = (row & 0x20000) >> 17; + uint64_t r16 = (row & 0x10000) >> 16; + uint64_t r15 = (row & 0x08000) >> 15; + uint64_t r14_r0 = (row & 0x07fff); + + // Split mrank/srank. Fields are uint64_t since they shift past bit 31. + uint64_t m0 = (mrnk & 0x2) >> 1; + uint64_t m1 = (mrnk & 0x1); + + uint64_t s0 = (srnk & 0x4) >> 2; + uint64_t s1 = (srnk & 0x2) >> 1; + uint64_t s2 = (srnk & 0x1); + + // Split the column into its components + uint64_t c9 = (col & 0x40) >> 6; + uint64_t c8 = (col & 0x20) >> 5; + uint64_t c7 = (col & 0x10) >> 4; + uint64_t c6 = (col & 0x08) >> 3; + uint64_t c5 = (col & 0x04) >> 2; + uint64_t c4 = (col & 0x02) >> 1; + uint64_t c3 = (col & 0x01); + + // Split the bank and bank group into their components + // Note: B2 is not used for OCMB + uint64_t b0 = (bnk & 0x10) >> 4; + uint64_t b1 = (bnk & 0x08) >> 3; + + uint64_t bg0 = (bnk & 0x2) >> 1; + uint64_t bg1 = (bnk & 0x1); + + // Row bits 14:0 are always at CAPI addr position 30:16 + o_addr |= (r14_r0 << 16); + + // Check MC_ADDR_TRANS0 register for bit positions + SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( "MC_ADDR_TRANS" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS: i_chip=0x%08x", + i_chip->getHuid() ); + return o_rc; + } + + // If the DIMM select is valid, insert that bit + if ( twoDimmConfig ) + { + // DIMM bitmap: MC_ADDR_TRANS0[33:37] + bitPos = reg->GetBitFieldJustified( 33, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (dslct << bitPos); + } + + // Insert any of the master rank bits that are valid + switch( mrnkBits ) + { + case 2: + // Master rank 0 bitmap: MC_ADDR_TRANS0[38:42] + bitPos = reg->GetBitFieldJustified( 38, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (m0 << bitPos); + case 1: + // Master rank 1 bitmap: MC_ADDR_TRANS0[43:47] + bitPos = reg->GetBitFieldJustified( 43, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (m1 << bitPos); + break; + } + + // Insert any extra row bits (17:15) that are valid + 
switch ( extraRowBits ) + { + case 3: + // Row 17 bitmap: MC_ADDR_TRANS0[49:53] + bitPos = reg->GetBitFieldJustified( 49, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (r17 << bitPos); + case 2: + // Row 16 bitmap: MC_ADDR_TRANS0[54:58] + bitPos = reg->GetBitFieldJustified( 54, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (r16 << bitPos); + case 1: + // Row 15 bitmap: MC_ADDR_TRANS0[59:63] + bitPos = reg->GetBitFieldJustified( 59, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (r15 << bitPos); + break; + } + + // Check MC_ADDR_TRANS1 register for bit positions + reg = i_chip->getRegister( "MC_ADDR_TRANS1" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS1: i_chip=0x%08x", + i_chip->getHuid() ); + return o_rc; + } + + // Insert any of the slave rank bits that are valid + switch ( srnkBits ) + { + case 3: + // Slave rank 0 bitmap: MC_ADDR_TRANS1[3:7] + bitPos = reg->GetBitFieldJustified( 3, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (s0 << bitPos); + case 2: + // Slave rank 1 bitmap: MC_ADDR_TRANS1[11:15] + bitPos = reg->GetBitFieldJustified( 11, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (s1 << bitPos); + case 1: + // Slave rank 2 bitmap: MC_ADDR_TRANS1[19:23] + bitPos = reg->GetBitFieldJustified( 19, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (s2 << bitPos); + break; + } + + // Column 3 bitmap: MC_ADDR_TRANS1[30:34] + bitPos = reg->GetBitFieldJustified( 30, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (c3 << bitPos); + + // Column 4 bitmap: MC_ADDR_TRANS1[35:39] + bitPos = reg->GetBitFieldJustified( 35, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (c4 << bitPos); + + // Column 5 bitmap: MC_ADDR_TRANS1[43:47] + bitPos = reg->GetBitFieldJustified( 43, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (c5 << bitPos); + + // Column 6 bitmap: MC_ADDR_TRANS1[51:55] + bitPos = reg->GetBitFieldJustified( 51, 5 ); + __adjustCapiAddrBitPos( bitPos ); 
+ o_addr |= (c6 << bitPos); + + // Column 7 bitmap: MC_ADDR_TRANS1[59:63] + bitPos = reg->GetBitFieldJustified( 59, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (c7 << bitPos); + + // Check MC_ADDR_TRANS2 register for bit positions + reg = i_chip->getRegister( "MC_ADDR_TRANS2" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read failed on MC_ADDR_TRANS2: i_chip=0x%08x", + i_chip->getHuid() ); + return o_rc; + } + + // Column 8 bitmap: MC_ADDR_TRANS2[3:7] + bitPos = reg->GetBitFieldJustified( 3, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (c8 << bitPos); + + // Column 9 bitmap: MC_ADDR_TRANS2[11:15] + bitPos = reg->GetBitFieldJustified( 11, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (c9 << bitPos); + + // Bank 0 bitmap: MC_ADDR_TRANS2[19:23] + bitPos = reg->GetBitFieldJustified( 19, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (b0 << bitPos ); + + // Bank 1 bitmap: MC_ADDR_TRANS2[27:31] + bitPos = reg->GetBitFieldJustified( 27, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (b1 << bitPos); + + // Bank 2 bitmap: MC_ADDR_TRANS2[35:39] + // Note: Bank2 not used for OCMB + + // Bank group 0 bitmap: MC_ADDR_TRANS2[43:47] + bitPos = reg->GetBitFieldJustified( 43, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (bg0 << bitPos); + + // Bank group 1 bitmap: MC_ADDR_TRANS2[51:55] + bitPos = reg->GetBitFieldJustified( 51, 5 ); + __adjustCapiAddrBitPos( bitPos ); + o_addr |= (bg1 << bitPos); + + return o_rc; + + #undef PRDF_FUNC +} + template <> int32_t __getPortAddr<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr, uint64_t & o_addr ) @@ -566,12 +826,12 @@ int32_t __getPortAddr<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr, //------------------------------------------------------------------------------ template<TYPE T> -void __getGrpPrms( ExtensibleChip * i_chip, uint8_t o_portPos, +void __getGrpPrms( ExtensibleChip * i_chip, uint8_t & o_portPos, SCAN_COMM_REGISTER_CLASS * &o_mcfgp, 
SCAN_COMM_REGISTER_CLASS * &o_mcfgpm ); template<> -void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t o_portPos, +void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t & o_portPos, SCAN_COMM_REGISTER_CLASS * &o_mcfgp, SCAN_COMM_REGISTER_CLASS * &o_mcfgpm ) { @@ -585,7 +845,33 @@ void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t o_portPos, } template<> -void __getGrpPrms<TYPE_MBA>( ExtensibleChip * i_chip, uint8_t o_portPos, +void __getGrpPrms<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, uint8_t & o_portPos, + SCAN_COMM_REGISTER_CLASS * &o_mcfgp, + SCAN_COMM_REGISTER_CLASS * &o_mcfgpm ) +{ + // Get the connected parent MI; + ExtensibleChip * mcc = getConnectedParent( i_chip, TYPE_MCC ); + ExtensibleChip * mi = getConnectedParent( mcc, TYPE_MI ); + + // TODO RTC 210072 - support for multiple ports + o_portPos = 0; + + // Get the position of the MCC relative to the MI (0:1) + uint8_t chnlPos = mcc->getPos() % MAX_MCC_PER_MI; + + char mcfgpName[64]; + sprintf( mcfgpName, "MCFGP%d", chnlPos ); + + char mcfgpmName[64]; + sprintf( mcfgpmName, "MCFGPM%d", chnlPos ); + + o_mcfgp = mi->getRegister( mcfgpName ); + o_mcfgpm = mi->getRegister( mcfgpmName ); + +} + +template<> +void __getGrpPrms<TYPE_MBA>( ExtensibleChip * i_chip, uint8_t & o_portPos, SCAN_COMM_REGISTER_CLASS * &o_mcfgp, SCAN_COMM_REGISTER_CLASS * &o_mcfgpm ) { @@ -686,12 +972,67 @@ uint32_t __getGrpInfo( ExtensibleChip * i_chip, uint64_t & o_grpChnls, #undef PRDF_FUNC } +template<> +uint32_t __getGrpInfo<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + uint64_t & o_grpChnls, + uint64_t & o_grpId, uint64_t & o_grpSize, + uint64_t & o_grpBar ) +{ + #define PRDF_FUNC "[MemDealloc::__getGrpInfo] " + + uint32_t o_rc = SUCCESS; + + do + { + // Get portPos and MCFGP/M registers + uint8_t portPos = 0xFF; + SCAN_COMM_REGISTER_CLASS * mcfgp = nullptr; + SCAN_COMM_REGISTER_CLASS * mcfgpm = nullptr; + __getGrpPrms<TYPE_OCMB_CHIP>( i_chip, portPos, mcfgp, mcfgpm ); + + o_rc = mcfgp->Read(); if ( 
SUCCESS != o_rc ) break; + + // Get the number of channels in this group: MCFGP[40:42] + uint8_t mcGrpCnfg = mcfgp->GetBitFieldJustified( 40, 3 ); + switch ( mcGrpCnfg ) + { + case 0: o_grpChnls = 8; break; // 8MCS + case 1: o_grpChnls = 1; break; // 1MCS + case 2: o_grpChnls = 2; break; // 2MCS + case 3: o_grpChnls = 3; break; // 3MCS + case 4: o_grpChnls = 4; break; // 4MCS + case 5: o_grpChnls = 6; break; // 6MCS + default: + PRDF_ERR( PRDF_FUNC "Invalid MC channels per group value: 0x%x " + "on 0x%08x", mcGrpCnfg, i_chip->getHuid() ); + o_rc = FAIL; + } + if ( SUCCESS != o_rc ) break; + + // Get the group ID and group size. + o_grpId = mcfgp->GetBitFieldJustified( 43, 3 ); // MCFGP[43:45] + o_grpSize = mcfgp->GetBitFieldJustified( 25, 15 ); // MCFGP[25:39] + + // TODO RTC 210072 - support for multiple ports, see generic handling + + // Get the base address (BAR). + // Channel 0 is always from the MCFGP. + o_grpBar = mcfgp->GetBitFieldJustified(1, 24); // MCFGP[1:24] + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ -uint32_t __insertGrpId( uint64_t & io_addr, uint64_t i_grpChnls, - uint64_t i_grpId ) +template <TYPE T> +uint32_t __insertGrpId( ExtensibleChip * i_chip, uint64_t & io_addr, + uint64_t i_grpChnls, uint64_t i_grpId ) { - #define PRDF_FUNC "[MemDealloc::__insertGrpId] " + #define PRDF_FUNC "[MemDealloc::__insertGrpId<T>] " uint32_t o_rc = SUCCESS; @@ -742,6 +1083,108 @@ uint32_t __insertGrpId( uint64_t & io_addr, uint64_t i_grpChnls, #undef PRDF_FUNC } +template<> +uint32_t __insertGrpId<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + uint64_t & io_addr, uint64_t i_grpChnls, + uint64_t i_grpId ) +{ + #define PRDF_FUNC "[MemDealloc::__insertGrpId<TYPE_OCMB_CHIP>] " + + uint32_t o_rc = SUCCESS; + + uint64_t upper33 = io_addr & 0xFFFFFFFF80ull; + uint64_t lower7 = io_addr & 0x000000007full; + + bool subChanAEnable = false; + bool subChanBEnable = false; + bool 
bothSubChansEnabled = false; + + ExtensibleChip * mcc = getConnectedParent( i_chip, TYPE_MCC ); + + // Check both subchannels whether we can get the connected OCMB to + // determine whether they are enabled. + // Check for subchannel A + ExtensibleChip * subchanA = getConnectedChild( mcc, TYPE_OCMB_CHIP, 0 ); + if ( nullptr != subchanA ) subChanAEnable = true; + + // Check for subchannel B + ExtensibleChip * subchanB = getConnectedChild( mcc, TYPE_OCMB_CHIP, 1 ); + if ( nullptr != subchanB ) subChanBEnable = true; + + // Check if both subchannels were enabled + if ( subChanAEnable && subChanBEnable ) bothSubChansEnabled = true; + + // If both subchannels are enabled, bit 56 of the address will contain the + // subchannel select bit. + if ( bothSubChansEnabled ) + { + uint8_t ocmbChnl = i_chip->getPos() % MAX_OCMB_PER_MCC; // 0:1 + uint8_t bitInsert = 0; + + switch ( i_grpChnls ) + { + case 1: // insert 1 bit for subchannel select + case 3: + case 6: + bitInsert = ( ocmbChnl & 0x1 ); + io_addr = (upper33 << 1) | (bitInsert << 7) | lower7; + break; + + case 2: // insert 1 bit for subchannel select and 1 bit for grpId + bitInsert = ( ((i_grpId & 0x1) << 1) | (ocmbChnl & 0x1) ); + io_addr = (upper33 << 2) | (bitInsert << 7) | lower7; + break; + + case 4: // insert 1 bit for subchannel select and 2 bits for grpId + bitInsert = ( ((i_grpId & 0x3) << 1) | (ocmbChnl & 0x1) ); + io_addr = (upper33 << 3) | (bitInsert << 7) | lower7; + break; + + case 8: // insert 1 bit for subchannel select and 3 bits for grpId + bitInsert = ( ((i_grpId & 0x7) << 1) | (ocmbChnl & 0x1) ); + io_addr = (upper33 << 4) | (bitInsert << 7) | lower7; + break; + + default: + PRDF_ERR( PRDF_FUNC "Invalid MC channels per group value %d", + i_grpChnls ); + o_rc = FAIL; + } + } + else + { + switch ( i_grpChnls ) + { + case 1: // no shifting + case 3: + case 6: + break; + + case 2: // insert 1 bit + io_addr = (upper33 << 1) | ((i_grpId & 0x1) << 7) | lower7; + break; + + case 4: // insert 2 bits + io_addr 
= (upper33 << 2) | ((i_grpId & 0x3) << 7) | lower7; + break; + + case 8: // insert 3 bits + io_addr = (upper33 << 3) | ((i_grpId & 0x7) << 7) | lower7; + break; + + default: + PRDF_ERR( PRDF_FUNC "Invalid MC channels per group value %d", + i_grpChnls ); + o_rc = FAIL; + } + } + + return o_rc; + + #undef PRDF_FUNC + +} + //------------------------------------------------------------------------------ // The hardware uses a mod3 hashing algorithm to calculate which memory channel @@ -849,7 +1292,7 @@ void __addBar( uint64_t & io_addr, uint64_t i_grpBar ) template<TYPE T> uint32_t getSystemAddr( ExtensibleChip * i_chip, MemAddr i_addr, - uint64_t & o_addr ) + uint64_t & o_addr ) { #define PRDF_FUNC "[MemDealloc::getSystemAddr] " @@ -867,7 +1310,7 @@ uint32_t getSystemAddr( ExtensibleChip * i_chip, MemAddr i_addr, if ( SUCCESS != o_rc ) break; // Insert the group ID. - o_rc = __insertGrpId( o_addr, grpChnls, grpId ); + o_rc = __insertGrpId<T>( i_chip, o_addr, grpChnls, grpId ); if ( SUCCESS != o_rc ) break; // Notes on 3 and 6 channel per group configs: @@ -915,8 +1358,8 @@ uint32_t getSystemAddrRange( ExtensibleChip * i_chip, if ( SUCCESS != o_rc ) break; // Insert the group ID. 
- o_rc = __insertGrpId( o_saddr, grpChnls, grpId ); - o_rc |= __insertGrpId( o_eaddr, grpChnls, grpId ); + o_rc = __insertGrpId<T>( i_chip, o_saddr, grpChnls, grpId ); + o_rc |= __insertGrpId<T>( i_chip, o_eaddr, grpChnls, grpId ); if ( SUCCESS != o_rc ) break; // Notes on 3 and 6 channel per group configs: @@ -975,6 +1418,7 @@ int32_t page( ExtensibleChip * i_chip, MemAddr i_addr ) } template int32_t page<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr ); template int32_t page<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr ); +template int32_t page<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip, MemAddr i_addr); //------------------------------------------------------------------------------ @@ -1025,6 +1469,7 @@ int32_t rank( ExtensibleChip * i_chip, MemRank i_rank ) } template int32_t rank<TYPE_MCA>( ExtensibleChip * i_chip, MemRank i_rank ); template int32_t rank<TYPE_MBA>( ExtensibleChip * i_chip, MemRank i_rank ); +template int32_t rank<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip, MemRank i_rank); //------------------------------------------------------------------------------ @@ -1074,6 +1519,7 @@ int32_t port( ExtensibleChip * i_chip ) } template int32_t port<TYPE_MCA>( ExtensibleChip * i_chip ); template int32_t port<TYPE_MBA>( ExtensibleChip * i_chip ); +template int32_t port<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ); //------------------------------------------------------------------------------ @@ -1236,6 +1682,22 @@ int32_t dimmList( TargetHandleList & i_dimmList ) sendPredDeallocRequest( ssAddr, seAddr ); PRDF_TRAC( PRDF_FUNC "Predictive dealloc for start addr: 0x%016llx " "end addr: 0x%016llx", ssAddr, seAddr ); + + #ifdef CONFIG_NVDIMM + // If the DIMM is an NVDIMM, send a message to PHYP that a save/restore + // may work. 
+ if ( isNVDIMM(*it) ) + { + uint32_t l_rc = PlatServices::nvdimmNotifyProtChange( *it, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != l_rc ) + { + PRDF_TRAC( PRDF_FUNC "nvdimmNotifyProtChange(0x%08x) " + "failed.", getHuid(*it) ); + continue; + } + } + #endif } return o_rc; @@ -1278,6 +1740,14 @@ int32_t dimmList( TargetHandleList & i_dimmList ) break; } + // Third, check for OCMBs. + list = getConnected( dimmTrgt, TYPE_OCMB_CHIP ); + if ( !list.empty() ) + { + o_rc = dimmList<TYPE_OCMB_CHIP>( i_dimmList ); + break; + } + // If we get here we did not find a supported target. PRDF_ERR( PRDF_FUNC "Unsupported connected parent to dimm 0x%08x", getHuid(dimmTrgt) ); diff --git a/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C b/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C index 869aa92e8..b257d0874 100755 --- a/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemIplCeStats.C @@ -83,8 +83,8 @@ void MemIplCeStats<TYPE_MCA>::banAnalysis( uint8_t i_dimmSlct, //------------------------------------------------------------------------------ template<> -void MemIplCeStats<TYPE_MEM_PORT>::banAnalysis( uint8_t i_dimmSlct, - uint8_t i_portSlct ) +void MemIplCeStats<TYPE_OCMB_CHIP>::banAnalysis( uint8_t i_dimmSlct, + uint8_t i_portSlct ) { PRDF_ASSERT( i_dimmSlct < MAX_DIMM_PER_PORT ); PRDF_ASSERT( 0 == i_portSlct ); @@ -117,9 +117,9 @@ void MemIplCeStats<TYPE_MCA>::banAnalysis( uint8_t i_dimmSlct ) //------------------------------------------------------------------------------ template<> -void MemIplCeStats<TYPE_MEM_PORT>::banAnalysis( uint8_t i_dimmSlct ) +void MemIplCeStats<TYPE_OCMB_CHIP>::banAnalysis( uint8_t i_dimmSlct ) { - // Only one DIMM per DIMM select on MEM_PORT. + // Only one DIMM per DIMM select on OCMB_CHIP. 
banAnalysis( i_dimmSlct, 0 ); } @@ -481,6 +481,6 @@ void MemIplCeStats<T>::addMruAndCommitErrl( const MemoryMru & i_memmru, // need these templates to avoid linker errors template class MemIplCeStats<TYPE_MCA>; template class MemIplCeStats<TYPE_MBA>; -template class MemIplCeStats<TYPE_MEM_PORT>; +template class MemIplCeStats<TYPE_OCMB_CHIP>; } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C index 5351b842a..bececfa21 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2019 */ +/* Contributors Listed Below - COPYRIGHT 2016,2020 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -106,17 +106,6 @@ uint32_t clearCmdCompleteAttn<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ) } template<> -uint32_t clearCmdCompleteAttn<TYPE_MEM_PORT>( ExtensibleChip * i_chip ) -{ - PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() ); - - ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP ); - - return clearCmdCompleteAttn<TYPE_OCMB_CHIP>( ocmbChip ); -} - -template<> uint32_t clearCmdCompleteAttn<TYPE_MBA>( ExtensibleChip * i_chip ) { // Clear MBASPA[0,8]. 
@@ -194,17 +183,6 @@ uint32_t clearEccCounters<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ) } template<> -uint32_t clearEccCounters<TYPE_MEM_PORT>( ExtensibleChip * i_chip ) -{ - PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() ); - - ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP ); - - return clearEccCounters<TYPE_OCMB_CHIP>( ocmbChip ); -} - -template<> uint32_t clearEccCounters<TYPE_MBA>( ExtensibleChip * i_chip ) { PRDF_ASSERT( nullptr != i_chip ); @@ -306,17 +284,6 @@ uint32_t clearEccFirs<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ) } template<> -uint32_t clearEccFirs<TYPE_MEM_PORT>( ExtensibleChip * i_chip ) -{ - PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() ); - - ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP ); - - return clearEccFirs<TYPE_OCMB_CHIP>( ocmbChip ); -} - -template<> uint32_t clearEccFirs<TYPE_MBA>( ExtensibleChip * i_chip ) { uint32_t o_rc = SUCCESS; @@ -409,22 +376,20 @@ uint32_t checkEccFirs<TYPE_MCA>( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ template<> -uint32_t checkEccFirs<TYPE_MEM_PORT>( ExtensibleChip * i_chip, - uint32_t & o_eccAttns ) +uint32_t checkEccFirs<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + uint32_t & o_eccAttns ) { - #define PRDF_FUNC "[checkEccFirs<TYPE_MEM_PORT>] " + #define PRDF_FUNC "[checkEccFirs<TYPE_OCMB_CHIP>] " uint32_t o_rc = SUCCESS; o_eccAttns = MAINT_NO_ERROR; PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MEM_PORT == i_chip->getType() ); - - ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); - SCAN_COMM_REGISTER_CLASS * rdffir = ocmbChip->getRegister( "RDFFIR" ); - SCAN_COMM_REGISTER_CLASS * mcbistfir = ocmbChip->getRegister( "MCBISTFIR" ); + SCAN_COMM_REGISTER_CLASS * rdffir = i_chip->getRegister( "RDFFIR" ); + SCAN_COMM_REGISTER_CLASS * 
mcbistfir = i_chip->getRegister( "MCBISTFIR" ); do { @@ -453,7 +418,7 @@ uint32_t checkEccFirs<TYPE_MEM_PORT>( ExtensibleChip * i_chip, if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MCBISTFIR: mcbChip=0x%08x", - ocmbChip->getHuid() ); + i_chip->getHuid() ); break; } @@ -733,11 +698,11 @@ uint32_t setBgScrubThresholds<TYPE_MBA>( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ -template<> -uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip, - AddrRangeType i_rangeType, - bool & o_stoppedOnLastAddr, - bool i_rowRepair ) +template<TARGETING::TYPE T> +uint32_t didCmdStopOnLastAddr( ExtensibleChip * i_chip, + AddrRangeType i_rangeType, + bool & o_stoppedOnLastAddr, + bool i_rowRepair ) { #define PRDF_FUNC "[didCmdStopOnLastAddr] " @@ -749,7 +714,7 @@ uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip, { // Get the current address. MemAddr curAddr; - o_rc = getMemMaintAddr<TYPE_MBA>( i_chip, curAddr ); + o_rc = getMemMaintAddr<T>( i_chip, curAddr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -759,7 +724,7 @@ uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip, // Get the end address of the current rank. 
MemAddr junk, endAddr; - o_rc = getMemAddrRange<TYPE_MBA>( i_chip, curAddr.getRank(), junk, + o_rc = getMemAddrRange<T>( i_chip, curAddr.getRank(), junk, endAddr, i_rangeType ); if ( SUCCESS != o_rc ) { @@ -784,7 +749,16 @@ uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip, #undef PRDF_FUNC } - +template +uint32_t didCmdStopOnLastAddr<TYPE_MBA>( ExtensibleChip * i_chip, + AddrRangeType i_rangeType, + bool & o_stoppedOnLastAddr, + bool i_rowRepair ); +template +uint32_t didCmdStopOnLastAddr<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + AddrRangeType i_rangeType, + bool & o_stoppedOnLastAddr, + bool i_rowRepair ); //------------------------------------------------------------------------------ } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C index f86110458..5d310c51b 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2020 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -248,8 +248,8 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip, do { // Get all ports in which the command was run. 
- std::vector<ExtensibleChip *> portList; - o_rc = getMcbistMaintPort( i_chip, portList ); + ExtensibleChipList portList; + o_rc = getMcbistMaintPort<TYPE_MCBIST>( i_chip, portList ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", @@ -291,6 +291,43 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip, } template<> +uint32_t __analyzeCmdComplete<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + TdRankListEntry & o_stoppedRank, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[__analyzeCmdComplete] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = false; + + do + { + // Update iv_stoppedRank. + o_stoppedRank = __getStopRank<TYPE_OCMB_CHIP>( i_chip, i_addr ); + + // Check the OCMB for ECC errors. + bool errorsFound; + o_rc = __checkEcc<TYPE_OCMB_CHIP>( i_chip, i_addr, errorsFound, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_OCMB_CHIP>(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + if ( errorsFound ) o_errorsFound = true; + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t __analyzeCmdComplete<TYPE_MBA>( ExtensibleChip * i_chip, TdRankListEntry & o_stoppedRank, const MemAddr & i_addr, @@ -346,7 +383,7 @@ uint32_t MemTdCtlr<T>::analyzeCmdComplete( bool & o_errorsFound, // of in defaultStep() because a TD procedure could have been run // before defaultStep() and it is possible that canResumeBgScrub() // could give as a false positive in that case. - o_rc = canResumeBgScrub( iv_resumeBgScrub ); + o_rc = canResumeBgScrub( iv_resumeBgScrub, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "canResumeBgScrub(0x%08x) failed", @@ -397,9 +434,15 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, // Get the version to use. 
uint8_t version = TD_CTLR_DATA::VERSION_1; + bool isNimbus = false; if ( MODEL_NIMBUS == getChipModel(getMasterProc()) ) { version = TD_CTLR_DATA::VERSION_2; + isNimbus = true; + } + else if ( MODEL_AXONE == getChipModel(getMasterProc()) ) + { + version = TD_CTLR_DATA::VERSION_2; } // Get the IPL state. @@ -443,6 +486,11 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, if ( TD_CTLR_DATA::VERSION_2 == version ) { curPort = iv_curProcedure->getChip()->getPos() % MAX_MCA_PER_MCBIST; + if ( !isNimbus ) + { + TargetHandle_t portTrgt = iv_curProcedure->getChip()->getTrgt(); + curPort = portTrgt->getAttr<ATTR_REL_POS>(); + } } } @@ -475,6 +523,11 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, if ( TD_CTLR_DATA::VERSION_2 == version ) { itPort = queue[n]->getChip()->getPos() % MAX_MCA_PER_MCBIST; + if ( !isNimbus ) + { + TargetHandle_t portTrgt = queue[n]->getChip()->getTrgt(); + itPort = portTrgt->getAttr<ATTR_REL_POS>(); + } } bsb.setFieldJustify( pos, 3, itMrnk ); pos+=3; @@ -502,6 +555,7 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, // Avoid linker errors with the template. template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; +template class MemTdCtlr<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H index 332109b48..da969e2c1 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -54,14 +54,14 @@ class MemTdCtlr /** * @brief Constructor * - * This contructor will only be called in the MCBIST or MBA data bundle, - * which already checks for a valid type. + * This constructor will only be called in the MCBIST, MBA, or OCMB data + * bundle, which already checks for a valid type. * * Need to initialize iv_stoppedRank to a valid entry in iv_rankList. Use * the last entry in the list so that the 'next' rank is the first entry * in the list. * - * @param i_chip An MCBIST or MBA chip. + * @param i_chip An MCBIST, MBA, or OCMB chip. */ explicit MemTdCtlr( ExtensibleChip * i_chip ) : iv_chip( i_chip ), iv_rankList( i_chip ), @@ -122,7 +122,7 @@ class MemTdCtlr /** * @brief Bans TPS on the given rank. Any attempts to add a TPS procedure * to the queue for this rank will be ignored. - * @param i_chip MCA or MBA chip. + * @param i_chip MCA, MBA, or OCMB chip. * @param i_rank The target slave rank. */ void banTps( ExtensibleChip * i_chip, const MemRank & i_rank ) @@ -294,15 +294,17 @@ class MemTdCtlr /** * @param o_canResume True, if background scrubbing can be resumed. False, * if a new background scrub command must be started. + * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ - uint32_t canResumeBgScrub( bool & o_canResume ); + uint32_t canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ); #endif private: // instance variables - /** An MCBIST or MBA chip associated with this TD controller. */ + /** An MCBIST, MBA, or OCMB chip associated with this TD controller. */ ExtensibleChip * const iv_chip; /** The TD queue that contains all of the pending TD procedures.
*/ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C index ea04d2964..401a48042 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -160,6 +160,14 @@ bool __mnfgCeCheck<TYPE_MCA>( uint32_t i_eccAttns ) } template<> inline +bool __mnfgCeCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns ) +{ + return ( ( 0 != (i_eccAttns & MAINT_HARD_NCE_ETE) ) && + ( (0 != (i_eccAttns & MAINT_NCE)) || + (0 != (i_eccAttns & MAINT_TCE)) ) ); +} + +template<> inline bool __mnfgCeCheck<TYPE_MBA>( uint32_t i_eccAttns ) { return ( 0 != (i_eccAttns & MAINT_HARD_NCE_ETE) ); @@ -251,12 +259,18 @@ template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t __checkEcc<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ // Avoid linker errors with the template. 
template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; +template class MemTdCtlr<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index d52ef2d1d..5565e217f 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -107,6 +107,36 @@ void __recaptureRegs<TYPE_MCBIST>( STEP_CODE_DATA_STRUCT & io_sc, } template<> +void __recaptureRegs<TYPE_OCMB_CHIP>( STEP_CODE_DATA_STRUCT & io_sc, + ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[__recaptureRegs<TYPE_OCMB_CHIP>] " + + RegDataCache & cache = RegDataCache::getCachedRegisters(); + CaptureData & cd = io_sc.service_data->GetCaptureData(); + + // refresh and recapture the ocmb registers + const char * ocmbRegs[] = + { + "MCBISTFIR", "RDFFIR", "MBSEC0", "MBSEC1", "OCMB_MBSSYMEC0", + "OCMB_MBSSYMEC1", "OCMB_MBSSYMEC2", "OCMB_MBSSYMEC3", + "OCMB_MBSSYMEC4", "OCMB_MBSSYMEC5", "OCMB_MBSSYMEC6", + "OCMB_MBSSYMEC7", "OCMB_MBSSYMEC8", "MBSMSEC", "MCBMCAT", + }; + + for ( uint32_t i = 0; i < sizeof(ocmbRegs)/sizeof(char*); i++ ) + { + SCAN_COMM_REGISTER_CLASS * reg = + i_chip->getRegister( ocmbRegs[i] ); + cache.flush( i_chip, reg ); + } + + i_chip->CaptureErrorData( cd, Util::hashString("MaintCmdRegs_ocmb") ); + + #undef PRDF_FUNC +} + +template<> void __recaptureRegs<TYPE_MBA>( STEP_CODE_DATA_STRUCT & io_sc, ExtensibleChip * i_chip ) { @@ -283,7 +313,7 @@ uint32_t MemTdCtlr<T>::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) PRDF_TRAC( PRDF_FUNC "Calling resumeBgScrub<T>(0x%08x)", iv_chip->getHuid() ); - o_rc = resumeBgScrub<T>( iv_chip ); + o_rc = resumeBgScrub<T>( iv_chip, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "resumeBgScrub<T>(0x%08x) failed", @@ -358,9 +388,48 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip, uint32_t count = symData.size(); switch ( T ) { - case TYPE_MCA: 
PRDF_ASSERT( 1 <= count && count <= 2 ); break; - case TYPE_MBA: PRDF_ASSERT( 1 == count ); break; - default: PRDF_ASSERT( false ); + case TYPE_MCA: + { + PRDF_ASSERT( 1 <= count && count <= 2 ); + // Increment the CE counter and store the rank we're on, + // reset the UE and CE counts if we have stopped on a new rank. + ExtensibleChip * mcb = getConnectedParent(i_chip, TYPE_MCBIST); + McbistDataBundle * mcbdb = getMcbistDataBundle(mcb); + if ( mcbdb->iv_ceUeRank != i_addr.getRank() ) + { + mcbdb->iv_ceStopCounter.reset(); + mcbdb->iv_ueStopCounter.reset(); + } + mcbdb->iv_ceStopCounter.inc( io_sc ); + mcbdb->iv_ceUeRank = i_addr.getRank(); + + break; + } + case TYPE_MBA: + { + PRDF_ASSERT( 1 == count ); + break; + } + case TYPE_OCMB_CHIP: + { + PRDF_ASSERT( 1 <= count && count <= 2 ); + // Increment the CE counter and store the rank we're on, + // reset the UE and CE counts if we have stopped on a new rank. + OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip); + if ( ocmbdb->iv_ceUeRank != i_addr.getRank() ) + { + ocmbdb->iv_ceStopCounter.reset(); + ocmbdb->iv_ueStopCounter.reset(); + } + ocmbdb->iv_ceStopCounter.inc( io_sc ); + ocmbdb->iv_ceUeRank = i_addr.getRank(); + + break; + } + default: + { + PRDF_ASSERT( false ); + } } for ( auto & d : symData ) @@ -408,6 +477,14 @@ uint32_t __handleSoftInterCeEte<TYPE_MCA>( ExtensibleChip * i_chip, } template<> +uint32_t __handleSoftInterCeEte<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + return __handleNceEte<TYPE_OCMB_CHIP>( i_chip, i_addr, io_sc ); +} + +template<> uint32_t __handleSoftInterCeEte<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, STEP_CODE_DATA_STRUCT & io_sc ) @@ -480,6 +557,52 @@ uint32_t __handleRceEte<TYPE_MCA>( ExtensibleChip * i_chip, } template<> +uint32_t __handleRceEte<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemRank & i_rank, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC
"[__handleRceEte] " + + uint32_t o_rc = SUCCESS; + + // Should only get this attention in MNFG mode. + PRDF_ASSERT( mfgMode() ); + + do + { + // The RCE ETE attention could be from IUE, IMPE, or IRCD. Need to check + // RDFFIR[37] to determine if there was at least one IUE. + SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister( "RDFFIR" ); + o_rc = fir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on RDFFIR: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + if ( !fir->IsBitSet(37) ) break; // nothing else to do + + // Handle the IUE. + o_errorsFound = true; + io_sc.service_data->AddSignatureList( i_chip->getTrgt(), + PRDFSIG_MaintIUE ); + o_rc = MemEcc::handleMemIue<TYPE_OCMB_CHIP>( i_chip, i_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "analyzeMaintIue(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t __handleRceEte<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) @@ -698,6 +821,11 @@ template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t __checkEcc<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -786,6 +914,76 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::unmaskEccAttns() //------------------------------------------------------------------------------ template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::maskEccAttns() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::maskEccAttns] " + + uint32_t o_rc = SUCCESS; + + SCAN_COMM_REGISTER_CLASS * mask = iv_chip->getRegister( "RDFFIR_MASK_OR" ); + + mask->clearAllBits(); + mask->SetBit(8); // Mainline read NCE + mask->SetBit(9); // Mainline read TCE + + o_rc = 
mask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_OR" ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::unmaskEccAttns() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::unmaskEccAttns] " + + uint32_t o_rc = SUCCESS; + + // Memory CEs were masked at the beginning of the TD procedure, so + // clear and unmask them. Also, it is possible that memory UEs have + // thresholded so clear and unmask them as well. + + SCAN_COMM_REGISTER_CLASS * fir = iv_chip->getRegister( "RDFFIR_AND" ); + SCAN_COMM_REGISTER_CLASS * mask = iv_chip->getRegister( "RDFFIR_MASK_AND" ); + + fir->setAllBits(); mask->setAllBits(); + + // Do not unmask NCE and TCE attentions if they have been permanently + // masked due to certain TPS conditions. + if ( !(getOcmbDataBundle(iv_chip)->iv_maskMainlineNceTce) ) + { + fir->ClearBit(8); mask->ClearBit(8); // Mainline read NCE + fir->ClearBit(9); mask->ClearBit(9); // Mainline read TCE + } + fir->ClearBit(14); mask->ClearBit(14); // Mainline read UE + + o_rc = fir->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_AND" ); + } + + const uint32_t l_maskRc = mask->Write(); // keep the FIR write status in o_rc + if ( SUCCESS != l_maskRc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_AND" ); + } + + return ( SUCCESS != o_rc ) ? o_rc : l_maskRc; // first failure wins + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> uint32_t MemTdCtlr<TYPE_MBA>::maskEccAttns() { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::maskEccAttns] " @@ -887,6 +1085,13 @@ SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MCA>( ExtensibleChip * i_chip ) } template<> +SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_OCMB_CHIP>( + ExtensibleChip * i_chip ) +{ + return i_chip->getRegister( "RDFFIR_AND" ); +} + +template<> SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MBA>( ExtensibleChip * i_chip ) { ExtensibleChip *
membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); @@ -1009,6 +1214,45 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::initialize() } template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::initialize() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::initialize] " + + uint32_t o_rc = SUCCESS; + + do + { + if ( iv_initialized ) break; // nothing to do + + // Unmask the fetch attentions just in case they were masked during a + // TD procedure prior to a reset/reload. + o_rc = unmaskEccAttns(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "unmaskEccAttns() failed" ); + break; + } + + // Find all unverified chip marks. + o_rc = __findChipMarks<TYPE_OCMB_CHIP>( iv_rankList ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__findChipMarks() failed on 0x%08x", + iv_chip->getHuid() ); + break; + } + + // At this point, the TD controller is initialized. + iv_initialized = true; + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t MemTdCtlr<TYPE_MBA>::initialize() { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::initialize] " @@ -1162,6 +1406,118 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::handleRrFo() //------------------------------------------------------------------------------ template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::handleRrFo() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::handleRrFo] " + + uint32_t o_rc = SUCCESS; + + do + { + // Check if maintenance command complete attention is set. + SCAN_COMM_REGISTER_CLASS * mcbistfir = + iv_chip->getRegister("MCBISTFIR"); + o_rc = mcbistfir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MCBISTFIR"); + break; + } + + // If there is a command complete attention, nothing to do, break out. + if ( mcbistfir->IsBitSet(10) ) + break; + + + // Check if a command is not running. + // If bit 0 of MCB_CNTLSTAT is on, a mcbist run is in progress.
+ SCAN_COMM_REGISTER_CLASS * mcb_cntlstat = + iv_chip->getRegister("MCB_CNTLSTAT"); + o_rc = mcb_cntlstat->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MCB_CNTLSTAT" ); + break; + } + + // If a command is not running, set command complete attn, break. + if ( !mcb_cntlstat->IsBitSet(0) ) + { + SCAN_COMM_REGISTER_CLASS * mcbistfir_or = + iv_chip->getRegister("MCBISTFIR_OR"); + mcbistfir_or->SetBit( 10 ); + + o_rc = mcbistfir_or->Write(); // capture the status tested just below + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on MCBISTFIR_OR" ); + } + break; + } + + // Check if there are unverified chip marks. + std::vector<TdRankListEntry> vectorList = iv_rankList.getList(); + + for ( auto & entry : vectorList ) + { + ExtensibleChip * ocmbChip = entry.getChip(); + MemRank rank = entry.getRank(); + + // Get the chip mark + MemMark chipMark; + o_rc = MarkStore::readChipMark<TYPE_OCMB_CHIP>( ocmbChip, rank, + chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_OCMB_CHIP>(0x%08x,%d) " + "failed", ocmbChip->getHuid(), rank.getMaster() ); + break; + } + + if ( !chipMark.isValid() ) continue; // no chip mark present + + // Get the DQ Bitmap data. + MemDqBitmap dqBitmap; + + o_rc = getBadDqBitmap( ocmbChip->getTrgt(), rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x, %d)", + ocmbChip->getHuid(), rank.getMaster() ); + break; + } + + // Check if the chip mark is verified or not. + bool cmVerified = false; + o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark failed." ); + break; + } + + // If there are any unverified chip marks, stop the command, break.
+ if ( !cmVerified ) + { + o_rc = stopBgScrub<TYPE_OCMB_CHIP>( iv_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "stopBgScrub<TYPE_OCMB_CHIP>(0x%08x) " + "failed", iv_chip->getHuid() ); + } + break; + } + } + + } while (0); + + return o_rc; + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> uint32_t MemTdCtlr<TYPE_MBA>::handleRrFo() { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::handleRrFo] " @@ -1289,7 +1645,8 @@ uint32_t MemTdCtlr<TYPE_MBA>::handleRrFo() //------------------------------------------------------------------------------ template<> -uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume ) +uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub] " @@ -1305,21 +1662,124 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume ) // can use the stop conditions, which should be unique for background scrub, // to determine if it has been configured. - SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); - o_rc = reg->Read(); - if ( SUCCESS != o_rc ) + do { - PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", - iv_chip->getHuid() ); - } - else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH - 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH - 0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH - reg->IsBitSet(34) && // pause on MPE - reg->IsBitSet(35) ) // pause on UE + SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", + iv_chip->getHuid() ); + break; + } + // Note: The stop conditions for background scrubbing can now be + // variable depending on whether we have hit threshold for the number + // of UEs or CEs that we have stopped on on a rank. 
+ + // If we haven't hit CE or UE threshold, check the CE stop conditions + if ( !getMcbistDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) && + !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) + { + // If the stop conditions aren't set, just break out. + if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH + 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH + 0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH + { + break; + } + + } + + // If we haven't hit UE threshold yet, check the UE stop condition + if ( !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) + { + // If the stop condition isn't set, just break out + if ( !reg->IsBitSet(35) ) // pause on UE + { + break; + } + } + + // Need to check the stop on mpe stop condition regardless of whether + // we hit the UE or CE threshold. + if ( reg->IsBitSet(34) ) // pause on MPE + { + // If we reach here, all the stop conditions are set for background + // scrub, so we can resume. + o_canResume = true; + } + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub] " + + uint32_t o_rc = SUCCESS; + + o_canResume = false; + + // It is possible that we were running a TD procedure and the PRD service + // was reset. Therefore, we must check if background scrubbing was actually + // configured. There really is not a good way of doing this. A scrub command + // is a scrub command the only difference is the speed. Unfortunately, that + // speed can change depending on how the hardware team tunes it. For now, we + // can use the stop conditions, which should be unique for background scrub, + // to determine if it has been configured. 
+ + do { - o_canResume = true; - } + SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", + iv_chip->getHuid() ); + break; + } + // Note: The stop conditions for background scrubbing can now be + // variable depending on whether we have hit threshold for the number + // of UEs or CEs that we have stopped on on a rank. + + // If we haven't hit CE or UE threshold, check the CE stop conditions + if ( !getOcmbDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) && + !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) + { + // If the stop conditions aren't set, just break out. + if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH + 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH + 0xf != reg->GetBitFieldJustified(8,4)) ) // NCE hard TH + { + break; + } + + } + + // If we haven't hit UE threshold yet, check the UE stop condition + if ( !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) + { + // If the stop condition isn't set, just break out + if ( !reg->IsBitSet(35) ) // pause on UE + { + break; + } + } + + // Need to check the stop on mpe stop condition regardless of whether + // we hit the UE or CE threshold. + if ( reg->IsBitSet(34) ) // pause on MPE + { + // If we reach here, all the stop conditions are set for background + // scrub, so we can resume. + o_canResume = true; + } + }while(0); return o_rc; @@ -1327,7 +1787,8 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume ) } template<> -uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume ) +uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::canResumeBgScrub] " @@ -1365,6 +1826,7 @@ uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume ) // Avoid linker errors with the template. 
template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; +template class MemTdCtlr<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H b/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H index e61389ea2..2e833a12a 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdRankList.H @@ -80,8 +80,8 @@ class TdRankListEntry private: - ExtensibleChip * iv_chip = nullptr; ///< MCA, MBA, or MEM_PORT chip. - MemRank iv_rank = MemRank(0); ///< Any rank on the MCA/MBA/MEM_PORT + ExtensibleChip * iv_chip = nullptr; ///< MCA, MBA, or OCMB chip. + MemRank iv_rank = MemRank(0); ///< Any rank on the MCA/MBA/OCMB }; /** @@ -95,7 +95,7 @@ class TdRankList /** * @brief Constructor. - * @param MCBIST or MBA chip. + * @param MCBIST, OCMB, or MBA chip. */ explicit TdRankList( ExtensibleChip * i_chip ); @@ -191,17 +191,13 @@ inline TdRankList<TARGETING::TYPE_OCMB_CHIP>::TdRankList( PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); - ExtensibleChipList memPortChipList = getConnected( i_chip, TYPE_MEM_PORT ); - for ( auto & memPortChip : memPortChipList ) - { - std::vector<MemRank> rankList; - getSlaveRanks<TYPE_MEM_PORT>( memPortChip->getTrgt(), rankList ); - PRDF_ASSERT( !rankList.empty() ); // target configured with no ranks + std::vector<MemRank> rankList; + getSlaveRanks<TYPE_OCMB_CHIP>( i_chip->getTrgt(), rankList ); + PRDF_ASSERT( !rankList.empty() ); // target configured with no ranks - for ( auto & rank : rankList ) - { - iv_list.push_back( TdRankListEntry(memPortChip, rank) ); - } + for ( auto & rank : rankList ) + { + iv_list.push_back( TdRankListEntry(i_chip, rank) ); } } diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C index de3e62e23..64eb74648 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C @@ -5,7 
+5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -36,6 +36,8 @@ #include <prdfP9McaExtraSig.H> #include <prdfPlatServices.H> +#include <hwp_wrappers.H> + using namespace TARGETING; namespace PRDF @@ -125,6 +127,12 @@ bool __iueCheck<TYPE_MCA>( uint32_t i_eccAttns ) } template<> inline +bool __iueCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns ) +{ + return ( 0 != (i_eccAttns & MAINT_IUE) ); +} + +template<> inline bool __iueCheck<TYPE_MBA>( uint32_t i_eccAttns ) { // IUES are reported via RCE ETE on Centaur @@ -252,13 +260,15 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd() uint32_t o_rc = SUCCESS; + #ifndef CONFIG_AXONE + // We don't need to set any stop-on-error conditions or thresholds for // soft/inter/hard CEs during Memory Diagnostics. The design is to let the // command continue to the end of the rank and we do diagnostics on the // CE counts found in the per-symbol counters. Therefore, all we need to do // is tell the hardware which CE types to count. - mss::mcbist::stop_conditions stopCond; + mss::mcbist::stop_conditions<mss::mc_type::NIMBUS> stopCond; switch ( iv_phase ) { @@ -284,6 +294,8 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd() iv_chip->getHuid(), getKey() ); } + #endif + return o_rc; #undef PRDF_FUNC @@ -362,11 +374,66 @@ uint32_t TpsEvent<TYPE_MBA>::startCmd() #undef PRDF_FUNC } +//############################################################################## +// +// Specializations for OCMB +// +//############################################################################## + +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + #ifdef CONFIG_AXONE + + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs during Memory Diagnostics. 
The design is to let the + // command continue to the end of the rank and we do diagnostics on the + // CE counts found in the per-symbol counters. Therefore, all we need to do + // is tell the hardware which CE types to count. + + mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only soft/inter CEs. + stopCond.set_nce_soft_symbol_count_enable( mss::ON); + stopCond.set_nce_inter_symbol_count_enable(mss::ON); + break; + + case TD_PHASE_2: + // Set the per symbol counters to count only hard CEs. + stopCond.set_nce_hard_symbol_count_enable(mss::ON); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub<TYPE_OCMB_CHIP>(iv_chip, iv_rank, SLAVE_RANK, stopCond); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + + #endif + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ // Avoid linker errors with the template. 
template class TpsEvent<TYPE_MCA>; template class TpsEvent<TYPE_MBA>; +template class TpsEvent<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 187b9b28d..8b3b220c6 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -37,6 +37,8 @@ #include <prdfP9McaExtraSig.H> #include <prdfTargetServices.H> +#include <hwp_wrappers.H> + using namespace TARGETING; namespace PRDF @@ -54,6 +56,13 @@ static const char *mcbCeStatReg[CE_REGS_PER_PORT] = "MCB_MBSSYMEC6", "MCB_MBSSYMEC7", "MCB_MBSSYMEC8" }; +static const char *ocmbCeStatReg[CE_REGS_PER_PORT] = + { + "OCMB_MBSSYMEC0", "OCMB_MBSSYMEC1", "OCMB_MBSSYMEC2", + "OCMB_MBSSYMEC3", "OCMB_MBSSYMEC4", "OCMB_MBSSYMEC5", + "OCMB_MBSSYMEC6", "OCMB_MBSSYMEC7", "OCMB_MBSSYMEC8" + }; + //------------------------------------------------------------------------------ template <TARGETING::TYPE T> @@ -66,6 +75,13 @@ TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip ) } template<> +TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_OCMB_CHIP>( + ExtensibleChip * i_chip ) +{ + return getOcmbDataBundle(i_chip)->getTpsFalseAlarmCounter(); +} + +template<> TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) { return getMbaDataBundle(i_chip)->getTpsFalseAlarmCounter(); @@ -73,6 +89,23 @@ TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) //------------------------------------------------------------------------------ +template <TARGETING::TYPE T> +void __maskMainlineNceTces( ExtensibleChip * i_chip ); + +template<> +void __maskMainlineNceTces<TYPE_MCA>( ExtensibleChip * i_chip ) +{ + getMcaDataBundle(i_chip)->iv_maskMainlineNceTce = true; +} + +template<> +void __maskMainlineNceTces<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ) +{ + 
getOcmbDataBundle(i_chip)->iv_maskMainlineNceTce = true; +} + +//------------------------------------------------------------------------------ + template<TARGETING::TYPE T> void __getNextPhase( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, @@ -98,12 +131,7 @@ void __getNextPhase( ExtensibleChip * i_chip, const MemRank & i_rank, //------------------------------------------------------------------------------ template<TARGETING::TYPE T> -bool __badDqCount( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_badDqCount ); - -template<> -bool __badDqCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_badDqCount ) +bool __badDqCount(MemUtils::MaintSymbols i_nibbleStats, CeCount & io_badDqCount) { bool badDqFound = false; @@ -142,11 +170,7 @@ bool __badDqCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, template<TARGETING::TYPE T> bool __badChipCount( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_badChipCount ); - -template<> -bool __badChipCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_badChipCount ) + CeCount & io_badChipCount ) { bool badChipFound = false; uint8_t nonZeroCount = 0; @@ -191,11 +215,7 @@ bool __badChipCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, template<TARGETING::TYPE T> void __sumAboveOneCount( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_sumAboveOneCount ); - -template<> -void __sumAboveOneCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_sumAboveOneCount ) + CeCount & io_sumAboveOneCount ) { uint8_t sum = 0; MemUtils::MaintSymbols symList; @@ -226,11 +246,7 @@ void __sumAboveOneCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, template<TARGETING::TYPE T> void __singleSymbolCount( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_singleSymCount ); - -template<> -void __singleSymbolCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, - CeCount & io_singleSymCount ) + CeCount & io_singleSymCount ) { uint8_t count = 
0; bool multNonZeroSyms = false; @@ -315,12 +331,12 @@ uint32_t __updateVpdSumAboveOne( CeCount i_sumAboveOneCount, //------------------------------------------------------------------------------ -template <> -uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template <TARGETING::TYPE T> +uint32_t TpsEvent<T>::analyzeEccErrors( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[TpsEvent<TYPE_MCA>::analyzeEccErrors] " + #define PRDF_FUNC "[TpsEvent<T>::analyzeEccErrors] " uint32_t o_rc = SUCCESS; @@ -338,7 +354,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, // At this point we don't actually have an address for the UE. The // best we can do is get the address in which the command stopped. MemAddr addr; - o_rc = getMemMaintAddr<TYPE_MCA>( iv_chip, addr ); + o_rc = getMemMaintAddr<T>( iv_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -346,8 +362,8 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, break; } - o_rc = MemEcc::handleMemUe<TYPE_MCA>( iv_chip, addr, - UE_TABLE::SCRUB_UE, io_sc ); + o_rc = MemEcc::handleMemUe<T>( iv_chip, addr, + UE_TABLE::SCRUB_UE, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed", @@ -357,7 +373,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, // Because of the UE, any further TPS requests will likely have no // effect. So ban all subsequent requests. 
- MemDbUtils::banTps<TYPE_MCA>( iv_chip, addr.getRank() ); + MemDbUtils::banTps<T>( iv_chip, addr.getRank() ); // Abort this procedure because additional repairs will likely // not help (also avoids complication of having UE and MPE at @@ -371,7 +387,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_MaintIUE ); - o_rc = MemEcc::handleMemIue<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + o_rc = MemEcc::handleMemIue<T>( iv_chip, iv_rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,0x%02x) failed", @@ -397,8 +413,8 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_MaintMPE ); - o_rc = MemEcc::handleMpe<TYPE_MCA>( iv_chip, iv_rank, - UE_TABLE::SCRUB_MPE, io_sc ); + o_rc = MemEcc::handleMpe<T>( iv_chip, iv_rank, + UE_TABLE::SCRUB_MPE, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMpe<T>(0x%08x, 0x%02x) failed", @@ -419,36 +435,51 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, } +template +uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); +template +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeEccErrors(const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done); + //------------------------------------------------------------------------------ -template<> -uint32_t TpsEvent<TYPE_MCA>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) +template<TARGETING::TYPE T> +uint32_t TpsEvent<T>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) { io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_TpsFalseAlarm ); // Increase false alarm counter and check threshold. 
- if ( __getTpsFalseAlarmCounter<TYPE_MCA>(iv_chip)->inc( iv_rank, io_sc) ) + if ( __getTpsFalseAlarmCounter<T>(iv_chip)->inc( iv_rank, io_sc) ) { io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_TpsFalseAlarmTH ); // Permanently mask mainline NCEs and TCEs - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } return SUCCESS; } +template +uint32_t TpsEvent<TYPE_MCA>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t TpsEvent<TYPE_OCMB_CHIP>::handleFalseAlarm( + STEP_CODE_DATA_STRUCT & io_sc ); + //------------------------------------------------------------------------------ -template<> -uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, +template<TARGETING::TYPE T> +uint32_t TpsEvent<T>::analyzeCeSymbolCounts( CeCount i_badDqCount, CeCount i_badChipCount, CeCount i_sumAboveOneCount, CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts] " + #define PRDF_FUNC "[TpsEvent<T>::analyzeCeSymbolCounts] " uint32_t o_rc = SUCCESS; @@ -457,33 +488,33 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, bool tpsFalseAlarm = false; // Get the Bad DQ Bitmap. - TargetHandle_t mcaTrgt = iv_chip->getTrgt(); + TargetHandle_t trgt = iv_chip->getTrgt(); MemDqBitmap dqBitmap; - o_rc = getBadDqBitmap( mcaTrgt, iv_rank, dqBitmap ); + o_rc = getBadDqBitmap( trgt, iv_rank, dqBitmap ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x, 0x%02x) failed", - getHuid(mcaTrgt), iv_rank.getKey() ); + getHuid(trgt), iv_rank.getKey() ); break; } // Get the symbol mark. 
MemMark symMark; - o_rc = MarkStore::readSymbolMark<TYPE_MCA>( iv_chip, iv_rank, symMark ); + o_rc = MarkStore::readSymbolMark<T>( iv_chip, iv_rank, symMark ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "readSymbolMark<TYPE_MCA>(0x%08x, 0x%02x) " + PRDF_ERR( PRDF_FUNC "readSymbolMark<T>(0x%08x, 0x%02x) " "failed", iv_chip->getHuid(), iv_rank.getKey() ); break; } // Get the chip mark. MemMark chipMark; - o_rc = MarkStore::readChipMark<TYPE_MCA>( iv_chip, iv_rank, chipMark ); + o_rc = MarkStore::readChipMark<T>( iv_chip, iv_rank, chipMark ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_MCA>(0x%08x, 0x%02x) " + PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x, 0x%02x) " "failed", iv_chip->getHuid(), iv_rank.getKey() ); break; } @@ -512,9 +543,9 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, // TCE. Both are still correctable after a symbol mark // is placed. // Place a symbol mark on this bad DQ. - MemMark newSymMark( mcaTrgt, iv_rank, + MemMark newSymMark( trgt, iv_rank, i_badDqCount.symList[0].symbol ); - o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip, + o_rc = MarkStore::writeSymbolMark<T>( iv_chip, iv_rank, newSymMark ); if ( SUCCESS != o_rc ) { @@ -552,7 +583,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, io_sc.service_data->setServiceCall(); // Permanently mask mainline NCEs and TCEs. - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } } else @@ -566,7 +597,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, else if ( 2 == i_badDqCount.count && 0 == i_badChipCount.count ) { // Permanently mask mainline NCEs and TCEs. - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); // If the symbol mark is available. 
if ( !symMark.isValid() ) @@ -587,9 +618,9 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, highSym = sym; } - MemMark newSymMark( mcaTrgt, iv_rank, + MemMark newSymMark( trgt, iv_rank, highSym.symbol ); - o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip, + o_rc = MarkStore::writeSymbolMark<T>( iv_chip, iv_rank, newSymMark ); if ( SUCCESS != o_rc ) { @@ -669,10 +700,10 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, // This means we have only one more potential bad DQ, which // is still correctable after a chip mark is placed. // Place a chip mark on this bad chip. - MemMark newChipMark( mcaTrgt, iv_rank, + MemMark newChipMark( trgt, iv_rank, i_badChipCount.symList[0].symbol ); - o_rc = MarkStore::writeChipMark<TYPE_MCA>( iv_chip, iv_rank, - newChipMark ); + o_rc = MarkStore::writeChipMark<T>( iv_chip, iv_rank, + newChipMark ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) " @@ -708,7 +739,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, io_sc.service_data->setServiceCall(); // Permanently mask mainline NCEs and TCEs - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } } else @@ -731,7 +762,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, io_sc.service_data->setServiceCall(); // Permanently mask mainline NCEs and TCEs - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } // If the chip mark is available. if ( !chipMark.isValid() ) @@ -742,10 +773,10 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, // This means we have no more potential bad DQ or bad chips // since we can't correct those after chip mark is placed. // Place a chip mark on the bad chip. 
- MemMark newChipMark( mcaTrgt, iv_rank, + MemMark newChipMark( trgt, iv_rank, i_badChipCount.symList[0].symbol ); - o_rc = MarkStore::writeChipMark<TYPE_MCA>( iv_chip, iv_rank, - newChipMark ); + o_rc = MarkStore::writeChipMark<T>( iv_chip, iv_rank, + newChipMark ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) " @@ -763,8 +794,8 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, // this chip mark, we need to clear the symbol mark now // instead of at the end of the function to make room // for the additional symbol mark. - o_rc = MarkStore::clearSymbolMark<TYPE_MCA>( iv_chip, - iv_rank ); + o_rc = MarkStore::clearSymbolMark<T>( iv_chip, + iv_rank ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "MarkStore::clearSymbolMark(" @@ -810,7 +841,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, io_sc.service_data->setServiceCall(); // Permanently mask mainline NCEs and TCEs. - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } } // If the symbol mark is available. @@ -822,9 +853,9 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, // This means we have no more potential bad DQ or bad chips // since we can't correct those after symbol mark is placed. // Place a symbol mark on this bad DQ. - MemMark newSymMark( mcaTrgt, iv_rank, + MemMark newSymMark( trgt, iv_rank, i_badDqCount.symList[0].symbol ); - o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip, + o_rc = MarkStore::writeSymbolMark<T>( iv_chip, iv_rank, newSymMark ); if ( SUCCESS != o_rc ) { @@ -865,7 +896,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, io_sc.service_data->setServiceCall(); // Permanently mask mainline NCEs and TCEs. 
- getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } } @@ -888,7 +919,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, io_sc.service_data->setServiceCall(); // Permanently mask mainline NCEs and TCEs. - getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + __maskMainlineNceTces<T>( iv_chip ); } // If analysis resulted in a false alarm. @@ -903,18 +934,18 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, } // Write any updates to VPD. - o_rc = setBadDqBitmap( mcaTrgt, iv_rank, dqBitmap ); + o_rc = setBadDqBitmap( trgt, iv_rank, dqBitmap ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "setBadDqBitmap(0x%08x, 0x%02x) failed", - getHuid(mcaTrgt), iv_rank.getKey() ); + getHuid(trgt), iv_rank.getKey() ); break; } // We may have placed a chip mark so do any necessary cleanup. This must // be called after writing the bad DQ bitmap because the this function // will also write it if necessary. 
- o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + o_rc = MarkStore::chipMarkCleanup<T>( iv_chip, iv_rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "MarkStore::chipMarkCleanup(0x%08x,0x%02x) " @@ -929,6 +960,15 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, #undef PRDF_FUNC } +template +uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, + CeCount i_badChipCount, CeCount i_sumAboveOneCount, + CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeCeSymbolCounts( CeCount i_badDqCount, + CeCount i_badChipCount, CeCount i_sumAboveOneCount, + CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ); + //------------------------------------------------------------------------------ template<> @@ -1031,11 +1071,110 @@ uint32_t TpsEvent<TYPE_MCA>::getSymbolCeCounts( CeCount & io_badDqCount, //------------------------------------------------------------------------------ -template <> -uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::getSymbolCeCounts( CeCount & io_badDqCount, + CeCount & io_badChipCount, CeCount & io_sumAboveOneCount, + CeCount & io_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[TpsEvent<TYPE_OCMB_CHIP>::getSymbolCeCounts] " + + uint32_t o_rc = SUCCESS; + + do + { + // Get the Bad DQ Bitmap. 
+ TargetHandle_t ocmbTrgt = iv_chip->getTrgt(); + MemDqBitmap dqBitmap; + + o_rc = getBadDqBitmap( ocmbTrgt, iv_rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x,%d) failed", + getHuid(ocmbTrgt), iv_rank.getMaster() ); + break; + } + std::vector<MemSymbol> bmSymList = dqBitmap.getSymbolList(); + + const char * reg_str = nullptr; + SCAN_COMM_REGISTER_CLASS * reg = nullptr; + + for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_PORT; regIdx++ ) + { + reg_str = ocmbCeStatReg[regIdx]; + reg = iv_chip->getRegister( reg_str ); + + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s.", reg_str ); + break; + } + uint8_t baseSymbol = SYMBOLS_PER_CE_REG * regIdx; + + for ( uint8_t i = 0; i < SYMBOLS_PER_CE_REG; + i += MEM_SYMBOLS_PER_NIBBLE ) + { + MemUtils::MaintSymbols nibbleStats; + + // Get a nibble's worth of symbols. + for ( uint8_t n = 0; n < MEM_SYMBOLS_PER_NIBBLE; n++ ) + { + uint8_t sym = baseSymbol + (i+n); + PRDF_ASSERT( sym < SYMBOLS_PER_RANK ); + + MemUtils::SymbolData symData; + symData.symbol = MemSymbol::fromSymbol( ocmbTrgt, iv_rank, + sym, CEN_SYMBOL::ODD_SYMBOL_DQ ); + if ( !symData.symbol.isValid() ) + { + PRDF_ERR( PRDF_FUNC "MemSymbol() failed: symbol=%d", + sym ); + o_rc = FAIL; + break; + } + + // Any symbol set in the DRAM repairs VPD will have an + // automatic CE count of 0xFF + if ( std::find( bmSymList.begin(), bmSymList.end(), + symData.symbol ) != bmSymList.end() ) + symData.count = 0xFF; + else + symData.count = reg->GetBitFieldJustified(((i+n)*8), 8); + + nibbleStats.push_back( symData ); + + // Add all symbols with non-zero counts to the callout list. + if ( symData.count != 0 ) + { + MemoryMru mm { ocmbTrgt, iv_rank, symData.symbol }; + io_sc.service_data->SetCallout( mm ); + } + } + if ( SUCCESS != o_rc ) break; + + // Analyze the nibble of symbols. 
+ __analyzeNibbleSyms<TYPE_OCMB_CHIP>( nibbleStats, io_badDqCount, + io_badChipCount, io_sumAboveOneCount, io_singleSymCount ); + + } + if ( SUCCESS != o_rc ) break; + } + + }while(0); + + return o_rc; + + #undef PRDF_FUNC + +} + +//------------------------------------------------------------------------------ + +template <TARGETING::TYPE T> +uint32_t TpsEvent<T>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[TpsEvent<TYPE_MCA>::analyzeCeStats] " + #define PRDF_FUNC "[TpsEvent<T>::analyzeCeStats] " uint32_t o_rc = SUCCESS; @@ -1086,11 +1225,18 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc, } +template +uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); +template +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeCeStats(STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done); + //------------------------------------------------------------------------------ -template<> -uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<TARGETING::TYPE T> +uint32_t TpsEvent<T>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { #define PRDF_FUNC "[TpsEvent::analyzePhase] " @@ -1102,11 +1248,11 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, // Analyze Ecc Attentions uint32_t eccAttns; - o_rc = checkEccFirs<TYPE_MCA>( iv_chip, eccAttns ); + o_rc = checkEccFirs<T>( iv_chip, eccAttns ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "checkEccFirs(0x%08x) failed", - iv_chip->getHuid() ); + iv_chip->getHuid() ); break; } @@ -1135,7 +1281,7 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, if ( (SUCCESS == o_rc) && o_done ) { // Clear the ECC FFDC for this master rank. 
- MemDbUtils::resetEccFfdc<TYPE_MCA>( iv_chip, iv_rank, SLAVE_RANK ); + MemDbUtils::resetEccFfdc<T>( iv_chip, iv_rank, SLAVE_RANK ); } return o_rc; @@ -1143,6 +1289,36 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, #undef PRDF_FUNC } +template +uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); +template +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); + +//------------------------------------------------------------------------------ + +template<TARGETING::TYPE T> +uint32_t TpsEvent<T>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) +{ + uint32_t signature = 0; + + __getNextPhase<T>( iv_chip, iv_rank, io_sc, iv_phase, signature ); + + PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x", + iv_phase, iv_chip->getHuid(), getKey() ); + + io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature ); + + return startCmd(); +} + +template +uint32_t TpsEvent<TYPE_MCA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t TpsEvent<TYPE_OCMB_CHIP>::startNextPhase( + STEP_CODE_DATA_STRUCT & io_sc ); + //############################################################################## // // Specializations for MCA @@ -1156,13 +1332,15 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd() uint32_t o_rc = SUCCESS; + #ifndef CONFIG_AXONE + // We don't need to set any stop-on-error conditions or thresholds for // soft/inter/hard CEs at runtime. The design is to let the command continue // to the end of the rank and we do diagnostics on the CE counts found in // the per-symbol counters. Therefore, all we need to do is tell the // hardware which CE types to count. 
- mss::mcbist::stop_conditions stopCond; + mss::mcbist::stop_conditions<mss::mc_type::NIMBUS> stopCond; switch ( iv_phase ) { @@ -1190,26 +1368,67 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd() iv_chip->getHuid(), getKey() ); } + #endif + return o_rc; #undef PRDF_FUNC } -//------------------------------------------------------------------------------ +//############################################################################## +// +// Specializations for OCMB +// +//############################################################################## template<> -uint32_t TpsEvent<TYPE_MCA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) +uint32_t TpsEvent<TYPE_OCMB_CHIP>::startCmd() { - uint32_t signature = 0; + #define PRDF_FUNC "[TpsEvent::startCmd] " - __getNextPhase<TYPE_MCA>( iv_chip, iv_rank, io_sc, iv_phase, signature ); + uint32_t o_rc = SUCCESS; - PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x", - iv_phase, iv_chip->getHuid(), getKey() ); + #ifdef CONFIG_AXONE - io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature ); + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs at runtime. The design is to let the command continue + // to the end of the rank and we do diagnostics on the CE counts found in + // the per-symbol counters. Therefore, all we need to do is tell the + // hardware which CE types to count. - return startCmd(); + mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only hard CEs. + stopCond.set_nce_hard_symbol_count_enable(mss::ON); + break; + + case TD_PHASE_2: + // Since there are not enough hard CEs to trigger a symbol mark, set + // the per symbol counters to count all CE types. 
+ stopCond.set_nce_soft_symbol_count_enable( mss::ON); + stopCond.set_nce_inter_symbol_count_enable(mss::ON); + stopCond.set_nce_hard_symbol_count_enable( mss::ON); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub<TYPE_OCMB_CHIP>(iv_chip, iv_rank, SLAVE_RANK, stopCond); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + + #endif + + return o_rc; + + #undef PRDF_FUNC } //############################################################################## diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C index 8c3c4480a..784306baf 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2018 */ +/* Contributors Listed Below - COPYRIGHT 2018,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -30,6 +30,8 @@ // Platform includes #include <prdfCenMbaExtraSig.H> +#include <hwp_wrappers.H> + using namespace TARGETING; namespace PRDF @@ -39,41 +41,16 @@ using namespace PlatServices; //############################################################################## // -// Specializations for MCA +// Generic Specializations // //############################################################################## -template<> -uint32_t VcmEvent<TYPE_MCA>::startCmd() +template<TARGETING::TYPE T> +uint32_t VcmEvent<T>::handlePhaseComplete( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[VcmEvent::startCmd] " - - uint32_t o_rc = SUCCESS; - - // No stop conditions. - mss::mcbist::stop_conditions stopCond; - - // Start the time based scrub procedure on this master rank. 
- o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, MASTER_RANK, stopCond ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", - iv_chip->getHuid(), getKey() ); - } - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -template<> -uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) -{ - #define PRDF_FUNC "[VcmEvent<TYPE_MCA>::handlePhaseComplete] " + #define PRDF_FUNC "[VcmEvent<T>::handlePhaseComplete] " uint32_t o_rc = SUCCESS; @@ -100,6 +77,49 @@ uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +template +uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); +template +uint32_t VcmEvent<TYPE_OCMB_CHIP>::handlePhaseComplete( + const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); + +//############################################################################## +// +// Specializations for MCA +// +//############################################################################## + +template<> +uint32_t VcmEvent<TYPE_MCA>::startCmd() +{ + #define PRDF_FUNC "[VcmEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + #ifndef CONFIG_AXONE + + // No stop conditions. + mss::mcbist::stop_conditions<mss::mc_type::NIMBUS> stopCond; + + // Start the time based scrub procedure on this master rank. 
+ o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, MASTER_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + + #endif + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MBA @@ -448,6 +468,40 @@ uint32_t VcmEvent<TYPE_MBA>::handlePhaseComplete( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +//############################################################################## +// +// Specializations for OCMB +// +//############################################################################## + +template<> +uint32_t VcmEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[VcmEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + #ifdef CONFIG_AXONE + + // No stop conditions. + mss::mcbist::stop_conditions<mss::mc_type::EXPLORER> stopCond; + + // Start the time based scrub procedure on this master rank. + o_rc = startTdScrub<TYPE_OCMB_CHIP>( iv_chip, iv_rank, MASTER_RANK, + stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + + #endif + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H index b319f910b..c712d6aa3 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -342,6 +342,9 @@ class VcmEvent : public TdEntry #ifdef __HOSTBOOT_RUNTIME template<> uint32_t VcmEvent<TARGETING::TYPE_MCA>::cleanup(STEP_CODE_DATA_STRUCT & io_sc); +template<> +uint32_t VcmEvent<TARGETING::TYPE_OCMB_CHIP>::cleanup( + STEP_CODE_DATA_STRUCT & io_sc); #endif template<> diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C index 26ef1d727..5ffa9a84b 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -92,6 +92,12 @@ bool __iueCheck<TYPE_MCA>( uint32_t i_eccAttns ) } template<> inline +bool __iueCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns ) +{ + return ( 0 != (i_eccAttns & MAINT_IUE) ); +} + +template<> inline bool __iueCheck<TYPE_MBA>( uint32_t i_eccAttns ) { // IUES are reported via RCE ETE on Centaur @@ -218,6 +224,7 @@ uint32_t VcmEvent<TYPE_MBA>::startCmd() // Avoid linker errors with the template. template class VcmEvent<TYPE_MCA>; template class VcmEvent<TYPE_MBA>; +template class VcmEvent<TYPE_OCMB_CHIP>; } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C index ca4de8e5a..e64227996 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -55,6 +55,12 @@ VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip ) } template<> +VcmFalseAlarm * __getFalseAlarmCounter<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip) +{ + return getOcmbDataBundle(i_chip)->getVcmFalseAlarmCounter(); +} + +template<> VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) { return getMbaDataBundle(i_chip)->getVcmFalseAlarmCounter(); @@ -62,16 +68,16 @@ VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) //############################################################################## // -// Specializations for MCA +// Generic Specializations // //############################################################################## -template<> -uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, - STEP_CODE_DATA_STRUCT & io_sc, - bool & o_done ) +template<TARGETING::TYPE T> +uint32_t VcmEvent<T>::checkEcc( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) { - #define PRDF_FUNC "[VcmEvent<TYPE_MCA>::checkEcc] " + #define PRDF_FUNC "[VcmEvent<T>::checkEcc] " uint32_t o_rc = SUCCESS; @@ -88,7 +94,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, // At this point we don't actually have an address for the UE. The // best we can do is get the address in which the command stopped. MemAddr addr; - o_rc = getMemMaintAddr<TYPE_MCA>( iv_chip, addr ); + o_rc = getMemMaintAddr<T>( iv_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -96,7 +102,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, break; } - o_rc = MemEcc::handleMemUe<TYPE_MCA>( iv_chip, addr, + o_rc = MemEcc::handleMemUe<T>( iv_chip, addr, UE_TABLE::SCRUB_UE, io_sc ); if ( SUCCESS != o_rc ) { @@ -107,7 +113,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, // Because of the UE, any further TPS requests will likely have no // effect. 
So ban all subsequent requests. - MemDbUtils::banTps<TYPE_MCA>( iv_chip, addr.getRank() ); + MemDbUtils::banTps<T>( iv_chip, addr.getRank() ); // Leave the mark in place and abort this procedure. o_done = true; break; @@ -118,7 +124,7 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_MaintIUE ); - o_rc = MemEcc::handleMemIue<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + o_rc = MemEcc::handleMemIue<T>( iv_chip, iv_rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,0x%02x) failed", @@ -143,6 +149,14 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +template +uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); +template +uint32_t VcmEvent<TYPE_OCMB_CHIP>::checkEcc( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ); //------------------------------------------------------------------------------ @@ -180,6 +194,41 @@ uint32_t VcmEvent<TYPE_MCA>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) #undef PRDF_FUNC } +template<> +uint32_t VcmEvent<TYPE_OCMB_CHIP>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::cleanup] " + + uint32_t o_rc = SUCCESS; + + do + { + o_rc = MarkStore::chipMarkCleanup<TYPE_OCMB_CHIP>( iv_chip, iv_rank, + io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", + iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // The cleanup() function is called by both verified() and falseAlarm(). + // In either case, the error log should be predictive if there has been + // at least one false alarm on any DRAM on this rank other than this + // DRAM. This is required on Nimbus because of two symbol correction, + // which does not exist on Centaur. 
+ VcmFalseAlarm * faCntr =__getFalseAlarmCounter<TYPE_OCMB_CHIP>(iv_chip); + uint8_t dram = iv_mark.getSymbol().getDram(); + if ( faCntr->queryDrams(iv_rank, dram, io_sc) ) + io_sc.service_data->setServiceCall(); + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MBA @@ -386,6 +435,7 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) // Avoid linker errors with the template. template class VcmEvent<TYPE_MCA>; template class VcmEvent<TYPE_MBA>; +template class VcmEvent<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 5f7efa274..fac29fce3 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -27,7 +27,6 @@ #include <iipServiceDataCollector.h> #include <prdfExtensibleChip.H> #include <prdfPluginMap.H> -#include <isteps/nvdimm/nvdimm.H> // Platform includes #include <prdfMemDbUtils.H> @@ -38,6 +37,10 @@ #include <prdfMemTps.H> #endif +#ifdef CONFIG_NVDIMM + #include <nvdimm.H> +#endif + using namespace TARGETING; namespace PRDF @@ -296,18 +299,9 @@ PRDF_PLUGIN_DEFINE( nimbus_mca, MemPortFailure ); // //############################################################################## +#ifdef CONFIG_NVDIMM #ifdef __HOSTBOOT_RUNTIME -enum nvdimmRegOffset -{ - NVDIMM_MGT_CMD1 = 0x041, - MODULE_HEALTH = 0x0A0, - MODULE_HEALTH_STATUS0 = 0x0A1, - MODULE_HEALTH_STATUS1 = 0x0A2, - ERROR_THRESHOLD_STATUS = 0x0A5, - WARNING_THRESHOLD_STATUS = 0x0A7, -}; - /** * @brief Gets a map list of which bits are set from a uint8_t bit list (7:0) * @param i_data uint8_t bit list (7:0) @@ -349,6 +343,7 @@ uint32_t __addBpmCallout( TargetHandle_t i_dimm, break; } + // addPartCallout will default to GARD_NULL, NO_DECONFIG mainErrl->addPartCallout( i_dimm, HWAS::BPM_PART_TYPE, 
i_priority ); @@ -362,10 +357,12 @@ uint32_t __addBpmCallout( TargetHandle_t i_dimm, /** * @brief Adds a callout of the cable connecting an NVDIMM to its * backup power module (BPM) + * @param i_dimm The target dimm. * @param i_priority The callout priority. * @return FAIL if unable to get the global error log, else SUCCESS */ -uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority ) +uint32_t __addNvdimmCableCallout( TargetHandle_t i_dimm, + HWAS::callOutPriority i_priority ) { #define PRDF_FUNC "[__addNvdimmCableCallout] " @@ -382,7 +379,9 @@ uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority ) break; } - mainErrl->addProcedureCallout( HWAS::EPUB_PRC_NVDIMM_ERR, i_priority ); + // addPartCallout will default to GARD_NULL, NO_DECONFIG + mainErrl->addPartCallout( i_dimm, HWAS::BPM_CABLE_PART_TYPE, + i_priority ); }while(0); @@ -391,21 +390,45 @@ uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority ) #undef PRDF_FUNC } +/** + * @brief If a previous error has been found, add a signature to the + * multi-signature list, else set the primary signature. + * @param io_sc The step code data struct. + * @param i_trgt The target. + * @param i_errFound Whether an error has already been found or not. + * @param i_sig The signature to be set. + */ +void __addSignature( STEP_CODE_DATA_STRUCT & io_sc, TargetHandle_t i_trgt, + bool i_errFound, uint32_t i_sig ) +{ + if ( i_errFound ) + { + io_sc.service_data->AddSignatureList( i_trgt, i_sig ); + } + else + { + io_sc.service_data->setSignature( getHuid(i_trgt), i_sig ); + } +} /** * @brief Analyze NVDIMM Health Status0 Register for errors - * @param io_sc The step code data struct. - * @param i_dimm The target dimm. + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. 
* @return FAIL if unable to read register, else SUCCESS */ -uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm ) +uint32_t __analyzeHealthStatus0Reg(STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, bool & io_errFound) { #define PRDF_FUNC "[__analyzeHealthStatus0Reg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); + do { // NVDIMM health status registers size = 1 byte @@ -413,7 +436,7 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, // Read the Health Status0 Register (0xA1) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH_STATUS0) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH_STATUS0) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Health Status0 Register. " @@ -427,58 +450,66 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 0: Voltage Regulator Fail if ( bitList.count(0) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VoltRegFail ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VoltRegFail ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 1: VDD Lost if ( bitList.count(1) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VddLost ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VddLost ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 2: VPP Lost if ( bitList.count(2) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VppLost ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VppLost ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + 
io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 3: VTT Lost if ( bitList.count(3) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VttLost ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VttLost ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 4: DRAM not Self Refresh if ( bitList.count(4) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NotSelfRefr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotSelfRefr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 5: Controller HW Error if ( bitList.count(5) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_CtrlHwErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_CtrlHwErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 6: NVM Controller Error if ( bitList.count(6) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NvmCtrlErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmCtrlErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 7: NVM Lifetime Error if ( bitList.count(7) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NvmLifeErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } }while(0); @@ -491,18 +522,22 @@ uint32_t 
__analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, /** * @brief Analyze NVDIMM Health Status1 Register for errors - * @param io_sc The step code data struct. - * @param i_dimm The target dimm. + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. * @return FAIL if unable to read register, else SUCCESS */ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm ) + TargetHandle_t i_dimm, bool & io_errFound ) { #define PRDF_FUNC "[__analyzeHealthStatus1Reg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); + do { // NVDIMM health status registers size = 1 byte @@ -510,7 +545,7 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, // Read the Health Status1 Register (0xA2) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH_STATUS1) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH_STATUS1) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Health Status1 Register. 
" @@ -524,83 +559,90 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 0: Insufficient Energy if ( bitList.count(0) ) { - io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_InsuffEnergy); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_InsuffEnergy ); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 1: Invalid Firmware if ( bitList.count(1) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_InvFwErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_InvFwErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 2: Configuration Data Error if ( bitList.count(2) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_CnfgDataErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_CnfgDataErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 3: No Energy Source if ( bitList.count(3) ) { - io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_NoEsPres); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NoEsPres ); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard 
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 4: Energy Policy Not Set if ( bitList.count(4) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_EsPolNotSet ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsPolNotSet ); // Callout FW (Level2 Support) High io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD ); // Callout NVDIMM low on 1st, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 5: Energy Source HW Error if ( bitList.count(5) ) { - io_sc.service_data->AddSignatureList ( i_dimm, PRDFSIG_EsHwFail ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsHwFail ); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 6: Energy Source Health Assessment Error if ( bitList.count(6) ) { - io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_EsHlthAssess); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsHlthAssess); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 7: Reserved @@ -613,18 +655,105 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, } /** + * @brief Reads and merges the data from two ES_TEMP registers to get the + * correct temperature format. 
+ * @param i_dimm The target nvdimm. + * @param i_tempMsbReg The address of the register that contains the most + * significant byte of the temperature data. + * @param i_tempLsbReg The address of the register that contains the least + * significant byte of the temperature data. + * @param o_tempData The 16 bit temperature data. + * @return FAIL if unable to read register, else SUCCESS + */ +uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg, + uint16_t i_tempLsbReg, uint16_t & o_tempData ) +{ + #define PRDF_FUNC "[__readTemp] " + + /* + * -NOTE: Example showing how to read the temperature format: + * ES_TEMP1 = 0x03 (MSB: bits 15-8) + * ES_TEMP0 = 0x48 (LSB: bits 7-0) + * + * 0x0348 = 0000 0011 0100 1000 = 52.5 C + * + * -NOTE: bit definition: + * [15:13]Reserved + * [12]Sign 0 = positive, 1 = negative; 0°C should be expressed as positive + * [11] 128°C + * [10] 64°C + * [9] 32°C + * [8] 16°C + * [7] 8°C + * [6] 4°C + * [5] 2°C + * [4] 1°C + * [3] 0.5°C + * [2] 0.25°C + * [1] 0.125°C Optional for temp fields; not used for temp th fields + * [0]0.0625°C Optional for temp fields; not used for temp th fields + */ + uint32_t o_rc = SUCCESS; + + do + { + // NVDIMM health status registers size = 1 byte + size_t NVDIMM_SIZE = 1; + uint8_t msbData = 0; + uint8_t lsbData = 0; + + // Read the two inputted temperature registers. + errlHndl_t errl = deviceRead( i_dimm, &msbData, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_tempMsbReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature MSB Register. " + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + errl = deviceRead( i_dimm, &lsbData, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_tempLsbReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature LSB Register. 
" + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + o_tempData = ((uint16_t)msbData << 8) | lsbData; + + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +/** * @brief Analyze NVDIMM Error Threshold Status Register for errors - * @param io_sc The step code data struct. - * @param i_dimm The target dimm. + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. + * @param o_esTempErr A flag for whether we hit an ES TEMP error or not. * @return FAIL if unable to read register, else SUCCESS */ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm ) + TargetHandle_t i_dimm, bool & io_errFound, + bool & o_esTempErr ) { #define PRDF_FUNC "[__analyzeErrorThrStatusReg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + o_esTempErr = false; + + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); do { @@ -633,7 +762,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Read the Error Threshold Status Register (0xA5) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(ERROR_THRESHOLD_STATUS) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::ERROR_THRESHOLD_STATUS) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Status Reg. 
" @@ -648,7 +777,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 1: ES Lifetime Error if ( bitList.count(1) ) { - io_sc.service_data->AddSignatureList ( i_dimm, PRDFSIG_EsLifeErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsLifeErr ); // Callout BPM (backup power module) high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); @@ -656,11 +785,60 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 2: ES Temperature Error if ( bitList.count(2) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_EsTmpErr ); + // Sleep two seconds to avoid exiting PRD analysis faster than the + // ES_TEMP sample rate. + PlatServices::milliSleep( 2, 0 ); + + // Read the ES_TEMP and ES_TEMP_ERROR_HIGH_THRESHOLD values + uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1; + uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0; + uint16_t esTemp = 0; + o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp ); + if ( SUCCESS != o_rc ) break; + + uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD1; + uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD0; + uint16_t esTempHighTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh ); + if ( SUCCESS != o_rc ) break; + + msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD1; + lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD0; + uint16_t esTempLowTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh ); + if ( SUCCESS != o_rc ) break; + + // Check to see if the ES_TEMP is negative (bit 12) + bool esTempNeg = false; + if ( esTemp & 0x1000 ) esTempNeg = true; + + // If ES_TEMP is equal or above ES_TEMP_ERROR_HIGH_THRESHOLD + // Just in case ES_TEMP has moved before we read it out, we'll add + // a 2°C margin when comparing to the threshold. 
+ if ( (esTemp >= (esTempHighTh - 0x0020)) && !esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrHigh ); + } + // Else check if the error hit the low threshold, again with the + // same 2°C margin. + else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrLow ); + } + // Else the temperature must have gone back to a normal value, so + // we will label this as a false alarm case. + else + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrFa ); + } // Callout BPM (backup power module) high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); @@ -668,6 +846,9 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + + o_esTempErr = true; + io_errFound = true; } // BIT 3:7: Reserved @@ -680,6 +861,419 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, } /** + * @brief Adjusts the warning threshold so that future warnings are allowed + * to report. + * @param io_sc The step code data struct. + * @param i_dimm The target nvdimm. + * @param i_warnThReg The address of the relevant warning threshold register. + * @param i_errThReg The address of the relevant error threshold register. + * @param o_firstWarn Flag if this is the first warning of this type. + * @param o_statusErr Flag to tell if we found an error from checking the + * notification status register. 
+ * @return FAIL if unable to read register, else SUCCESS + */ +uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, uint16_t i_warnThReg, + uint16_t i_errThReg, bool & o_firstWarn, + bool & o_statusErr ) +{ + #define PRDF_FUNC "[__adjustThreshold] " + + uint32_t o_rc = SUCCESS; + uint16_t notifCmdReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD; + uint16_t notifStatusReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_STATUS; + o_firstWarn = false; + o_statusErr = false; + + do + { + // NVDIMM health status registers size = 1 byte + size_t NVDIMM_SIZE = 1; + + // Read the corresponding warning threshold + uint8_t warnTh = 0; + errlHndl_t errl = deviceRead( i_dimm, &warnTh, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_warnThReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Reg. HUID: " + "0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Read the corresponding error threshold + uint8_t errTh = 0; + errl = deviceRead( i_dimm, &errTh, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_errThReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Reg. HUID: " + "0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // If the warning threshold is not set to the error threshold+1, + // move the threshold. + if ( warnTh != (errTh+1) ) + { + o_firstWarn = true; + + // SET_EVENT_NOTIFICATION_CMD is a write only register that is + // used to change the SET_EVENT_NOTIFICATION_STATUS register. + // The only bits within it that are used are bits 0 and 1, as such + // we can safely set the rest to 0. 
It is defined as: + // [0]: Persistency Notification + // [1]: Warning Threshold Notification + // [2]: Obsolete + // [3]: Firmware Activation Notification (Not Used) + // [4:7]: Reserved + + // Clear SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set + uint8_t notifCmd = 0x01; + errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifCmdReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification " + "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Check SET_EVENT_NOTIFICATION_STATUS to ensure everything is set + // as we expect and we don't see any errors. + uint8_t notifStat = 0; + errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifStatusReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + "Status Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat ); + + // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1 + // or Bit [2]: PERSISTENCY_ENABLED = 0 + // or Bit [3]: WARNING_THRESHOLD_ENABLED = 1 + if ( bitList.count(1) || !bitList.count(2) || bitList.count(3) ) + { + o_statusErr = true; + + // Make the log predictive and mask the fir + io_sc.service_data->SetThresholdMaskId(0); + + // Callout the NVDIMM, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + + break; + } + + + // Set the warning threshold to error threshold + 1 + warnTh = errTh+1; + errl = deviceWrite( i_dimm, &warnTh, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_warnThReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to write Warning Threshold Reg. 
" + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Set SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set + notifCmd = 0x03; + errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifCmdReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification " + "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Recheck SET_EVENT_NOTIFICATION_STATUS to ensure everything is set + // as we expect and we don't see any errors. + errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifStatusReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + "Status Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + bitList = __nvdimmGetActiveBits( notifStat ); + + // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1 + // or Bit [2]: PERSISTENCY_ENABLED = 0 + // or Bit [3]: WARNING_THRESHOLD_ENABLED = 0 + if ( bitList.count(1) || !bitList.count(2) || !bitList.count(3) ) + { + o_statusErr = true; + + // Make the log predictive and mask the fir + io_sc.service_data->SetThresholdMaskId(0); + + // Callout the NVDIMM, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + + break; + } + } + // Note: moving the threshold should clear the warning from + // WARNING_THRESHOLD_STATUS, which allows future warnings to report. + + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +/** + * @brief Analyze NVDIMM Warning Threshold Status Register for errors + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. 
+ * @return FAIL if unable to read register, else SUCCESS + */ +uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, bool & io_errFound) +{ + #define PRDF_FUNC "[__analyzeWarningThrStatusReg] " + + uint32_t o_rc = SUCCESS; + uint8_t data = 0; + + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); + + do + { + // NVDIMM health status registers size = 1 byte + size_t NVDIMM_SIZE = 1; + + // Read the Warning Threshold Status Register (0xA7) 7:0 + errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::WARNING_THRESHOLD_STATUS) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Status Reg. " + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data ); + + // Analyze Bit 2 First + // BIT 2: ES_TEMP_WARNING + if ( bitList.count(2) ) + { + // Sleep two seconds to avoid exiting PRD analysis faster than the + // ES_TEMP sample rate. 
+ PlatServices::milliSleep( 2, 0 ); + + // Read the ES_TEMP and ES_TEMP_WARNING_HIGH_THRESHOLD values + uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1; + uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0; + uint16_t esTemp = 0; + o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp ); + if ( SUCCESS != o_rc ) break; + + uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD1; + uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD0; + uint16_t esTempHighTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh ); + if ( SUCCESS != o_rc ) break; + + msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD1; + lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD0; + uint16_t esTempLowTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh ); + if ( SUCCESS != o_rc ) break; + + // Check to see if the ES_TEMP is negative (bit 12) + bool esTempNeg = false; + if ( esTemp & 0x1000 ) esTempNeg = true; + + // If ES_TEMP is equal or above ES_TEMP_WARNING_HIGH_THRESHOLD + // Just in case ES_TEMP has moved before we read it out, we'll add + // a 2°C margin when comparing to the threshold. + if ( (esTemp >= (esTempHighTh - 0x0020)) && !esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnHigh ); + } + // Else check if the warning hit the low threshold, again with the + // same 2°C margin. + else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnLow ); + } + // Else the temperature must have gone back to a normal value, so + // we will label this as a false alarm case. 
+ else + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnFa ); + } + + // Callout BPM (backup power module) high + o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != o_rc ) break; + + // Callout NVDIMM low, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + + // Because of the possibility of intermittent ES temperature + // false alarm readings, we will keep the log hidden. If there is + // an actual ES temperature problem, we assume we will continue + // to be called to handle the temperature warning and hit threshold. + + // Only send the save/restore message to PHYP if we hit threshold. + if ( io_sc.service_data->IsAtThreshold() ) + { + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + } + + io_errFound = true; + } + // BIT 0: NVM_LIFETIME_WARNING + if ( bitList.count(0) ) + { + // Adjust warning threshold. + uint16_t warnThReg = NVDIMM::i2cReg::NVM_LIFETIME_WARNING_THRESHOLD; + uint16_t errThReg = NVDIMM::i2cReg::NVM_LIFETIME_ERROR_THRESHOLD; + bool firstWarn = false; + bool statusErr = false; + o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg, + firstWarn, statusErr ); + if ( SUCCESS != o_rc ) break; + + // Make the log predictive, but do not mask the FIR + io_sc.service_data->setServiceCall(); + + // If we got a set event notification status error, add the + // signature for that before adding the signature for the warning. + // Also do not take our normal callout action since we already will + // have called out the NVDIMM because of the status error. + if ( statusErr ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr ); + + // Need to set io_errFound here so the warning signature is + // added to the multi-signature list instead of as the primary + // signature. 
+ io_errFound = true; + } + else + { + // Callout NVDIMM on 1st, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + } + + // Update signature depending on whether this is the first or second + // warning of this type. + if ( firstWarn ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn1 ); + } + else + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn2 ); + } + + + io_errFound = true; + } + // BIT 1: ES_LIFETIME_WARNING + if ( bitList.count(1) ) + { + // Adjust warning threshold. + uint16_t warnThReg = NVDIMM::i2cReg::ES_LIFETIME_WARNING_THRESHOLD; + uint16_t errThReg = NVDIMM::i2cReg::ES_LIFETIME_ERROR_THRESHOLD; + bool firstWarn = false; + bool statusErr = false; + o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg, + firstWarn, statusErr ); + if ( SUCCESS != o_rc ) break; + + // Make the log predictive, but do not mask the FIR + io_sc.service_data->setServiceCall(); + + // If we got a set event notification status error, add the + // signature for that before adding the signature for the warning. + // Also do not take our normal callout action since we already will + // have called out the NVDIMM because of the status error. + if ( statusErr ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr ); + + // Need to set io_errFound here so the warning signature is + // added to the multi-signature list instead of as the primary + // signature. + io_errFound = true; + } + else + { + // Callout BPM (backup power module) high + o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != o_rc ) break; + + // Callout NVDIMM low, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + } + + // Update signature depending on whether this is the first or second + // warning of this type. 
+ if ( firstWarn ) + { + __addSignature(io_sc, mca, io_errFound, PRDFSIG_EsLifeWarn1); + } + else + { + __addSignature(io_sc, mca, io_errFound, PRDFSIG_EsLifeWarn2); + } + + io_errFound = true; + } + + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +/** * @brief De-assert the EVENT_N pin by setting bit 2 in NVDIMM_MGT_CMD1 (0x41) * @param i_dimm The target dimm. * @return FAIL if unable to read/write register, else SUCCESS @@ -698,7 +1292,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm ) // Read the NVDIMM_MGT_CMD1 register (0x41) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(NVDIMM_MGT_CMD1) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::NVDIMM_MGT_CMD1) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read NVDIMM_MGT_CMD1. " @@ -713,7 +1307,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm ) // Write the updated data back to NVDIMM_MGT_CMD1 errl = deviceWrite( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(NVDIMM_MGT_CMD1) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::NVDIMM_MGT_CMD1) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to write NVDIMM_MGT_CMD1. " @@ -732,6 +1326,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm ) } #endif // HOSTBOOT_RUNTIME +#endif // CONFIG_NVDIMM /** * @brief MCACALFIR[8] - Error from NVDIMM health status registers @@ -744,13 +1339,28 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, { #define PRDF_FUNC "[nimbus_mca::AnalyzeNvdimmHealthStatRegs] " + #ifdef CONFIG_NVDIMM #ifdef __HOSTBOOT_RUNTIME uint32_t l_rc = SUCCESS; + bool errFound = false; // We need to check both dimms for errors for ( auto & dimm : getConnected(i_chip->getTrgt(), TYPE_DIMM) ) { + // Skip any non-NVDIMMs + if ( !isNVDIMM(dimm) ) continue; + + // Add SMART-specific, page 4 registers to FFDC + errlHndl_t mainErrl = nullptr; + mainErrl = ServiceGeneratorClass::ThisServiceGenerator().getErrl(); + if ( nullptr == mainErrl ) + { + PRDF_ERR( PRDF_FUNC "Failed to get the global error log." 
); + continue; + } + PlatServices::nvdimmAddFfdc( dimm, mainErrl ); + // De-assert the EVENT_N pin by setting bit 2 in NVDIMM_MGT_CMD1 l_rc = __deassertEventN( dimm ); if ( SUCCESS != l_rc ) continue; @@ -762,7 +1372,7 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // Read the Module Health Register (0xA0) 7:0 errlHndl_t errl = deviceRead( dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Module Health Register. " @@ -775,6 +1385,30 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // BIT 0: Persistency Lost if ( bitList.count(0) ) { + // Analyze Health Status0 Reg, Health Status1 Reg, + // and Error Theshold Status Reg + l_rc = __analyzeHealthStatus0Reg( io_sc, dimm, errFound ); + if ( SUCCESS != l_rc ) continue; + l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound ); + if ( SUCCESS != l_rc ) continue; + bool esTempErr = false; + l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr); + if ( SUCCESS != l_rc ) continue; + + // If we hit an ES temperature error and have not yet hit threshold, + // then keep the log hidden. + if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue; + + // If we didn't find any error, then keep the log hidden. + if ( !errFound ) + { + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_FirEvntGone ); + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + continue; + } + // EVENT_N cannot be retriggered on a new PERSISTENCY_LOST_ERROR // if a previous PERSISTENCY_LOST_ERROR still exists. Meaning, we // cannot detect/report multiple errors that happen at different @@ -782,43 +1416,77 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // and make the log predictive. 
io_sc.service_data->SetThresholdMaskId(0); - // Send persistency lost message to PHYP - l_rc = PlatServices::nvdimmNotifyPhypProtChange( dimm, - NVDIMM::UNPROTECTED_BECAUSE_ERROR ); + // Send message to PHYP that save/restore may work + l_rc = PlatServices::nvdimmNotifyProtChange( dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); if ( SUCCESS != l_rc ) continue; - // Analyze Health Status0 Reg, Health Status1 Reg, - // and Error Theshold Status Reg - l_rc = __analyzeHealthStatus0Reg( io_sc, dimm ); - if ( SUCCESS != l_rc ) continue; - l_rc = __analyzeHealthStatus1Reg( io_sc, dimm ); - if ( SUCCESS != l_rc ) continue; - l_rc = __analyzeErrorThrStatusReg( io_sc, dimm ); + } + // BIT 1: Warning Threshold Exceeded + else if ( bitList.count(1) ) + { + l_rc = __analyzeWarningThrStatusReg( io_sc, dimm, errFound ); if ( SUCCESS != l_rc ) continue; + + if ( !errFound ) + { + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_FirEvntGone ); + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + continue; + } } - // BIT 1: Warning Threshold Exceeded -- ignore // BIT 2: Persistency Restored - if ( bitList.count(2) ) + else if ( bitList.count(2) ) { // It would be rare to have an intermittent error that comes and // goes so fast we only see PERSISTENCY_RESTORED and not // PERSISTENCY_LOST_ERROR. Set predictive on threshold of 32 // per day (rule code handles the thresholding), else just keep // as a hidden log. - io_sc.service_data->AddSignatureList( dimm, PRDFSIG_NvdimmPersRes ); + __addSignature( io_sc, i_chip->getTrgt(), errFound, + PRDFSIG_NvdimmPersRes ); + + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + } + // BIT 3: Below Warning Threshold + else if ( bitList.count(3) ) + { + // Much like the persistency restored bit above, we don't expect + // to see this, so just make a hidden log. 
+ __addSignature( io_sc, i_chip->getTrgt(), errFound, + PRDFSIG_BelowWarnTh ); + + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + } + // BIT 4: Hardware Failure -- ignore - no logic feeding this + // BIT 5: EVENT_N_LOW -- ignore + // BIT 6:7: Unused + + // If we reach a threshold on MCACALFIR[8] of 32 per day, we assume + // some intermittent error must be triggering the FIR that isn't a + // persistency lost error which would cause us to mask. The rule code + // handles the actual thresholding here. + if ( io_sc.service_data->IsAtThreshold() && !errFound ) + { + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_IntNvdimmErr ); // callout NVDIMM high, cable high, BPM high, no gard io_sc.service_data->SetCallout( dimm, MRU_HIGH, NO_GARD ); l_rc = __addBpmCallout( dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != l_rc ) continue; - l_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + l_rc = __addNvdimmCableCallout( dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != l_rc ) continue; - } - // BIT 3: Below Warning Threshold -- ignore - // BIT 4: Hardware Failure -- ignore - // BIT 5: EVENT_N_LOW -- ignore - // BIT 6:7: Unused + // Send message to PHYP that save/restore may work + l_rc = PlatServices::nvdimmNotifyProtChange( dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != l_rc ) continue; + } } #else // IPL only @@ -826,7 +1494,14 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, PRDF_ERR( PRDF_FUNC "Unexpected call to analyze NVDIMMs at IPL." ); io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD ); - #endif + #endif // end runtime vs IPL check + + #else // CONFIG_NVDIMM not defined + + PRDF_ERR( PRDF_FUNC "CONFIG_NVDIMM not defined." 
); + io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD ); + + #endif // end CONFIG_NVDIMM check return SUCCESS; // nothing to return to rule code diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C index 4a4391c0c..0e11b1a86 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -301,9 +301,9 @@ int32_t commandAddrTimeout( ExtensibleChip * i_chip, // was executed. Restarting the command will likely fail with the same // issue. Callout and gard all MCAs in which the command was executed. - std::vector<ExtensibleChip *> mcaList; + ExtensibleChipList mcaList; - if ( SUCCESS != getMcbistMaintPort(i_chip, mcaList) ) + if ( SUCCESS != getMcbistMaintPort<TYPE_MCBIST>(i_chip, mcaList) ) { PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", i_chip->getHuid() ); diff --git a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H index 4a284253a..44ef77ec7 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H +++ b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -36,6 +36,7 @@ // Platform includes #include <prdfMemTdCtlr.H> #include <prdfPlatServices.H> +#include <prdfThresholdUtils.H> namespace PRDF { @@ -81,6 +82,24 @@ class McbistDataBundle : public DataBundle /** The Targeted Diagnostics controller. 
*/ MemTdCtlr<TARGETING::TYPE_MCBIST> * iv_tdCtlr = nullptr; + + public: // instance variables + #ifdef __HOSTBOOT_RUNTIME + + // These are used to limit the number of times a scrub command will stop + // on a UE or CE on a rank. This is to prevent potential flooding of + // maintenance UEs or CEs. The threshold will be 16 per rank for each. + TimeBasedThreshold iv_ueStopCounter = + TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS ); + TimeBasedThreshold iv_ceStopCounter = + TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS ); + + // If we stop on a UE or a CE, we will need to store the rank that the + // error is on so that we can clear our respective thresholds if the + // next error we stop on is on a different rank. + MemRank iv_ceUeRank; + + #endif }; /** diff --git a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C index ef3a143eb..fc389000a 100644 --- a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C +++ b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C @@ -99,7 +99,7 @@ void commitErrl( errlHndl_t i_errl, TargetHandle_t i_trgt ) template<TARGETING::TYPE T> void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt, - TargetHandle_t i_dimmTrgt ) + TargetHandle_t i_dimmTrgt, bool i_nvdimmNoGard = false ) { #define PRDF_FUNC "[RDR::__calloutDimm] " @@ -109,9 +109,31 @@ void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt, PRDF_ASSERT( nullptr != i_dimmTrgt ); PRDF_ASSERT( TYPE_DIMM == getTargetType(i_dimmTrgt) ); - // Callout the DIMM. + HWAS::DeconfigEnum deconfigPolicy = HWAS::DELAYED_DECONFIG; + HWAS::GARD_ErrorType gardPolicy = HWAS::GARD_Predictive; + + #ifdef CONFIG_NVDIMM + // For the "RDR: All repairs used" case, If the DIMM is an NVDIMM, change + // the gard and deconfig options to no gard/deconfig and call + // nvdimmNotifyProtChange to indicate a save/restore may work. 
+ if ( i_nvdimmNoGard ) + { + deconfigPolicy = HWAS::NO_DECONFIG; + gardPolicy = HWAS::GARD_NULL; + + uint32_t l_rc = PlatServices::nvdimmNotifyProtChange( i_dimmTrgt, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != l_rc ) + { + PRDF_TRAC( PRDF_FUNC "nvdimmNotifyProtChange(0x%08x) " + "failed.", PlatServices::getHuid(i_dimmTrgt) ); + } + } + #endif + io_errl->addHwCallout( i_dimmTrgt, HWAS::SRCI_PRIORITY_HIGH, - HWAS::DELAYED_DECONFIG, HWAS::GARD_Predictive ); + deconfigPolicy, gardPolicy ); + // Clear the VPD on this DIMM. The DIMM has been garded, but it is possible // the customer will want to ungard the DIMM. Without clearing the VPD, the @@ -120,16 +142,20 @@ void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt, // customer takes the risk of ungarding the DIMM (that they should replace), // the repairs will need to be rediscovered. - std::vector<MemRank> ranks; - getMasterRanks<T>( i_portTrgt, ranks, getDimmSlct(i_dimmTrgt) ); - - for ( auto & rank : ranks ) + // Do not clear the VPD if we had an NVDIMM that we avoided garding. 
+ if ( !i_nvdimmNoGard ) { - if ( SUCCESS != clearBadDqBitmap(i_portTrgt, rank) ) + std::vector<MemRank> ranks; + getMasterRanks<T>( i_portTrgt, ranks, getDimmSlct(i_dimmTrgt) ); + + for ( auto & rank : ranks ) { - PRDF_ERR( PRDF_FUNC "clearBadDqBitmap(0x%08x,0x%02x) failed", - getHuid(i_portTrgt), rank.getKey() ); - continue; + if ( SUCCESS != clearBadDqBitmap(i_portTrgt, rank) ) + { + PRDF_ERR( PRDF_FUNC "clearBadDqBitmap(0x%08x,0x%02x) failed", + getHuid(i_portTrgt), rank.getKey() ); + continue; + } } } @@ -156,11 +182,7 @@ void commitSoftError( uint32_t i_reasonCode, TargetHandle_t i_trgt, //------------------------------------------------------------------------------ template<TARGETING::TYPE T> -bool processRepairedRanks( TargetHandle_t i_trgt, uint8_t i_repairedRankMask ); - -template<> -bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, - uint8_t i_repairedRankMask ) +bool processRepairedRanks( TargetHandle_t i_trgt, uint8_t i_repairedRankMask ) { #define PRDF_FUNC "[processRepairedRanks] " @@ -179,7 +201,7 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, // map value has no significance. 
std::map<TargetHandle_t, uint32_t> calloutList; - ExtensibleChip * mcaChip = (ExtensibleChip *)systemPtr->GetChip(i_trgt); + ExtensibleChip * chip = (ExtensibleChip *)systemPtr->GetChip(i_trgt); for ( uint8_t r = 0; r < MASTER_RANKS_PER_PORT; ++r ) { @@ -191,20 +213,18 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, MemRank rank ( r ); MemMark cm; - if ( SUCCESS != MarkStore::readChipMark<TYPE_MCA>( mcaChip, rank, - cm ) ) + if ( SUCCESS != MarkStore::readChipMark<T>( chip, rank, cm ) ) { - PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_MCA>(0x%08x,0x%02x) " - "failed", mcaChip->getHuid(), rank.getKey() ); + PRDF_ERR( PRDF_FUNC "readChipMark<T>(0x%08x,0x%02x) " + "failed", chip->getHuid(), rank.getKey() ); continue; // skip this rank } MemMark sm; - if ( SUCCESS != MarkStore::readSymbolMark<TYPE_MCA>( mcaChip, rank, - sm ) ) + if ( SUCCESS != MarkStore::readSymbolMark<T>( chip, rank, sm ) ) { - PRDF_ERR( PRDF_FUNC "readSymbolMark<TYPE_MCA>(0x%08x,0x%02x) " - "failed", mcaChip->getHuid(), rank.getKey() ); + PRDF_ERR( PRDF_FUNC "readSymbolMark<T>(0x%08x,0x%02x) " + "failed", chip->getHuid(), rank.getKey() ); continue; // skip this rank } @@ -214,9 +234,8 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, if ( NULL == errl ) { - errl = createErrl<TYPE_MCA>( PRDF_DETECTED_FAIL_HARDWARE, - i_trgt, - PRDFSIG_RdrRepairsUsed ); + errl = createErrl<T>( PRDF_DETECTED_FAIL_HARDWARE, + i_trgt, PRDFSIG_RdrRepairsUsed ); } std::vector<MemSymbol> symList; @@ -246,16 +265,21 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, // Callout all DIMMs in the map. for ( auto const & dimm : calloutList ) { - __calloutDimm<TYPE_MCA>( errl, i_trgt, dimm.first ); + bool nvdimmNoGard = false; + #ifdef CONFIG_NVDIMM + if ( isNVDIMM(dimm.first) ) nvdimmNoGard = true; + #endif + + __calloutDimm<T>( errl, i_trgt, dimm.first, nvdimmNoGard ); } // Commit the error log, if needed. 
- commitErrl<TYPE_MCA>( errl, i_trgt ); + commitErrl<T>( errl, i_trgt ); // Commit an additional error log indicating something failed in the // analysis, if needed. - commitSoftError<TYPE_MCA>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt, - PRDFSIG_RdrInternalFail, analysisErrors ); + commitSoftError<T>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt, + PRDFSIG_RdrInternalFail, analysisErrors ); }while(0); return o_calloutMade; @@ -263,6 +287,14 @@ bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, #undef PRDF_FUNC } + +template +bool processRepairedRanks<TYPE_MCA>( TargetHandle_t i_trgt, + uint8_t i_repairedRankMask ); +template +bool processRepairedRanks<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt, + uint8_t i_repairedRankMask ); + //------------------------------------------------------------------------------ template<> @@ -368,7 +400,12 @@ bool processRepairedRanks<TYPE_MBA>( TargetHandle_t i_trgt, // Callout all DIMMs in the map. for ( auto const & dimm : calloutList ) { - __calloutDimm<TYPE_MBA>( errl, i_trgt, dimm.first ); + bool nvdimmNoGard = false; + #ifdef CONFIG_NVDIMM + if ( isNVDIMM(dimm.first) ) nvdimmNoGard = true; + #endif + + __calloutDimm<TYPE_MBA>(errl, i_trgt, dimm.first, nvdimmNoGard); } o_calloutMade = true; @@ -392,10 +429,7 @@ bool processRepairedRanks<TYPE_MBA>( TargetHandle_t i_trgt, template<TARGETING::TYPE T> -bool processBadDimms( TargetHandle_t i_trgt, uint8_t i_badDimmMask ); - -template<> -bool processBadDimms<TYPE_MCA>( TargetHandle_t i_trgt, uint8_t i_badDimmMask ) +bool processBadDimms( TargetHandle_t i_trgt, uint8_t i_badDimmMask ) { #define PRDF_FUNC "[processBadDimms] " @@ -421,29 +455,35 @@ bool processBadDimms<TYPE_MCA>( TargetHandle_t i_trgt, uint8_t i_badDimmMask ) { if ( NULL == errl ) { - errl = createErrl<TYPE_MCA>( PRDF_DETECTED_FAIL_HARDWARE, - i_trgt, PRDFSIG_RdrRepairUnavail ); + errl = createErrl<T>( PRDF_DETECTED_FAIL_HARDWARE, + i_trgt, PRDFSIG_RdrRepairUnavail ); } - __calloutDimm<TYPE_MCA>( errl, i_trgt, dimm ); + 
__calloutDimm<T>( errl, i_trgt, dimm ); o_calloutMade = true; } } // Commit the error log, if needed. - commitErrl<TYPE_MCA>( errl, i_trgt ); + commitErrl<T>( errl, i_trgt ); // Commit an additional error log indicating something failed in the // analysis, if needed. - commitSoftError<TYPE_MCA>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt, - PRDFSIG_RdrInternalFail, analysisErrors ); + commitSoftError<T>( PRDF_DETECTED_FAIL_SOFTWARE, i_trgt, + PRDFSIG_RdrInternalFail, analysisErrors ); return o_calloutMade; #undef PRDF_FUNC } +template +bool processBadDimms<TYPE_MCA>( TargetHandle_t i_trgt, uint8_t i_badDimmMask ); +template +bool processBadDimms<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt, + uint8_t i_badDimmMask ); + //------------------------------------------------------------------------------ template<> @@ -580,6 +620,25 @@ void deployDramSpares<TYPE_MBA>( TargetHandle_t i_trgt, } } +template<> +void deployDramSpares<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt, + const std::vector<MemRank> & i_ranks ) +{ + for ( auto & rank : i_ranks ) + { + MemSymbol sym = MemSymbol::fromSymbol( i_trgt, rank, 71 ); + + int32_t l_rc = mssSetSteerMux<TYPE_OCMB_CHIP>(i_trgt, rank, sym, false); + if ( SUCCESS != l_rc ) + { + // mssSetSteerMux() will print a trace and commit the error log, + // however, we need to handle the return code or we get a compile + // warning in Hostboot. + continue; + } + } +} + } // end namespace RDR //------------------------------------------------------------------------------ @@ -680,6 +739,8 @@ template uint32_t restoreDramRepairs<TYPE_MCA>( TargetHandle_t i_trgt ); template uint32_t restoreDramRepairs<TYPE_MBA>( TargetHandle_t i_trgt ); +template +uint32_t restoreDramRepairs<TYPE_OCMB_CHIP>( TargetHandle_t i_trgt ); //------------------------------------------------------------------------------ |