diff options
author | Caleb Palmer <cnpalmer@us.ibm.com> | 2019-05-08 14:35:48 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2019-06-05 10:15:49 -0500 |
commit | 957a96a941279500f8c935d2a3b5497ad3abc575 (patch) | |
tree | 7e8fc8ccbf1d9c159a495154183d34d4b498fa8f /src/usr/diag/prdf/plat/mem | |
parent | 68367c57c139593c8c660c8471a82124c646693a (diff) | |
download | talos-hostboot-957a96a941279500f8c935d2a3b5497ad3abc575.tar.gz talos-hostboot-957a96a941279500f8c935d2a3b5497ad3abc575.zip |
PRD: Axone/Explorer TdCtlr and DataBundle Updates
Change-Id: Id4b1f8c2fc3898d2a82501b84cf8bebe41a3af2b
RTC: 207388
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/75792
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamen G. Tyner <ben.tyner@ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/78326
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf/plat/mem')
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C | 117 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C | 77 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H | 12 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C | 16 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C | 449 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C | 54 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 1070 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm.C | 62 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C | 9 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C | 127 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C | 6 |
11 files changed, 1981 insertions, 18 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C index 41b0de3ea..0cf4bfa7c 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2017,2018 */ +/* Contributors Listed Below - COPYRIGHT 2017,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -474,6 +474,104 @@ int32_t __getPortAddr<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr, } template <> +int32_t __getPortAddr<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, MemAddr i_addr, + uint64_t & o_addr ) +{ + int32_t o_rc = SUCCESS; + + o_addr = 0; + // TODO RTC 198756 + // Local vars for address fields + uint64_t col = reverseBits(i_addr.getCol(), 7); // C9 C8 C7 C6 C5 C4 C3 + uint64_t row = reverseBits(i_addr.getRow(), 18); // R17 R16 R15 .. R1 R0 + uint64_t bnk = i_addr.getBank(); // BG0 BG1 B0 B1 B2 + uint64_t srnk = i_addr.getRank().getSlave(); // S0 S1 S2 + uint64_t mrnk = i_addr.getRank().getRankSlct(); // M0 M1 + uint64_t dslct = i_addr.getRank().getDimmSlct(); // D + + // Determine if a two DIMM config is used. Also, determine how many + // mrank (M0-M1), srnk (S0-S2), or extra row (R17-R15) bits are used. + bool twoDimmConfig; + uint8_t mrnkBits, srnkBits, extraRowBits; + o_rc = __getAddrConfig( i_chip, dslct, twoDimmConfig, mrnkBits, srnkBits, + extraRowBits ); + if ( SUCCESS != o_rc ) return o_rc; + + // Mask off the non-configured bits. If this address came from hardware, + // this would not be a problem. However, the get_mrank_range() and + // get_srank_range() HWPS got lazy just set the entire fields and did not + // take into account the actual bit ranges. + mrnk = __maskBits( mrnk, mrnkBits ); + srnk = __maskBits( srnk, srnkBits ); + row = __maskBits( row, 15 + extraRowBits ); + + // Combine master and slave ranks. 
+ uint64_t rnk = (mrnk << srnkBits) | srnk; + uint8_t rnkBits = mrnkBits + srnkBits; + + // Now split the DIMM select and combined rank into components. + uint64_t rnk_pt1 = 0, rnk_pt2 = 0, rnk_pt3 = 0; + uint8_t rnkBits_pt1 = 0, rnkBits_pt2 = 0, rnkBits_pt3 = 0; + + if ( 0 == rnkBits ) + { + if ( twoDimmConfig ) // The DIMM select goes into part 3. + { + rnk_pt3 = dslct; rnkBits_pt3 = 1; + } + } + else // At least one master or slave. + { + // Put the LSB of the combined rank in part 3 and the rest in part 2. + rnk_pt3 = rnk & 0x1; rnkBits_pt3 = 1; + rnk_pt2 = rnk >> 1; rnkBits_pt2 = rnkBits - 1; + + if ( twoDimmConfig ) // The DIMM select goes into part 1. + { + rnk_pt1 = dslct; rnkBits_pt1 = 1; + } + } + + // Split the row into its components. + uint64_t r17_r15 = (row & 0x38000) >> 15; + uint64_t r14 = (row & 0x04000) >> 14; + uint64_t r13 = (row & 0x02000) >> 13; + uint64_t r12_r0 = (row & 0x01fff); + + // Split the column into its components. + uint64_t c9_c4 = (col & 0x7e) >> 1; + uint64_t c3 = (col & 0x01); + + // Split the bank into its components. + uint64_t b0 = (bnk & 0x10) >> 4; + uint64_t b1 = (bnk & 0x08) >> 3; + // NOTE: B2 is not supported on Nimbus. + uint64_t bg0_bg1 = (bnk & 0x03); + + // Now start building the flexible part of the address (bits 0-7,23-33). + o_addr = (o_addr << rnkBits_pt1 ) | rnk_pt1; + o_addr = (o_addr << extraRowBits) | r17_r15; + o_addr = (o_addr << rnkBits_pt2 ) | rnk_pt2; + o_addr = (o_addr << 6 ) | c9_c4; + o_addr = (o_addr << 1 ) | b0; + o_addr = (o_addr << rnkBits_pt3 ) | rnk_pt3; + o_addr = (o_addr << 1 ) | b1; + o_addr = (o_addr << 2 ) | bg0_bg1; + o_addr = (o_addr << 1 ) | c3; + + // C2 is in bit 34, but the Nimbus physical address does not contain a C2. + // It will be set to 0 for now. Also, bits 35-39 are the rest of the cache + // line address, which we do not need. So, that will be set to 0 as well. + o_addr <<= 6; + + // Finally, insert R14,R12-R0,R13 into bits 8-22. 
+ o_addr = ((o_addr & 0xfffffe0000ull) << 15) | (o_addr & 0x000001ffffull); + o_addr |= ((r14 << 14) | (r12_r0 << 1) | r13) << 17; + + return o_rc; +} + +template <> int32_t __getPortAddr<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr, uint64_t & o_addr ) { @@ -585,6 +683,22 @@ void __getGrpPrms<TYPE_MCA>( ExtensibleChip * i_chip, uint8_t o_portPos, } template<> +void __getGrpPrms<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, uint8_t o_portPos, + SCAN_COMM_REGISTER_CLASS * &o_mcfgp, + SCAN_COMM_REGISTER_CLASS * &o_mcfgpm ) +{ + PRDF_ERR( "__getGrpPrms: Function not supported yet" ); + /* TODO RTC 198756 + // Get the connected MCS chip and MCA target position. + ExtensibleChip * mcs_chip = getConnectedParent( i_chip, TYPE_MCS ); + o_portPos = i_chip->getPos() % MAX_MCA_PER_MCS; + + o_mcfgp = mcs_chip->getRegister("MCFGP"); + o_mcfgpm = mcs_chip->getRegister("MCFGPM"); + */ +} + +template<> void __getGrpPrms<TYPE_MBA>( ExtensibleChip * i_chip, uint8_t o_portPos, SCAN_COMM_REGISTER_CLASS * &o_mcfgp, SCAN_COMM_REGISTER_CLASS * &o_mcfgpm ) @@ -975,6 +1089,7 @@ int32_t page( ExtensibleChip * i_chip, MemAddr i_addr ) } template int32_t page<TYPE_MCA>( ExtensibleChip * i_chip, MemAddr i_addr ); template int32_t page<TYPE_MBA>( ExtensibleChip * i_chip, MemAddr i_addr ); +template int32_t page<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip, MemAddr i_addr); //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C index f86110458..6dff91e82 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -248,8 +248,8 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip, do { // Get all ports in which the command was run. - std::vector<ExtensibleChip *> portList; - o_rc = getMcbistMaintPort( i_chip, portList ); + ExtensibleChipList portList; + o_rc = getMcbistMaintPort<TYPE_MCBIST>( i_chip, portList ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", @@ -291,6 +291,60 @@ uint32_t __analyzeCmdComplete<TYPE_MCBIST>( ExtensibleChip * i_chip, } template<> +uint32_t __analyzeCmdComplete<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + TdRankListEntry & o_stoppedRank, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[__analyzeCmdComplete] " + + uint32_t o_rc = SUCCESS; + + o_errorsFound = false; + + do + { + // Get all ports in which the command was run. + ExtensibleChipList portList; + o_rc = getMcbistMaintPort<TYPE_OCMB_CHIP>( i_chip, portList ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + // In broadcast mode, the rank configuration for all ports will be the + // same. In non-broadcast mode, there will only be one MEM_PORT in + // the list. Therefore, we can simply use the first MEM_PORT in the + // list for all configs. + ExtensibleChip * stopChip = portList.front(); + + // Update iv_stoppedRank. + o_stoppedRank = __getStopRank<TYPE_MEM_PORT>( stopChip, i_addr ); + + // Check the OCMB for ECC errors. 
+ bool errorsFound; + o_rc = __checkEcc<TYPE_OCMB_CHIP>( i_chip, i_addr, errorsFound, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__checkEcc<TYPE_OCMB_CHIP>(0x%08x) failed", + i_chip->getHuid() ); + o_rc |= o_rc; + break; + } + + if ( errorsFound ) o_errorsFound = true; + if ( SUCCESS != o_rc ) break; + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t __analyzeCmdComplete<TYPE_MBA>( ExtensibleChip * i_chip, TdRankListEntry & o_stoppedRank, const MemAddr & i_addr, @@ -397,9 +451,15 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, // Get the version to use. uint8_t version = TD_CTLR_DATA::VERSION_1; + bool isNimbus = false; if ( MODEL_NIMBUS == getChipModel(getMasterProc()) ) { version = TD_CTLR_DATA::VERSION_2; + isNimbus = true; + } + else if ( MODEL_AXONE == getChipModel(getMasterProc()) ) + { + version = TD_CTLR_DATA::VERSION_2; } // Get the IPL state. @@ -443,6 +503,11 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, if ( TD_CTLR_DATA::VERSION_2 == version ) { curPort = iv_curProcedure->getChip()->getPos() % MAX_MCA_PER_MCBIST; + if ( !isNimbus ) + { + TargetHandle_t portTrgt = iv_curProcedure->getChip()->getTrgt(); + curPort = portTrgt->getAttr<ATTR_REL_POS>(); + } } } @@ -475,6 +540,11 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, if ( TD_CTLR_DATA::VERSION_2 == version ) { itPort = queue[n]->getChip()->getPos() % MAX_MCA_PER_MCBIST; + if ( !isNimbus ) + { + TargetHandle_t portTrgt = queue[n]->getChip()->getTrgt(); + itPort = portTrgt->getAttr<ATTR_REL_POS>(); + } } bsb.setFieldJustify( pos, 3, itMrnk ); pos+=3; @@ -502,6 +572,7 @@ void MemTdCtlr<T>::collectStateCaptureData( STEP_CODE_DATA_STRUCT & io_sc, // Avoid linker errors with the template. 
template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; +template class MemTdCtlr<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H index 332109b48..f1c072eea 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -54,14 +54,14 @@ class MemTdCtlr /** * @brief Constructor * - * This contructor will only be called in the MCBIST or MBA data bundle, - * which already checks for a valid type. + * This contructor will only be called in the MCBIST, MBA, or OCMB data + * bundle, which already checks for a valid type. * * Need to initialize iv_stoppedRank to a valid entry in iv_rankList. Use * the last entry in the list so that the 'next' rank is the first entry * in the list. * - * @param i_chip An MCBIST or MBA chip. + * @param i_chip An MCBIST, MBA, or OCMB chip. */ explicit MemTdCtlr( ExtensibleChip * i_chip ) : iv_chip( i_chip ), iv_rankList( i_chip ), @@ -122,7 +122,7 @@ class MemTdCtlr /** * @brief Bans TPS on the given rank. Any attempts to add a TPS procedure * to the queue for this rank will be ignored. - * @param i_chip MCA or MBA chip. + * @param i_chip MCA, MBA, or OCMB chip. * @param i_rank The target slave rank. */ void banTps( ExtensibleChip * i_chip, const MemRank & i_rank ) @@ -302,7 +302,7 @@ class MemTdCtlr private: // instance variables - /** An MCBIST or MBA chip associated with this TD controller. */ + /** An MCBIST, MBA, or OCMB chip associated with this TD controller. */ ExtensibleChip * const iv_chip; /** The TD queue that contains all of the pending TD procedures. 
*/ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C index ea04d2964..401a48042 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_ipl.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -160,6 +160,14 @@ bool __mnfgCeCheck<TYPE_MCA>( uint32_t i_eccAttns ) } template<> inline +bool __mnfgCeCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns ) +{ + return ( ( 0 != (i_eccAttns & MAINT_HARD_NCE_ETE) ) && + ( (0 != (i_eccAttns & MAINT_NCE)) || + (0 != (i_eccAttns & MAINT_TCE)) ) ); +} + +template<> inline bool __mnfgCeCheck<TYPE_MBA>( uint32_t i_eccAttns ) { return ( 0 != (i_eccAttns & MAINT_HARD_NCE_ETE) ); @@ -251,12 +259,18 @@ template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t __checkEcc<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ // Avoid linker errors with the template. 
template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; +template class MemTdCtlr<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index d52ef2d1d..356b5b530 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -107,6 +107,36 @@ void __recaptureRegs<TYPE_MCBIST>( STEP_CODE_DATA_STRUCT & io_sc, } template<> +void __recaptureRegs<TYPE_OCMB_CHIP>( STEP_CODE_DATA_STRUCT & io_sc, + ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[__recaptureRegs<TYPE_OCMB_CHIP>] " + + RegDataCache & cache = RegDataCache::getCachedRegisters(); + CaptureData & cd = io_sc.service_data->GetCaptureData(); + + // refresh and recapture the ocmb registers + const char * ocmbRegs[] = + { + "MCBISTFIR", "RDFFIR", "MBSEC0", "MBSEC1", "OCMB_MBSSYMEC0", + "OCMB_MBSSYMEC1", "OCMB_MBSSYMEC2", "OCMB_MBSSYMEC3", + "OCMB_MBSSYMEC4", "OCMB_MBSSYMEC5", "OCMB_MBSSYMEC6", + "OCMB_MBSSYMEC7", "OCMB_MBSSYMEC8", "MBSMSEC", "MCBMCAT", + }; + + for ( uint32_t i = 0; i < sizeof(ocmbRegs)/sizeof(char*); i++ ) + { + SCAN_COMM_REGISTER_CLASS * reg = + i_chip->getRegister( ocmbRegs[i] ); + cache.flush( i_chip, reg ); + } + + i_chip->CaptureErrorData( cd, Util::hashString("MaintCmdRegs_ocmb") ); + + #undef PRDF_FUNC +} + +template<> void __recaptureRegs<TYPE_MBA>( STEP_CODE_DATA_STRUCT & io_sc, ExtensibleChip * i_chip ) { @@ -358,8 +388,9 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip, uint32_t count = symData.size(); switch ( T ) { - case TYPE_MCA: PRDF_ASSERT( 1 <= count && count <= 2 ); break; - case TYPE_MBA: PRDF_ASSERT( 1 == count ); break; + case TYPE_MCA: PRDF_ASSERT( 1 <= count && count <= 2 ); break; + case TYPE_MBA: PRDF_ASSERT( 1 == count ); break; + case TYPE_OCMB_CHIP: PRDF_ASSERT( 1 <= count && count <= 2 ); break; default: PRDF_ASSERT( false ); } @@ -408,6 +439,14 
@@ uint32_t __handleSoftInterCeEte<TYPE_MCA>( ExtensibleChip * i_chip, } template<> +uint32_t __handleSoftInterCeEte<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + return __handleNceEte<TYPE_OCMB_CHIP>( i_chip, i_addr, io_sc ); +} + +template<> uint32_t __handleSoftInterCeEte<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, STEP_CODE_DATA_STRUCT & io_sc ) @@ -480,6 +519,52 @@ uint32_t __handleRceEte<TYPE_MCA>( ExtensibleChip * i_chip, } template<> +uint32_t __handleRceEte<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemRank & i_rank, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[__handleRceEte] " + + uint32_t o_rc = SUCCESS; + + // Should only get this attention in MNFG mode. + PRDF_ASSERT( mfgMode() ); + + do + { + // The RCE ETE attention could be from IUE, IMPE, or IRCD. Need to check + // RDFFIR[37] to determine if there was at least one IUE. + SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister( "RDFFIR" ); + o_rc = fir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on RDFFIR: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + if ( !fir->IsBitSet(37) ) break; // nothing else to do + + // Handle the IUE. 
+ o_errorsFound = true; + io_sc.service_data->AddSignatureList( i_chip->getTrgt(), + PRDFSIG_MaintIUE ); + o_rc = MemEcc::handleMemIue<TYPE_OCMB_CHIP>( i_chip, i_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "analyzeMaintIue(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t __handleRceEte<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) @@ -698,6 +783,11 @@ template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t __checkEcc<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + bool & o_errorsFound, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -786,6 +876,76 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::unmaskEccAttns() //------------------------------------------------------------------------------ template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::maskEccAttns() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::maskEccAttns] " + + uint32_t o_rc = SUCCESS; + + SCAN_COMM_REGISTER_CLASS * mask = iv_chip->getRegister( "RDFFIR_MASK_OR" ); + + mask->clearAllBits(); + mask->SetBit(8); // Mainline read NCE + mask->SetBit(9); // Mainline read TCE + + o_rc = mask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_OR" ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::unmaskEccAttns() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::unmaskEccAttns] " + + uint32_t o_rc = SUCCESS; + + // Memory CEs were masked at the beginning of the TD procedure, so + // clear and unmask them. 
Also, it is possible that memory UEs have + // thresholded so clear and unmask them as well. + + SCAN_COMM_REGISTER_CLASS * fir = iv_chip->getRegister( "RDFFIR_AND" ); + SCAN_COMM_REGISTER_CLASS * mask = iv_chip->getRegister( "RDFFIR_MASK_AND" ); + + fir->setAllBits(); mask->setAllBits(); + + // Do not unmask NCE and TCE attentions if they have been permanently + // masked due to certain TPS conditions. + if ( !(getOcmbDataBundle(iv_chip)->iv_maskMainlineNceTce) ) + { + fir->ClearBit(8); mask->ClearBit(8); // Mainline read NCE + fir->ClearBit(9); mask->ClearBit(9); // Mainline read TCE + } + fir->ClearBit(14); mask->ClearBit(14); // Mainline read UE + + o_rc = fir->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_AND" ); + } + + o_rc = mask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_AND" ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> uint32_t MemTdCtlr<TYPE_MBA>::maskEccAttns() { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::maskEccAttns] " @@ -887,6 +1047,21 @@ SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MCA>( ExtensibleChip * i_chip ) } template<> +SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_OCMB_CHIP>( + ExtensibleChip * i_chip ) +{ + return i_chip->getRegister( "RDFFIR_AND" ); +} + +template<> +SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MEM_PORT>( + ExtensibleChip * i_chip ) +{ + ExtensibleChip * ocmbChip = getConnectedParent( i_chip, TYPE_OCMB_CHIP ); + return ocmbChip->getRegister( "RDFFIR_AND" ); +} + +template<> SCAN_COMM_REGISTER_CLASS * __getEccFirAnd<TYPE_MBA>( ExtensibleChip * i_chip ) { ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); @@ -969,6 +1144,85 @@ uint32_t __findChipMarks( TdRankList<TC> & i_rankList ) #undef PRDF_FUNC } +template <> +uint32_t __findChipMarks<TYPE_MEM_PORT>( + TdRankList<TYPE_OCMB_CHIP> & i_rankList ) +{ + #define 
PRDF_FUNC "[__findChipMarks] " + + uint32_t o_rc = SUCCESS; + + for ( auto & entry : i_rankList.getList() ) + { + ExtensibleChip * memPort = entry.getChip(); + MemRank rank = entry.getRank(); + + ExtensibleChip * ocmb = getConnectedParent( memPort, TYPE_OCMB_CHIP ); + + // Call readChipMark to get MemMark. + MemMark chipMark; + o_rc = MarkStore::readChipMark<TYPE_OCMB_CHIP>( ocmb, rank, chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,0x%02x) failed", + ocmb->getHuid(), rank.getKey() ); + break; + } + + if ( !chipMark.isValid() ) continue; // no chip mark present + + // Get the DQ Bitmap data. + MemDqBitmap dqBitmap; + o_rc = getBadDqBitmap( memPort->getTrgt(), rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x,0x%02x)", + memPort->getHuid(), rank.getKey() ); + break; + } + + // Check if the chip mark is verified or not. + bool cmVerified = false; + o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark() failed on 0x%08x " + "0x%02x", memPort->getHuid(), rank.getKey() ); + break; + } + + // If the chip mark is unverified, add a VcmEvent to the TD queue. + if ( !cmVerified ) + { + // Chip mark is not present in VPD. Add it to queue. + TdEntry * e = new VcmEvent<TYPE_OCMB_CHIP>{ ocmb, rank, chipMark }; + MemDbUtils::pushToQueue<TYPE_OCMB_CHIP>( ocmb, e ); + + // We will want to clear the MPE attention for the unverified chip + // mark so we don't get any redundant attentions for chip marks that + // are already in the queue. This is reset/reload safe because + // initialize() will be called again and we can redetect the + // unverified chip marks. 
+ SCAN_COMM_REGISTER_CLASS* reg =__getEccFirAnd<TYPE_OCMB_CHIP>(ocmb); + reg->setAllBits(); + reg->ClearBit( 0 + rank.getMaster() ); // fetch + reg->ClearBit( 20 + rank.getMaster() ); // scrub + o_rc = reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on ECC FIR AND: 0x%08x", + ocmb->getHuid() ); + break; + } + } + } + + return o_rc; + + #undef PRDF_FUNC +} + + template<> uint32_t MemTdCtlr<TYPE_MCBIST>::initialize() { @@ -1009,6 +1263,45 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::initialize() } template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::initialize() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::initialize] " + + uint32_t o_rc = SUCCESS; + + do + { + if ( iv_initialized ) break; // nothing to do + + // Unmask the fetch attentions just in case there were masked during a + // TD procedure prior to a reset/reload. + o_rc = unmaskEccAttns(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "unmaskEccAttns() failed" ); + break; + } + + // Find all unverified chip marks. + o_rc = __findChipMarks<TYPE_MEM_PORT>( iv_rankList ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__findChipMarks() failed on 0x%08x", + iv_chip->getHuid() ); + break; + } + + // At this point, the TD controller is initialized. + iv_initialized = true; + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t MemTdCtlr<TYPE_MBA>::initialize() { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::initialize] " @@ -1162,6 +1455,119 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::handleRrFo() //------------------------------------------------------------------------------ template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::handleRrFo() +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::handleRrFo] " + + uint32_t o_rc = SUCCESS; + + do + { + // Check if maintenance command complete attention is set. 
+ SCAN_COMM_REGISTER_CLASS * mcbistfir = + iv_chip->getRegister("MCBISTFIR"); + o_rc = mcbistfir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MCBISTFIR"); + break; + } + + // If there is a command complete attention, nothing to do, break out. + if ( mcbistfir->IsBitSet(10) ) + break; + + + // Check if a command is not running. + // If bit 0 of MCB_CNTLSTAT is on, a mcbist run is in progress. + SCAN_COMM_REGISTER_CLASS * mcb_cntlstat = + iv_chip->getRegister("MCB_CNTLSTAT"); + o_rc = mcb_cntlstat->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MCB_CNTLSTAT" ); + break; + } + + // If a command is not running, set command complete attn, break. + if ( !mcb_cntlstat->IsBitSet(0) ) + { + SCAN_COMM_REGISTER_CLASS * mcbistfir_or = + iv_chip->getRegister("MCBISTFIR_OR"); + mcbistfir_or->SetBit( 10 ); + + mcbistfir_or->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on MCBISTFIR_OR" ); + } + break; + } + + // Check if there are unverified chip marks. + std::vector<TdRankListEntry> vectorList = iv_rankList.getList(); + + for ( auto & entry : vectorList ) + { + ExtensibleChip * memPortChip = entry.getChip(); + MemRank rank = entry.getRank(); + + // Get the chip mark + MemMark chipMark; + o_rc = MarkStore::readChipMark<TYPE_MEM_PORT>( memPortChip, rank, + chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_MEM_PORT>(0x%08x,%d) " + "failed", memPortChip->getHuid(), rank.getMaster() ); + break; + } + + if ( !chipMark.isValid() ) continue; // no chip mark present + + // Get the DQ Bitmap data. + TargetHandle_t memPortTrgt = memPortChip->GetChipHandle(); + MemDqBitmap dqBitmap; + + o_rc = getBadDqBitmap( memPortTrgt, rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x, %d)", + getHuid(memPortTrgt), rank.getMaster() ); + break; + } + + // Check if the chip mark is verified or not. 
+ bool cmVerified = false; + o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark failed." ); + break; + } + + // If there are any unverified chip marks, stop the command, break. + if ( !cmVerified ) + { + o_rc = stopBgScrub<TYPE_OCMB_CHIP>( iv_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "stopBgScrub<TYPE_OCMB_CHIP>(0x%08x) " + "failed", iv_chip->getHuid() ); + } + break; + } + } + + } while (0); + + return o_rc; + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> uint32_t MemTdCtlr<TYPE_MBA>::handleRrFo() { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::handleRrFo] " @@ -1327,6 +1733,44 @@ uint32_t MemTdCtlr<TYPE_MCBIST>::canResumeBgScrub( bool & o_canResume ) } template<> +uint32_t MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub( bool & o_canResume ) +{ + #define PRDF_FUNC "[MemTdCtlr<TYPE_OCMB_CHIP>::canResumeBgScrub] " + + uint32_t o_rc = SUCCESS; + + o_canResume = false; + + // It is possible that we were running a TD procedure and the PRD service + // was reset. Therefore, we must check if background scrubbing was actually + // configured. There really is not a good way of doing this. A scrub command + // is a scrub command the only difference is the speed. Unfortunately, that + // speed can change depending on how the hardware team tunes it. For now, we + // can use the stop conditions, which should be unique for background scrub, + // to determine if it has been configured. 
+ + SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", + iv_chip->getHuid() ); + } + else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH + 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH + 0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH + reg->IsBitSet(34) && // pause on MPE + reg->IsBitSet(35) ) // pause on UE + { + o_canResume = true; + } + + return o_rc; + + #undef PRDF_FUNC +} + +template<> uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume ) { #define PRDF_FUNC "[MemTdCtlr<TYPE_MBA>::canResumeBgScrub] " @@ -1365,6 +1809,7 @@ uint32_t MemTdCtlr<TYPE_MBA>::canResumeBgScrub( bool & o_canResume ) // Avoid linker errors with the template. template class MemTdCtlr<TYPE_MCBIST>; template class MemTdCtlr<TYPE_MBA>; +template class MemTdCtlr<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C index 5fc67afb4..6aaa2702a 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C @@ -125,6 +125,12 @@ bool __iueCheck<TYPE_MCA>( uint32_t i_eccAttns ) } template<> inline +bool __iueCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns ) +{ + return ( 0 != (i_eccAttns & MAINT_IUE) ); +} + +template<> inline bool __iueCheck<TYPE_MBA>( uint32_t i_eccAttns ) { // IUES are reported via RCE ETE on Centaur @@ -289,6 +295,53 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd() #undef PRDF_FUNC } +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs during Memory Diagnostics. 
The design is to let the + // command continue to the end of the rank and we do diagnostics on the + // CE counts found in the per-symbol counters. Therefore, all we need to do + // is tell the hardware which CE types to count. + + mss::mcbist::stop_conditions stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only soft/inter CEs. + stopCond.set_nce_soft_symbol_count_enable( mss::ON); + stopCond.set_nce_inter_symbol_count_enable(mss::ON); + break; + + case TD_PHASE_2: + // Set the per symbol counters to count only hard CEs. + stopCond.set_nce_hard_symbol_count_enable(mss::ON); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, SLAVE_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + */ + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MBA @@ -367,6 +420,7 @@ uint32_t TpsEvent<TYPE_MBA>::startCmd() // Avoid linker errors with the template. 
template class TpsEvent<TYPE_MCA>; template class TpsEvent<TYPE_MBA>; +template class TpsEvent<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 635bd0361..e5b3ef74f 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -66,6 +66,13 @@ TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip ) } template<> +TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_OCMB_CHIP>( + ExtensibleChip * i_chip ) +{ + return getOcmbDataBundle(i_chip)->getTpsFalseAlarmCounter(); +} + +template<> TpsFalseAlarm * __getTpsFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) { return getMbaDataBundle(i_chip)->getTpsFalseAlarmCounter(); @@ -138,6 +145,45 @@ bool __badDqCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, return badDqFound; } +template<> +bool __badDqCount<TYPE_OCMB_CHIP>( MemUtils::MaintSymbols i_nibbleStats, + CeCount & io_badDqCount ) +{ + bool badDqFound = false; + + PRDF_ERR( "__badDqCount: Function not supported yet" ); + /* TODO RTC 208263 + for ( auto symData : i_nibbleStats ) + { + // If one of the four symbols has a count of at least 8. + if ( symData.count >= 8 ) + { + // And the sum of the other three symbols is 1 or less. + uint8_t sum = 0; + for ( auto sumCheck : i_nibbleStats) + { + if ( !(symData.symbol == sumCheck.symbol) ) + { + // Check for overflow. 
+ if ( (sum + sumCheck.count) > 0xFF ) + sum = 0xFF; + else + sum += sumCheck.count; + } + } + if ( sum <= 1 ) + { + io_badDqCount.count++; + io_badDqCount.symList.push_back(symData); + badDqFound = true; + break; + } + } + } + */ + + return badDqFound; +} //------------------------------------------------------------------------------ template<TARGETING::TYPE T> @@ -187,6 +233,53 @@ bool __badChipCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, return badChipFound; } +template<> +bool __badChipCount<TYPE_OCMB_CHIP>( MemUtils::MaintSymbols i_nibbleStats, + CeCount & io_badChipCount ) +{ + bool badChipFound = false; + + PRDF_ERR( "__badChipCount: Function not supported yet" ); + /* TODO RTC 208263 + uint8_t nonZeroCount = 0; + uint8_t minCountTwo = 0; + uint8_t sum = 0; + MemUtils::SymbolData highSym; + + for ( auto symData : i_nibbleStats ) + { + // Check for overflow. + if ( (sum + symData.count) > 0xFF ) + sum = 0xFF; + else + sum += symData.count; + + if ( symData.count > 0 ) + nonZeroCount++; + if ( symData.count >= 2 ) + minCountTwo++; + if ( symData.count > highSym.count ) + highSym = symData; + } + + // If the total sum for all four symbols has a count of at least 5 + if ( sum >= 5 ) + { + // And either: + // 3 or more symbols have a non-zero value. + // or 2 symbols, both with a minimum count of 2. 
+ if ( nonZeroCount >= 3 || minCountTwo >= 2 ) + { + io_badChipCount.count++; + io_badChipCount.symList.push_back(highSym); + badChipFound = true; + } + } + */ + + return badChipFound; +} + //------------------------------------------------------------------------------ template<TARGETING::TYPE T> @@ -222,6 +315,38 @@ void __sumAboveOneCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, } } +template<> +void __sumAboveOneCount<TYPE_OCMB_CHIP>( MemUtils::MaintSymbols i_nibbleStats, + CeCount & io_sumAboveOneCount ) +{ + PRDF_ERR( "__sumAboveOneCount: Function not supported yet" ); + /* TODO RTC 208263 + uint8_t sum = 0; + MemUtils::MaintSymbols symList; + for ( auto symData : i_nibbleStats ) + { + if ( symData.count > 0 ) + { + if ( (sum + symData.count) > 0xFF ) + sum = 0xFF; + else + sum += symData.count; + + symList.push_back(symData); + } + } + // If the sum is greater than 1 + if ( sum > 1 ) + { + io_sumAboveOneCount.count++; + for ( auto sym : symList ) + { + io_sumAboveOneCount.symList.push_back(sym); + } + } + */ +} + //------------------------------------------------------------------------------ template<TARGETING::TYPE T> @@ -254,6 +379,35 @@ void __singleSymbolCount<TYPE_MCA>( MemUtils::MaintSymbols i_nibbleStats, io_singleSymCount.count++; } +template<> +void __singleSymbolCount<TYPE_OCMB_CHIP>( MemUtils::MaintSymbols i_nibbleStats, + CeCount & io_singleSymCount ) +{ + PRDF_ERR( "__singleSymbolCount: Function not supported yet" ); + /* TODO RTC 208263 + uint8_t count = 0; + bool multNonZeroSyms = false; + + for ( auto symData : i_nibbleStats ) + { + if ( symData.count > 0 ) + { + if ( 0 != count ) + { + // There are more than one symbol counts that are non-zero + multNonZeroSyms = true; + break; + } + count = symData.count; + } + } + + // If there is only one symbol with a non-zero count and that count > 1 + if ( count > 1 && !multNonZeroSyms ) + io_singleSymCount.count++; + */ +} + 
//------------------------------------------------------------------------------ template<TARGETING::TYPE T> @@ -421,6 +575,114 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeEccErrors( const uint32_t & i_eccAttns, //------------------------------------------------------------------------------ +template <> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeEccErrors(const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done) +{ + #define PRDF_FUNC "[TpsEvent<TYPE_OCMB_CHIP>::analyzeEccErrors] " + + uint32_t o_rc = SUCCESS; + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + do + { + // If there was a UE. + if ( i_eccAttns & MAINT_UE ) + { + PRDF_TRAC( PRDF_FUNC "UE Detected: 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_MaintUE ); + + // At this point we don't actually have an address for the UE. The + // best we can do is get the address in which the command stopped. + MemAddr addr; + o_rc = getMemMaintAddr<TYPE_MCA>( iv_chip, addr ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + o_rc = MemEcc::handleMemUe<TYPE_MCA>( iv_chip, addr, + UE_TABLE::SCRUB_UE, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // Because of the UE, any further TPS requests will likely have no + // effect. So ban all subsequent requests. + MemDbUtils::banTps<TYPE_MCA>( iv_chip, addr.getRank() ); + + // Abort this procedure because additional repairs will likely + // not help (also avoids complication of having UE and MPE at + // the same time). + o_done = true; break; + } + + // If there was an IUE (MNFG only). 
+ if ( mfgMode() && (i_eccAttns & MAINT_IUE) ) + { + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_MaintIUE ); + + o_rc = MemEcc::handleMemIue<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // If service call is set, then IUE threshold was reached. + if ( io_sc.service_data->queryServiceCall() ) + { + PRDF_TRAC( PRDF_FUNC "IUE threshold detected: 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); + + // Abort this procedure because port failure will be triggered + // after analysis is complete. + o_done = true; break; + } + } + + // If there was an MPE. + if ( i_eccAttns & MAINT_MPE ) + { + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_MaintMPE ); + + o_rc = MemEcc::handleMpe<TYPE_MCA>( iv_chip, iv_rank, + UE_TABLE::SCRUB_MPE, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMpe<T>(0x%08x, 0x%02x) failed", + iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // Abort this procedure because the chip mark may have fixed the + // symbol that triggered TPS + o_done = true; break; + } + + }while(0); + */ + + return o_rc; + + #undef PRDF_FUNC + +} + +//------------------------------------------------------------------------------ + template<> uint32_t TpsEvent<TYPE_MCA>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) { @@ -443,6 +705,31 @@ uint32_t TpsEvent<TYPE_MCA>::handleFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::handleFalseAlarm( + STEP_CODE_DATA_STRUCT & io_sc ) +{ + PRDF_ERR( "handleFalseAlarm: Function not supported yet" ); + /* TODO RTC 208263 + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsFalseAlarm ); + + // Increase false alarm counter and check threshold. 
+ if ( __getTpsFalseAlarmCounter<TYPE_MCA>(iv_chip)->inc( iv_rank, io_sc) ) + { + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsFalseAlarmTH ); + + // Permanently mask mainline NCEs and TCEs + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + */ + + return SUCCESS; +} + +//------------------------------------------------------------------------------ + +template<> uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, CeCount i_badChipCount, CeCount i_sumAboveOneCount, CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) @@ -932,6 +1219,497 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, //------------------------------------------------------------------------------ template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeCeSymbolCounts( CeCount i_badDqCount, + CeCount i_badChipCount, CeCount i_sumAboveOneCount, + CeCount i_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) +{ + + #define PRDF_FUNC "[TpsEvent<TYPE_OCMB_CHIP>::analyzeCeSymbolCounts] " + + uint32_t o_rc = SUCCESS; + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + do + { + bool tpsFalseAlarm = false; + + // Get the Bad DQ Bitmap. + TargetHandle_t mcaTrgt = iv_chip->getTrgt(); + MemDqBitmap dqBitmap; + + o_rc = getBadDqBitmap( mcaTrgt, iv_rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x, 0x%02x) failed", + getHuid(mcaTrgt), iv_rank.getKey() ); + break; + } + + // Get the symbol mark. + MemMark symMark; + o_rc = MarkStore::readSymbolMark<TYPE_MCA>( iv_chip, iv_rank, symMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readSymbolMark<TYPE_MCA>(0x%08x, 0x%02x) " + "failed", iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // Get the chip mark. 
+ MemMark chipMark; + o_rc = MarkStore::readChipMark<TYPE_MCA>( iv_chip, iv_rank, chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark<TYPE_MCA>(0x%08x, 0x%02x) " + "failed", iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // If the bad DQ nibble count is 0 and the bad chip nibble count is 0. + if ( 0 == i_badDqCount.count && 0 == i_badChipCount.count ) + { + // There is nothing to repair. Any other non-zero counts are + // considered acceptable noise. + // Set false alarm flag to true. + tpsFalseAlarm = true; + } + // If the bad DQ nibble count is 1 and the bad chip nibble count is 0. + else if ( 1 == i_badDqCount.count && 0 == i_badChipCount.count ) + { + // If the symbol mark is available. + if ( !symMark.isValid() ) + { + // If the sum above one nibble count is <= 1 or sum above one + // nibble count == 2 and single sym nibble count == 2 + if ( (i_sumAboveOneCount.count <= 1) || + (i_sumAboveOneCount.count == 2 && + i_singleSymCount.count == 2) ) + { + // This means we have a potential future chip kill or + // TCE. Both are still correctable after a symbol mark + // is placed. + // Place a symbol mark on this bad DQ. + MemMark newSymMark( mcaTrgt, iv_rank, + i_badDqCount.symList[0].symbol ); + o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip, + iv_rank, newSymMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsSymbolMark ); + + // Update VPD with the symbol mark. + o_rc = dqBitmap.setSymbol( i_badDqCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + } + else + { + // Placing a symbol mark risks a UE. + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. 
+ o_rc = __updateVpdSumAboveOne(i_sumAboveOneCount, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR(PRDF_FUNC "__updateVpdSumAboveOne() failed."); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsSymUeRisk ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + else + { + // Otherwise assume the symbol mark is fixing this bad DQ. + // Set the false alarm flag to true. + tpsFalseAlarm = true; + } + } + // Else if bad DQ nibble count is 2 and bad chip nibble count is 0. + else if ( 2 == i_badDqCount.count && 0 == i_badChipCount.count ) + { + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + + // If the symbol mark is available. + if ( !symMark.isValid() ) + { + // If the sum above one nibble count is = 0 or sum above one + // nibble count = 1 and single sym nibble count = 1 + if ( (i_sumAboveOneCount.count == 0) || + (i_sumAboveOneCount.count == 1 && + i_singleSymCount.count == 1) ) + { + // This means we have only one more potential bad DQ, which + // is correctable after a symbol mark is placed. + // Place a symbol mark on this bad DQ with the highest count + MemUtils::SymbolData highSym; + for ( auto sym : i_badDqCount.symList ) + { + if ( sym.count > highSym.count ) + highSym = sym; + } + + MemMark newSymMark( mcaTrgt, iv_rank, + highSym.symbol ); + o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip, + iv_rank, newSymMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsSymbolMark ); + + // Update VPD with both symbols. 
+ for ( auto sym : i_badDqCount.symList ) + { + o_rc = dqBitmap.setSymbol( sym.symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + } + if ( SUCCESS != o_rc ) break; + } + else + { + // Placing a symbol mark risks a UE. + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne(i_sumAboveOneCount, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR(PRDF_FUNC "__updateVpdSumAboveOne() failed."); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsSymUeRisk ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + + } + else + { + // Otherwise assume the symbol mark is fixing a bad DQ. + // Update VPD with the unrepaired symbol. + for ( auto sym : i_badDqCount.symList ) + { + if ( sym.symbol == symMark.getSymbol() ) continue; + + o_rc = dqBitmap.setSymbol( sym.symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + } + if ( SUCCESS != o_rc ) break; + + // Set the false alarm flag to true. + tpsFalseAlarm = true; + } + + } + // Else if bad DQ nibble count is 0 and bad chip nibble count is 1 + else if ( 0 == i_badDqCount.count && 1 == i_badChipCount.count ) + { + // If the chip mark is available. + if ( !chipMark.isValid() ) + { + // If the sum above one nibble count is = 0 or the sum above one + // nibble count = 1 and the single sym nibble count = 1 + if ( (i_sumAboveOneCount.count == 0) || + (i_sumAboveOneCount.count == 1 && + i_singleSymCount.count == 1) ) + { + // This means we have only one more potential bad DQ, which + // is still correctable after a chip mark is placed. + // Place a chip mark on this bad chip. 
+ MemMark newChipMark( mcaTrgt, iv_rank, + i_badChipCount.symList[0].symbol ); + o_rc = MarkStore::writeChipMark<TYPE_MCA>( iv_chip, iv_rank, + newChipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsChipMark ); + // Update VPD with the chip mark. + o_rc = dqBitmap.setDram( i_badChipCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setDram failed." ); + break; + } + } + else + { + // Placing a mark risks a UE. + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne(i_sumAboveOneCount, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR(PRDF_FUNC "__updateVpdSumAboveOne() failed."); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsChipUeRisk ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + else + { + // Assume the chip mark is being used to fix the bad chip. + // Set the false alarm flag to true. + tpsFalseAlarm = true; + } + } + // Else if bad DQ nibble count is 1 and bad chip nibble count is 1 + else if ( 1 == i_badDqCount.count && 1 == i_badChipCount.count ) + { + // If neither chip nor symbol mark is available. + if ( chipMark.isValid() && symMark.isValid() ) + { + // Assume the chip and symbol marks are already being used to + // fix the bad chip and DQ and some other nibble under + // threshold triggered TPS. + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + // If the chip mark is available. 
+ if ( !chipMark.isValid() ) + { + // If the sum above one nibble count is 0 + if ( 0 == i_sumAboveOneCount.count ) + { + // This means we have no more potential bad DQ or bad chips + // since we can't correct those after chip mark is placed. + // Place a chip mark on the bad chip. + MemMark newChipMark( mcaTrgt, iv_rank, + i_badChipCount.symList[0].symbol ); + o_rc = MarkStore::writeChipMark<TYPE_MCA>( iv_chip, iv_rank, + newChipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + // Check if the current symbol mark is on the same DRAM as + // this newly placed chip mark. + if ( symMark.isValid() && + ( symMark.getSymbol().getDram() == + newChipMark.getSymbol().getDram() ) ) + { + // Since we need to set a symbol mark in addition to + // this chip mark, we need to clear the symbol mark now + // instead of at the end of the function to make room + // for the additional symbol mark. + o_rc = MarkStore::clearSymbolMark<TYPE_MCA>( iv_chip, + iv_rank ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MarkStore::clearSymbolMark(" + "0x%08x,0x%02x) failed", + iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // Now refresh the symMark variable since the mark has + // been removed. + symMark = MemMark(); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsChipMark ); + + // Update VPD with the chip mark. + o_rc = dqBitmap.setDram( i_badChipCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setDram failed." ); + break; + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + else + { + // Placing a chip mark risks a UE. + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. 
+ o_rc = __updateVpdSumAboveOne(i_sumAboveOneCount, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR(PRDF_FUNC "__updateVpdSumAboveOne() failed."); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsChipUeRisk ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + // If the symbol mark is available. + if ( !symMark.isValid() ) + { + // If the sum above one nibble count is 0 + if ( 0 == i_sumAboveOneCount.count ) + { + // This means we have no more potential bad DQ or bad chips + // since we can't correct those after symbol mark is placed. + // Place a symbol mark on this bad DQ. + MemMark newSymMark( mcaTrgt, iv_rank, + i_badDqCount.symList[0].symbol ); + o_rc = MarkStore::writeSymbolMark<TYPE_MCA>( iv_chip, + iv_rank, newSymMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsSymbolMark ); + + // Update VPD with the symbol mark. + o_rc = dqBitmap.setSymbol( i_badDqCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + else + { + // Placing the symbol mark risks a UE. + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne(i_sumAboveOneCount, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR(PRDF_FUNC "__updateVpdSumAboveOne() failed."); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsSymUeRisk ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. 
+ getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + + } + else + { + // There are enough errors that this could be a potential UE. + // For nibbles under threshold with a sum greater than 1, + // update VPD with it's non-zero symbols. + o_rc = __updateVpdSumAboveOne( i_sumAboveOneCount, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdSumAboveOne() failed." ); + } + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsPotentialUe ); + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + + // If analysis resulted in a false alarm. + if ( tpsFalseAlarm ) + { + o_rc = handleFalseAlarm( io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleFalseAlarm() failed on 0x%08x, " + "0x%02x", iv_chip->getHuid(), getKey() ); + } + } + + // Write any updates to VPD. + o_rc = setBadDqBitmap( mcaTrgt, iv_rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "setBadDqBitmap(0x%08x, 0x%02x) failed", + getHuid(mcaTrgt), iv_rank.getKey() ); + break; + } + + // We may have placed a chip mark so do any necessary cleanup. This must + // be called after writing the bad DQ bitmap because the this function + // will also write it if necessary. 
+ o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MarkStore::chipMarkCleanup(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + } while (0); + */ + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> uint32_t TpsEvent<TYPE_MCA>::getSymbolCeCounts( CeCount & io_badDqCount, CeCount & io_badChipCount, CeCount & io_sumAboveOneCount, CeCount & io_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) @@ -1031,6 +1809,109 @@ uint32_t TpsEvent<TYPE_MCA>::getSymbolCeCounts( CeCount & io_badDqCount, //------------------------------------------------------------------------------ +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::getSymbolCeCounts( CeCount & io_badDqCount, + CeCount & io_badChipCount, CeCount & io_sumAboveOneCount, + CeCount & io_singleSymCount, STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[TpsEvent<TYPE_OCMB_CHIP>::getSymbolCeCounts] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + do + { + // Get the Bad DQ Bitmap. 
+ TargetHandle_t mcaTrgt = iv_chip->getTrgt(); + MemDqBitmap dqBitmap; + + o_rc = getBadDqBitmap( mcaTrgt, iv_rank, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap(0x%08x,%d) failed", + getHuid(mcaTrgt), iv_rank.getMaster() ); + break; + } + std::vector<MemSymbol> bmSymList = dqBitmap.getSymbolList(); + + ExtensibleChip * mcbChip = getConnectedParent( iv_chip, TYPE_MCBIST ); + const char * reg_str = nullptr; + SCAN_COMM_REGISTER_CLASS * reg = nullptr; + + for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_PORT; regIdx++ ) + { + reg_str = mcbCeStatReg[regIdx]; + reg = mcbChip->getRegister( reg_str ); + + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s.", reg_str ); + break; + } + uint8_t baseSymbol = SYMBOLS_PER_CE_REG * regIdx; + + for ( uint8_t i = 0; i < SYMBOLS_PER_CE_REG; + i += MEM_SYMBOLS_PER_NIBBLE ) + { + MemUtils::MaintSymbols nibbleStats; + + // Get a nibble's worth of symbols. + for ( uint8_t n = 0; n < MEM_SYMBOLS_PER_NIBBLE; n++ ) + { + uint8_t sym = baseSymbol + (i+n); + PRDF_ASSERT( sym < SYMBOLS_PER_RANK ); + + MemUtils::SymbolData symData; + symData.symbol = MemSymbol::fromSymbol( mcaTrgt, iv_rank, + sym, CEN_SYMBOL::ODD_SYMBOL_DQ ); + if ( !symData.symbol.isValid() ) + { + PRDF_ERR( PRDF_FUNC "MemSymbol() failed: symbol=%d", + sym ); + o_rc = FAIL; + break; + } + + // Any symbol set in the DRAM repairs VPD will have an + // automatic CE count of 0xFF + if ( std::find( bmSymList.begin(), bmSymList.end(), + symData.symbol ) != bmSymList.end() ) + symData.count = 0xFF; + else + symData.count = reg->GetBitFieldJustified(((i+n)*8), 8); + + nibbleStats.push_back( symData ); + + // Add all symbols with non-zero counts to the callout list. + if ( symData.count != 0 ) + { + MemoryMru mm { mcaTrgt, iv_rank, symData.symbol }; + io_sc.service_data->SetCallout( mm ); + } + } + if ( SUCCESS != o_rc ) break; + + // Analyze the nibble of symbols. 
+ __analyzeNibbleSyms<TYPE_MCA>( nibbleStats, io_badDqCount, + io_badChipCount, io_sumAboveOneCount, io_singleSymCount ); + + } + if ( SUCCESS != o_rc ) break; + } + + }while(0); + */ + + return o_rc; + + #undef PRDF_FUNC + +} + +//------------------------------------------------------------------------------ + template <> uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ) @@ -1088,6 +1969,66 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc, //------------------------------------------------------------------------------ +template <> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzeCeStats(STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done) +{ + #define PRDF_FUNC "[TpsEvent<TYPE_OCMB_CHIP>::analyzeCeStats] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + do + { + // The symbol CE counts will be summarized in the following buckets: + // Number of nibbles with a bad DQ + // Number of nibbles with a bad chip + // Number of nibbles under threshold with a sum greater than 1 + // Number of nibbles under threshold with only a single symbol with a + // non-zero count, and that count is > 1 + CeCount badDqCount, badChipCount, sumAboveOneCount, singleSymCount; + + // Get the symbol CE counts. + o_rc = getSymbolCeCounts( badDqCount, badChipCount, sumAboveOneCount, + singleSymCount, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getSymbolCeCounts failed." ); + break; + } + + // If DRAM repairs are disabled, make the error log predictive and + // abort this procedure. + if ( areDramRepairsDisabled() ) + { + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_TpsDramDisabled ); + + io_sc.service_data->setServiceCall(); + break; + } + + // Analyze the symbol CE counts. 
+ o_rc = analyzeCeSymbolCounts(badDqCount, badChipCount, sumAboveOneCount, + singleSymCount, io_sc); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "analyzeCeSymbolCounts failed." ); + break; + } + + }while(0); + */ + + return o_rc; + + #undef PRDF_FUNC + +} + +//------------------------------------------------------------------------------ + template<> uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ) @@ -1143,6 +2084,66 @@ uint32_t TpsEvent<TYPE_MCA>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, #undef PRDF_FUNC } +//------------------------------------------------------------------------------ + +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::analyzePhase( STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) +{ + #define PRDF_FUNC "[TpsEvent::analyzePhase] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + do + { + if ( TD_PHASE_0 == iv_phase ) break; // Nothing to analyze yet. + + // Analyze Ecc Attentions + uint32_t eccAttns; + o_rc = checkEccFirs<TYPE_MCA>( iv_chip, eccAttns ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "checkEccFirs(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + o_rc = analyzeEccErrors( eccAttns, io_sc, o_done ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "analyzeEccErrors() failed." ); + break; + } + if ( o_done ) break; + + // Analyze CEs + o_rc = analyzeCeStats( io_sc, o_done ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "analyzeCeStats() failed." ); + break; + } + if ( o_done ) break; + + // At this point, we are done with the procedure. + o_done = true; + + } while (0); + + if ( (SUCCESS == o_rc) && o_done ) + { + // Clear the ECC FFDC for this master rank. 
+ MemDbUtils::resetEccFfdc<TYPE_MCA>( iv_chip, iv_rank, SLAVE_RANK ); + } + */ + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MCA @@ -1195,6 +2196,55 @@ uint32_t TpsEvent<TYPE_MCA>::startCmd() #undef PRDF_FUNC } +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208263 + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs at runtime. The design is to let the command continue + // to the end of the rank and we do diagnostics on the CE counts found in + // the per-symbol counters. Therefore, all we need to do is tell the + // hardware which CE types to count. + + mss::mcbist::stop_conditions stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only hard CEs. + stopCond.set_nce_hard_symbol_count_enable(mss::ON); + break; + + case TD_PHASE_2: + // Since there are not enough hard CEs to trigger a symbol mark, set + // the per symbol counters to count all CE types. + stopCond.set_nce_soft_symbol_count_enable( mss::ON); + stopCond.set_nce_inter_symbol_count_enable(mss::ON); + stopCond.set_nce_hard_symbol_count_enable( mss::ON); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + // Start the time based scrub procedure on this slave rank. 
+ o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, SLAVE_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + */ + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ template<> @@ -1212,6 +2262,26 @@ uint32_t TpsEvent<TYPE_MCA>::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) return startCmd(); } +//------------------------------------------------------------------------------ + +template<> +uint32_t TpsEvent<TYPE_OCMB_CHIP>::startNextPhase(STEP_CODE_DATA_STRUCT & io_sc) +{ + PRDF_ERR( "startNextPhase: Function not supported yet" ); + /* TODO RTC 208263 + uint32_t signature = 0; + + __getNextPhase<TYPE_MCA>( iv_chip, iv_rank, io_sc, iv_phase, signature ); + + PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x", + iv_phase, iv_chip->getHuid(), getKey() ); + + io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature ); + */ + + return startCmd(); +} + //############################################################################## // // Specializations for MBA diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C index 1bf84ad59..611bd42fa 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C @@ -66,6 +66,32 @@ uint32_t VcmEvent<TYPE_MCA>::startCmd() #undef PRDF_FUNC } +template<> +uint32_t VcmEvent<TYPE_OCMB_CHIP>::startCmd() +{ + #define PRDF_FUNC "[VcmEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208262 + // No stop conditions. + mss::mcbist::stop_conditions stopCond; + + // Start the time based scrub procedure on this master rank. 
+ o_rc = startTdScrub<TYPE_MCA>( iv_chip, iv_rank, MASTER_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + */ + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ template<> @@ -100,6 +126,42 @@ uint32_t VcmEvent<TYPE_MCA>::handlePhaseComplete( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +template<> +uint32_t VcmEvent<TYPE_OCMB_CHIP>::handlePhaseComplete( + const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) +{ + #define PRDF_FUNC "[VcmEvent<TYPE_OCMB_CHIP>::handlePhaseComplete] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208262 + do + { + if ( TD_PHASE_2 == iv_phase ) + { + // Determine if the chip mark has been verified. + o_rc = (i_eccAttns & MAINT_MCE) ? verified(io_sc) + : falseAlarm(io_sc); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "verified()/falseAlarm() failed" ); + break; + } + + o_done = true; // Procedure is complete. + } + + } while (0); + */ + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MBA diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C index 26ef1d727..5ffa9a84b 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -92,6 +92,12 @@ bool __iueCheck<TYPE_MCA>( uint32_t i_eccAttns ) } template<> inline +bool __iueCheck<TYPE_OCMB_CHIP>( uint32_t i_eccAttns ) +{ + return ( 0 != (i_eccAttns & MAINT_IUE) ); +} + +template<> inline bool __iueCheck<TYPE_MBA>( uint32_t i_eccAttns ) { // IUES are reported via RCE ETE on Centaur @@ -218,6 +224,7 @@ uint32_t VcmEvent<TYPE_MBA>::startCmd() // Avoid linker errors with the template. template class VcmEvent<TYPE_MCA>; template class VcmEvent<TYPE_MBA>; +template class VcmEvent<TYPE_OCMB_CHIP>; } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C index ca4de8e5a..67c64b90a 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -55,6 +55,12 @@ VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip ) } template<> +VcmFalseAlarm * __getFalseAlarmCounter<TYPE_OCMB_CHIP>(ExtensibleChip * i_chip) +{ + return getOcmbDataBundle(i_chip)->getVcmFalseAlarmCounter(); +} + +template<> VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) { return getMbaDataBundle(i_chip)->getVcmFalseAlarmCounter(); @@ -144,6 +150,87 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +template<> +uint32_t VcmEvent<TYPE_OCMB_CHIP>::checkEcc( const uint32_t & i_eccAttns, + STEP_CODE_DATA_STRUCT & io_sc, + bool & o_done ) +{ + #define PRDF_FUNC "[VcmEvent<TYPE_OCMB_CHIP>::checkEcc] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208262 + do + { + if ( i_eccAttns & MAINT_UE ) + { + PRDF_TRAC( PRDF_FUNC "UE Detected: 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_MaintUE ); + + // At this point we don't actually have an address for the UE. The + // best we can do is get the address in which the command stopped. + MemAddr addr; + o_rc = getMemMaintAddr<TYPE_MCA>( iv_chip, addr ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + o_rc = MemEcc::handleMemUe<TYPE_MCA>( iv_chip, addr, + UE_TABLE::SCRUB_UE, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // Because of the UE, any further TPS requests will likely have no + // effect. So ban all subsequent requests. + MemDbUtils::banTps<TYPE_MCA>( iv_chip, addr.getRank() ); + + // Leave the mark in place and abort this procedure. 
+ o_done = true; break; + } + + if ( mfgMode() && (i_eccAttns & MAINT_IUE) ) + { + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_MaintIUE ); + + o_rc = MemEcc::handleMemIue<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // If service call is set, then IUE threshold was reached. + if ( io_sc.service_data->queryServiceCall() ) + { + PRDF_TRAC( PRDF_FUNC "IUE threshold detected: 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); + + // Leave the mark in place and abort this procedure. + o_done = true; break; + } + } + + } while (0); + */ + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ template<> @@ -180,6 +267,43 @@ uint32_t VcmEvent<TYPE_MCA>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) #undef PRDF_FUNC } +template<> +uint32_t VcmEvent<TYPE_OCMB_CHIP>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::cleanup] " + + uint32_t o_rc = SUCCESS; + + PRDF_ERR( PRDF_FUNC "Function not supported yet" ); + /* TODO RTC 208262 + do + { + o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", + iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // The cleanup() function is called by both verified() and falseAlarm(). + // In either case, the error log should be predictive if there has been + // at least one false alarm on any DRAM on this rank other than this + // DRAM. This is required on Nimbus because of two symbol correction, + // which does not exist on Centaur. 
+ VcmFalseAlarm * faCntr = __getFalseAlarmCounter<TYPE_MCA>(iv_chip); + uint8_t dram = iv_mark.getSymbol().getDram(); + if ( faCntr->queryDrams(iv_rank, dram, io_sc) ) + io_sc.service_data->setServiceCall(); + + } while (0); + */ + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MBA @@ -386,6 +510,7 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) // Avoid linker errors with the template. template class VcmEvent<TYPE_MCA>; template class VcmEvent<TYPE_MBA>; +template class VcmEvent<TYPE_OCMB_CHIP>; //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C index 4a4391c0c..0e11b1a86 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2018 */ +/* Contributors Listed Below - COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -301,9 +301,9 @@ int32_t commandAddrTimeout( ExtensibleChip * i_chip, // was executed. Restarting the command will likely fail with the same // issue. Callout and gard all MCAs in which the command was executed. - std::vector<ExtensibleChip *> mcaList; + ExtensibleChipList mcaList; - if ( SUCCESS != getMcbistMaintPort(i_chip, mcaList) ) + if ( SUCCESS != getMcbistMaintPort<TYPE_MCBIST>(i_chip, mcaList) ) { PRDF_ERR( PRDF_FUNC "getMcbistMaintPort(0x%08x) failed", i_chip->getHuid() ); |