diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2018-05-20 15:30:09 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-05-23 15:11:41 -0400 |
commit | b8037fcdbe638eff35b075b571f4c709689a2b21 (patch) | |
tree | 36b8e517864fa0938a9bb9fb7ffb3c84f1f733c0 | |
parent | c7867f1449a1434338c2513c90b4a40438fa94d2 (diff) | |
download | talos-hostboot-b8037fcdbe638eff35b075b571f4c709689a2b21.tar.gz talos-hostboot-b8037fcdbe638eff35b075b571f4c709689a2b21.zip |
PRD: cleanup after placing/verifying a chip mark on Centaur
Change-Id: I0688d00875ce97595a18b3338aff3f8f59e19ff3
RTC: 193261
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59117
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59233
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C | 17 | ||||
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemMark.C | 287 | ||||
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemMark.H | 67 | ||||
-rwxr-xr-x | src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C | 81 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemDsd.H | 9 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 24 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm.H | 34 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C | 40 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C | 232 |
9 files changed, 455 insertions, 336 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index 467441c5a..bf0508d70 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -1111,25 +1111,14 @@ uint32_t analyzeImpe<TYPE_MCA>( ExtensibleChip * i_chip, break; } - o_rc = MarkStore::balance<TYPE_MCA>( i_chip, rank, io_sc ); + o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( i_chip, rank, + io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "balance(0x%08x,0x%02x) failed", + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", i_chip->getHuid(), rank.getKey() ); break; } - - // Set the dram in DRAM Repairs VPD. - o_rc = setDramInVpd<TYPE_MCA>( i_chip, rank, symbol ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", - i_chip->getHuid(), rank.getKey() ); - break; - } - - // Add a DRAM sparing procedure to the queue, if supported. - // TODO: RTC 157888 } } diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C index 84e7c09b5..35f7803e7 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C @@ -30,6 +30,8 @@ #include <prdfMemDbUtils.H> #ifdef __HOSTBOOT_MODULE +#include <prdfCenMbaExtraSig.H> +#include <prdfMemDsd.H> #include <prdfMemVcm.H> #endif @@ -872,7 +874,290 @@ uint32_t writeSymbolMark<TYPE_MBA>( ExtensibleChip * i_chip, #undef PRDF_FUNC } -//------------------------------------------------------------------------------ +//############################################################################## +// Utilities to cleanup markstore after a chip mark is verified +//############################################################################## + +#ifdef __HOSTBOOT_MODULE // Not supported on FSP. + +template<TARGETING::TYPE T> +uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + const MemMark & i_chipMark, + const MemMark & i_symMark ); + +template<> +uint32_t __applyRasPolicies<TYPE_MCA>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + const MemMark & i_chipMark, + const MemMark & i_symMark ) +{ + // There is no DRAM sparing on Nimbus so simply check if both the chip and + // symbol mark have been used. + if ( i_chipMark.isValid() && i_symMark.isValid() ) + { + io_sc.service_data->setServiceCall(); + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_AllDramRepairs ); + + #ifdef __HOSTBOOT_RUNTIME + // No more repairs left so no point doing any more TPS procedures. + MemDbUtils::banTps<TYPE_MCA>( i_chip, i_rank ); + #endif + } + + return SUCCESS; +} + +template<> +uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + const MemMark & i_chipMark, + const MemMark & i_symMark ) +{ + #define PRDF_FUNC "[__applyRasPolicies<TYPE_MBA>] " + + uint32_t o_rc = SUCCESS; + + bool allRepairsUsed = false; + + do + { + const uint8_t ps = i_chipMark.getSymbol().getPortSlct(); + const uint8_t dram = i_chipMark.getSymbol().getDram(); + + const bool isX4 = isDramWidthX4( i_chip->getTrgt() ); + + // Determine if DRAM sparing is enabled. + bool isEnabled = isX4; // Always an ECC spare in x4 mode. + + if ( !isEnabled ) + { + /* TODO RTC 189221 + // Check for any DRAM spares. + uint8_t cnfg = ENUM_ATTR_VPD_DIMM_SPARE_NO_SPARE; + o_rc = getDimmSpareConfig<TYPE_MBA>( i_chip, i_rank, ps, cnfg ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getDimmSpareConfig(0x%08x,0x%02x,%d) " + "failed", i_chip->getHuid(), i_rank.getKey(), ps ); + break; + } + isEnabled = (ENUM_ATTR_VPD_DIMM_SPARE_NO_SPARE != cnfg); + */ + } + + if ( isEnabled ) + { + // Sparing is enabled. Get the current spares in hardware. + MemSymbol sp0, sp1, ecc; + /* TODO RTC 189221 + o_rc = mssGetSteerMux<TYPE_MBA>( i_chip, i_rank, sp0, sp1, ecc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "mssGetSteerMux(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + */ + + // Add the spares to the callout list if they exist. + if ( sp0.isValid() ) + { + MemoryMru mm { i_chip->getTrgt(), i_rank, sp0 }; + io_sc.service_data->SetCallout( mm ); + } + if ( sp1.isValid() ) + { + MemoryMru mm { i_chip->getTrgt(), i_rank, sp1 }; + io_sc.service_data->SetCallout( mm ); + } + if ( ecc.isValid() ) + { + MemoryMru mm { i_chip->getTrgt(), i_rank, ecc }; + io_sc.service_data->SetCallout( mm ); + } + + // If the chip mark is on a spare then the spare is bad and hardware + // can not steer it to another DRAM even if one is available (e.g. + // the ECC spare). In this this case, make error log predictive. + if ( ( dram == (0 == ps ? sp0.getDram() : sp1.getDram()) ) || + ( dram == ecc.getDram() ) ) + { + allRepairsUsed = true; + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_VcmBadSpare ); + break; // Nothing more to do. + } + + // Certain DIMMs may have had spares intentially made unavailable by + // the manufacturer. Check the VPD for available spares. + bool dramSparePossible = false; + bool eccSparePossible = false; + /* TODO RTC 189221 + o_rc = bitmap.isSpareAvailable( ps, dramSparePossible, + eccSparePossible ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "isDramSpareAvailable() failed" ); + break; + } + */ + + if ( dramSparePossible && + (0 == ps ? !sp0.isValid() : !sp1.isValid()) ) + { + // A spare DRAM is available. + TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank, + i_chipMark }; + MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e ); + } + else if ( eccSparePossible && !ecc.isValid() ) + { + // The ECC spare is available. + TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank, + i_chipMark, true }; + MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e ); + } + else + { + // Chip mark is in place and sparing is not possible. + allRepairsUsed = true; + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_AllDramRepairs ); + } + } + // There is no DRAM sparing so simply check if both the chip and symbol + // mark have been used. + else if ( i_chipMark.isValid() && i_symMark.isValid() ) + { + allRepairsUsed = true; + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_AllDramRepairs ); + } + + } while (0); + + if ( allRepairsUsed ) + { + io_sc.service_data->setServiceCall(); + + #ifdef __HOSTBOOT_RUNTIME + // No more repairs left so no point doing any more TPS procedures. + MemDbUtils::banTps<TYPE_MCA>( i_chip, i_rank ); + #endif + } + + return o_rc; + + #undef PRDF_FUNC +} + +template<TARGETING::TYPE T> +uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[chipMarkCleanup] " + + uint32_t o_rc = SUCCESS; + + do + { + // Get the chip mark. + MemMark chipMark; + o_rc = readChipMark<T>( i_chip, i_rank, chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + + // There is nothing else to do if there is no chip mark. + if ( !chipMark.isValid() ) break; + + // Add the chip mark to the callout list. + MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() }; + io_sc.service_data->SetCallout( cm_mm ); + + // Get the symbol mark. + MemMark symMark; + o_rc = readSymbolMark<T>( i_chip, i_rank, symMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readSymbolMark(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + + // If both the chip and symbol mark are on the same DRAM, clear the + // symbol mark. + if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() ) + { + o_rc = clearSymbolMark<T>( i_chip, i_rank ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearSymbolMark(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + + // Reset the symbol mark variable to invalid. + symMark = MemMark(); + } + + // Add the symbol mark to the callout list if it exists. + if ( symMark.isValid() ) + { + MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() }; + io_sc.service_data->SetCallout( sm_mm ); + } + + // Make the error log predictive and exit if DRAM repairs are disabled. + if ( areDramRepairsDisabled() ) + { + io_sc.service_data->setServiceCall(); + break; // nothing else to do + } + + // Set the chip mark in the DRAM Repairs VPD. + o_rc = setDramInVpd<TYPE_MCA>( i_chip, i_rank, chipMark.getSymbol() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + + // Apply RAS policies. + o_rc = __applyRasPolicies<T>( i_chip, i_rank, io_sc, chipMark, + symMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__applyRasPolicies(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template +uint32_t chipMarkCleanup<TYPE_MCA>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t chipMarkCleanup<TYPE_MBA>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ); + +#endif // not supported on FSP } // end namespace MarkStore diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H index 8ea692ea9..fca039258 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H @@ -174,62 +174,29 @@ template<TARGETING::TYPE T> uint32_t clearSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank ); /** - * @brief If a rank contains a symbol mark that is on the same DRAM as the chip - * mark, the symbol mark is removed. This is done to free up available - * repairs. Will also apply RAS policies where necessary. + * @brief If a chip mark has been verified or explicitly set due other RAS + * policies, this function does all the necessary cleanup. + * + * Function details: + * - Adds the chip mark to the callout list. + * - Removes the symbol mark if it is on the same DRAM as the chip mark. + * - Adds the symbol mark to the callout list if it exists on another DRAM. + * - If DRAM repairs are disabled: + * - Makes the error log predictive. + * - Otherwise: + * - Sets the DRAM in the DRAM Repair VPD. + * - Makes the error log predictive if RAS policies apply. + * - Adds a DSD procedure to the TD queue is a DRAM spare is available. + * - Bans TPS on the rank if all repairs are used. + * * @param i_chip MBA or MCA chip. * @param i_rank Target rank. * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails. SUCCESS otherwise. */ template<TARGETING::TYPE T> -uint32_t balance( ExtensibleChip * i_chip, const MemRank & i_rank, - STEP_CODE_DATA_STRUCT & io_sc ) -{ - uint32_t o_rc = SUCCESS; - - do - { - // Get the chip mark. - MemMark chipMark; - o_rc = readChipMark<T>( i_chip, i_rank, chipMark ); - if ( SUCCESS != o_rc ) break; - if ( !chipMark.isValid() ) break; // nothing to do. - - // Get the symbol mark. - MemMark symMark; - o_rc = readSymbolMark<T>( i_chip, i_rank, symMark ); - if ( SUCCESS != o_rc ) break; - if ( !symMark.isValid() ) break; // nothing to do. - - // If both the chip and symbol mark are on the same DRAM, clear the - // symbol mark. - if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() ) - { - o_rc = clearSymbolMark<T>( i_chip, i_rank ); - if ( SUCCESS != o_rc ) break; - } - else - { - // Both a chip and symbol mark exist, but they are on separate - // DRAMs. So, make the error log predictive. - io_sc.service_data->setServiceCall(); - io_sc.service_data->setSignature( i_chip->getHuid(), - PRDFSIG_AllDramRepairs ); - - // The chip and symbol mark may be on different DIMMs (Centaur ranks - // span two DIMMs). Therefore, we must add both to the callout list - // to ensure all DIMMs are in the callout list. - MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() }; - MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() }; - io_sc.service_data->SetCallout( cm_mm ); - io_sc.service_data->SetCallout( sm_mm ); - } - - } while (0); - - return o_rc; -} +uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ); } // end namespace MarkStore diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C index 88fd4dc00..f3ee6884f 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C @@ -596,87 +596,6 @@ int32_t CenMbaTdCtlr::startTpsPhase1( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::addTdQueueEntryTPS( const CenRank & i_rank, - STEP_CODE_DATA_STRUCT & io_sc, - bool i_banTps ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::addTdQueueEntryTPS] " - - int32_t o_rc = SUCCESS; - - do - { - if ( iv_tpsRankData.isBanned(i_rank, io_sc) ) - { - // TPS is banned, do not add the request to the queue. - break; - } - - // Check for any available repairs. There is no point doing TPS if we - // cannot apply a repair. - CenMark mark; - o_rc = mssGetMarkStore( iv_mbaTrgt, i_rank, mark ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "mssGetMarkStore() failed." ); - break; - } - if ( mark.getCM().isValid() && - (iv_x4Dimm || (!iv_x4Dimm && mark.getSM().isValid())) ) - { - bool port0Available, port1Available; - o_rc = checkForAvailableSpares( 0, port0Available ); - o_rc |= checkForAvailableSpares( 1, port1Available ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "checkForAvailableSpares() failed." ); - break; - } - - if ( !port0Available && !port1Available ) - { - // Ban TPS to avoid rechecking with subsequent TPS requests. - iv_tpsRankData.ban( iv_rank ); - - // TPS is banned, do not add the request to the queue. - break; - } - } - - if ( i_banTps ) - { - // Ban all future TPS requests for this rank (not including - // this one). - iv_tpsRankData.ban( i_rank ); - } - - // Push the TD request to the queue. - iv_queue.push( TdQueueEntry(TPS_EVENT, i_rank) ); - - // Mark this rank as bad. - o_rc = iv_masterRanks.setBad( i_rank ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "setBad() failed" ); - break; - } - - } while(0); - - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Failed: i_rank=m%ds%d i_banTps=%c", - i_rank.getMaster(), i_rank.getSlave(), - i_banTps ? 'T' : 'F' ); - } - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::handleUe_Td( STEP_CODE_DATA_STRUCT & io_sc, const CenAddr & i_stopAddr, bool i_addTpsRequest ) diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDsd.H b/src/usr/diag/prdf/plat/mem/prdfMemDsd.H index de1816927..73f133832 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDsd.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemDsd.H @@ -51,8 +51,9 @@ class DsdEvent : public TdEntry * @param i_rank Rank reporting chip mark. */ DsdEvent<T>( ExtensibleChip * i_chip, const MemRank & i_rank, - const MemMark & i_mark ) : - TdEntry(DSD_EVENT, i_chip, i_rank), iv_mark(i_mark) + const MemMark & i_mark, bool i_eccSpare = false ) : + TdEntry(DSD_EVENT, i_chip, i_rank), iv_mark(i_mark), + iv_eccSpare(i_eccSpare) { PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); @@ -166,7 +167,9 @@ class DsdEvent : public TdEntry private: // instance variables - const MemMark iv_mark; ///< The chip mark from hardware. + const MemMark iv_mark; ///< The chip mark from hardware. + const bool iv_eccSpare; ///< True if the spare should be applied to the x4 + ///< DRAM ECC spare. }; } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 57443c036..97cfeb557 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -947,16 +947,6 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, } } - // We may have placed a chip mark, so if a symbol mark is being used on - // the same chip, undo the symbol mark after the chip mark is in place. - o_rc = MarkStore::balance<TYPE_MCA>( iv_chip, iv_rank, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", - iv_chip->getHuid(), getKey() ); - break; - } - // Write any updates to VPD. o_rc = setBadDqBitmap<DIMMS_PER_RANK::MCA>(mcaTrgt, iv_rank, dqBitmap); if ( SUCCESS != o_rc ) @@ -966,7 +956,19 @@ uint32_t TpsEvent<TYPE_MCA>::analyzeCeSymbolCounts( CeCount i_badDqCount, iv_rank.getKey()); break; } - }while(0); + + // We may have placed a chip mark so do any necessary cleanup. This must + // be called after writing the bad DQ bitmap because the this function + // will also write it if necessary. + o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MarkStore::chipMarkCleanup(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + } while (0); return o_rc; diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H index ccfa4475c..0de5f2fcf 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H @@ -277,19 +277,11 @@ class VcmEvent : public TdEntry io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_VcmVerified ); - if ( PlatServices::areDramRepairsDisabled() ) + // Leave the chip mark in place and do any necessary cleanup. + o_rc = cleanup( io_sc ); + if ( SUCCESS != o_rc ) { - // Make the error log predictive, nothing else to do. - io_sc.service_data->setServiceCall(); - } - else - { - // Leave the chip mark in place and do any necessary cleanup. - o_rc = cleanup( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "cleanup() failed" ); - } + PRDF_ERR( PRDF_FUNC "cleanup() failed" ); } return o_rc; @@ -305,7 +297,23 @@ class VcmEvent : public TdEntry * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ - uint32_t cleanup( STEP_CODE_DATA_STRUCT & io_sc ); + uint32_t cleanup( STEP_CODE_DATA_STRUCT & io_sc ) + { + #define PRDF_FUNC "[VcmEvent::cleanup] " + + uint32_t o_rc = SUCCESS; + + o_rc = MarkStore::chipMarkCleanup<T>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", + iv_chip->getHuid(), iv_rank.getKey() ); + } + + return o_rc; + + #undef PRDF_FUNC + } /** * @brief Verification failed. Do additional processing such as removing diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C index 69e8a26e6..d3de6d20d 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C @@ -83,46 +83,6 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ template<TARGETING::TYPE T> -uint32_t VcmEvent<T>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[VcmEvent::cleanup] " - - uint32_t o_rc = SUCCESS; - - do - { - // If there is a symbol mark on the same DRAM as the newly verified chip - // mark, remove the symbol mark. - o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", - iv_chip->getHuid(), getKey() ); - break; - } - - // Set the dram in DRAM Repairs VPD. - o_rc = setDramInVpd<T>( iv_chip, iv_rank, iv_mark.getSymbol() ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", - iv_chip->getHuid(), iv_rank.getKey() ); - break; - } - - // Add a DRAM sparing procedure to the queue, if supported. - // TODO: RTC 157888 - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -template<TARGETING::TYPE T> bool __iueCheck( uint32_t i_eccAttns ); template<> inline diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C index 64f760486..b7cb13653 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C @@ -62,129 +62,6 @@ VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) //############################################################################## // -// Generic template functions -// -//############################################################################## - -template<TARGETING::TYPE T> -uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[VcmEvent::falseAlarm] " - - uint32_t o_rc = SUCCESS; - - PRDF_TRAC( PRDF_FUNC "Chip mark false alarm: 0x%08x,0x%02x", - iv_chip->getHuid(), getKey() ); - - io_sc.service_data->setSignature( iv_chip->getHuid(), - PRDFSIG_VcmFalseAlarm ); - - do - { - // If DRAM repairs are disabled, make the error log predictive. - if ( areDramRepairsDisabled() ) - { - io_sc.service_data->setServiceCall(); - break; // Nothing more to do. - } - - // Increment the false alarm counter and check threshold. - uint8_t dram = iv_mark.getSymbol().getDram(); - if ( __getFalseAlarmCounter<T>(iv_chip)->inc(iv_rank, dram, io_sc) ) - { - // False alarm threshold has been reached. - - io_sc.service_data->setSignature( iv_chip->getHuid(), - PRDFSIG_VcmFalseAlarmTH ); - - PRDF_TRAC( PRDF_FUNC "False alarm threshold: 0x%08x,0x%02x", - iv_chip->getHuid(), getKey() ); - - // Leave the chip mark in place and do any necessary cleanup. - o_rc = cleanup( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "cleanup() failed" ); - break; - } - } - else - { - // Remove the chip mark. - o_rc = MarkStore::clearChipMark<T>( iv_chip, iv_rank ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "clearChipMark(0x%08x,0x%02x) failed", - iv_chip->getHuid(), getKey() ); - break; - } - } - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -template<TARGETING::TYPE T> -uint32_t VcmEvent<T>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[VcmEvent::cleanup] " - - uint32_t o_rc = SUCCESS; - - do - { - // If there is a symbol mark on the same DRAM as the newly verified chip - // mark, remove the symbol mark. - o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", - iv_chip->getHuid(), getKey() ); - break; - } - - // Set the dram in DRAM Repairs VPD. - o_rc = setDramInVpd<T>( iv_chip, iv_rank, iv_mark.getSymbol() ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", - iv_chip->getHuid(), iv_rank.getKey() ); - break; - } - - // Add a DRAM sparing procedure to the queue, if supported. - // TODO: RTC 157888 - - // The cleanup() function is called by both verified() and falseAlarm(). - // In either case, we can pass in the DRAM characterized by iv_mark to - // determine if there has been a least one false alarm on any DRAM on - // this rank other than this DRAM. If so, the error log should be - // predictive. - VcmFalseAlarm * faCntr = __getFalseAlarmCounter<T>(iv_chip); - uint8_t dram = iv_mark.getSymbol().getDram(); - if ( faCntr->queryDrams(iv_rank, dram, io_sc) ) - io_sc.service_data->setServiceCall(); - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -// Avoid linker errors with the template. -template class VcmEvent<TYPE_MCA>; -template class VcmEvent<TYPE_MBA>; - -//############################################################################## -// // Specializations for MCA // //############################################################################## @@ -288,6 +165,42 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +//------------------------------------------------------------------------------ + +template<> +uint32_t VcmEvent<TYPE_MCA>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::cleanup] " + + uint32_t o_rc = SUCCESS; + + do + { + o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", + iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // The cleanup() function is called by both verified() and falseAlarm(). + // In either case, the error log should be predictive if there has been + // a least one false alarm on any DRAM on this rank other than this + // DRAM. This is required on Nimbus because of two symbol correction, + // which does not exist on Centaur. + VcmFalseAlarm * faCntr = __getFalseAlarmCounter<TYPE_MCA>(iv_chip); + uint8_t dram = iv_mark.getSymbol().getDram(); + if ( faCntr->queryDrams(iv_rank, dram, io_sc) ) + io_sc.service_data->setServiceCall(); + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + //############################################################################## // // Specializations for MBA @@ -400,6 +313,79 @@ uint32_t VcmEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, #undef PRDF_FUNC } +//############################################################################## +// +// Generic template functions +// +//############################################################################## + +template<TARGETING::TYPE T> +uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::falseAlarm] " + + uint32_t o_rc = SUCCESS; + + PRDF_TRAC( PRDF_FUNC "Chip mark false alarm: 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmFalseAlarm ); + + do + { + // If DRAM repairs are disabled, make the error log predictive. + if ( areDramRepairsDisabled() ) + { + io_sc.service_data->setServiceCall(); + break; // Nothing more to do. + } + + // Increment the false alarm counter and check threshold. + uint8_t dram = iv_mark.getSymbol().getDram(); + if ( __getFalseAlarmCounter<T>(iv_chip)->inc(iv_rank, dram, io_sc) ) + { + // False alarm threshold has been reached. + + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmFalseAlarmTH ); + + PRDF_TRAC( PRDF_FUNC "False alarm threshold: 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); + + // Leave the chip mark in place and do any necessary cleanup. + o_rc = cleanup( io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "cleanup() failed" ); + break; + } + } + else + { + // Remove the chip mark. + o_rc = MarkStore::clearChipMark<T>( iv_chip, iv_rank ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearChipMark(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +// Avoid linker errors with the template. +template class VcmEvent<TYPE_MCA>; +template class VcmEvent<TYPE_MBA>; + //------------------------------------------------------------------------------ } // end namespace PRDF |