diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2018-06-02 17:28:45 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-06-08 22:45:47 -0400 |
commit | 38666ab58f157b82c3dca2d782667cf071a75cb2 (patch) | |
tree | b26c051744ee9abb117e7c58094ecc6d8ec4ea40 | |
parent | e38d6b0d199b045cce44db69f8594eaaa0990a9c (diff) | |
download | talos-hostboot-38666ab58f157b82c3dca2d782667cf071a75cb2.tar.gz talos-hostboot-38666ab58f157b82c3dca2d782667cf071a75cb2.zip |
PRD: create MarkStore::applyRasPolicies()
Change-Id: Ifd08172b960b5c526a014076e79d5c45df54ee45
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59819
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/60136
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemMark.C | 193 | ||||
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemMark.H | 49 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 26 |
3 files changed, 180 insertions, 88 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C index 9a94e3fbb..64193ea66 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C @@ -391,7 +391,7 @@ template<> uint32_t __clearFetchAttn<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank ) { - #define PRDF_FUNC "[__readMarks<TYPE_MBA>] " + #define PRDF_FUNC "[__clearFetchAttn<TYPE_MBA>] " uint32_t o_rc = SUCCESS; @@ -880,31 +880,43 @@ uint32_t writeSymbolMark<TYPE_MBA>( ExtensibleChip * i_chip, #ifdef __HOSTBOOT_MODULE // Not supported on FSP. +//------------------------------------------------------------------------------ + +void __addCallout( ExtensibleChip * i_chip, const MemRank & i_rank, + const MemSymbol & i_symbol, STEP_CODE_DATA_STRUCT & io_sc ) +{ + if ( i_symbol.isValid() ) + { + MemoryMru mm { i_chip->getTrgt(), i_rank, i_symbol }; + io_sc.service_data->SetCallout( mm ); + } +} + +//------------------------------------------------------------------------------ + template<TARGETING::TYPE T> uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, const MemMark & i_chipMark, - const MemMark & i_symMark ); + const MemMark & i_symMark, + TdEntry * & o_dsdEvent, bool & o_allRepairsUsed ); template<> uint32_t __applyRasPolicies<TYPE_MCA>( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, const MemMark & i_chipMark, - const MemMark & i_symMark ) + const MemMark & i_symMark, + TdEntry * & o_dsdEvent, + bool & o_allRepairsUsed ) { // There is no DRAM sparing on Nimbus so simply check if both the chip and // symbol mark have been used. 
if ( i_chipMark.isValid() && i_symMark.isValid() ) { - io_sc.service_data->setServiceCall(); + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_AllDramRepairs ); - - #ifdef __HOSTBOOT_RUNTIME - // No more repairs left so no point doing any more TPS procedures. - MemDbUtils::banTps<TYPE_MCA>( i_chip, i_rank ); - #endif } return SUCCESS; @@ -915,14 +927,14 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, const MemMark & i_chipMark, - const MemMark & i_symMark ) + const MemMark & i_symMark, + TdEntry * & o_dsdEvent, + bool & o_allRepairsUsed ) { #define PRDF_FUNC "[__applyRasPolicies<TYPE_MBA>] " uint32_t o_rc = SUCCESS; - bool allRepairsUsed = false; - do { const uint8_t ps = i_chipMark.getSymbol().getPortSlct(); @@ -964,21 +976,9 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, */ // Add the spares to the callout list if they exist. - if ( sp0.isValid() ) - { - MemoryMru mm { i_chip->getTrgt(), i_rank, sp0 }; - io_sc.service_data->SetCallout( mm ); - } - if ( sp1.isValid() ) - { - MemoryMru mm { i_chip->getTrgt(), i_rank, sp1 }; - io_sc.service_data->SetCallout( mm ); - } - if ( ecc.isValid() ) - { - MemoryMru mm { i_chip->getTrgt(), i_rank, ecc }; - io_sc.service_data->SetCallout( mm ); - } + __addCallout( i_chip, i_rank, sp0, io_sc ); + __addCallout( i_chip, i_rank, sp1, io_sc ); + __addCallout( i_chip, i_rank, ecc, io_sc ); // If the chip mark is on a spare then the spare is bad and hardware // can not steer it to another DRAM even if one is available (e.g. @@ -987,7 +987,7 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, ( (1 == ps) && sp1.isValid() && (dram == sp1.getDram()) ) || ( isX4 && ecc.isValid() && (dram == ecc.getDram()) ) ) { - allRepairsUsed = true; + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_VcmBadSpare ); break; // Nothing more to do. 
@@ -1011,21 +1011,19 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, (0 == ps ? !sp0.isValid() : !sp1.isValid()) ) { // A spare DRAM is available. - TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank, - i_chipMark }; - MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e ); + o_dsdEvent = new DsdEvent<TYPE_MBA>{ i_chip, i_rank, + i_chipMark }; } else if ( eccSparePossible && !ecc.isValid() ) { // The ECC spare is available. - TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank, - i_chipMark, true }; - MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e ); + o_dsdEvent = new DsdEvent<TYPE_MBA>{ i_chip, i_rank, + i_chipMark, true }; } else { // Chip mark is in place and sparing is not possible. - allRepairsUsed = true; + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_AllDramRepairs ); } @@ -1034,36 +1032,36 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, // mark have been used. else if ( i_chipMark.isValid() && i_symMark.isValid() ) { - allRepairsUsed = true; + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_AllDramRepairs ); } } while (0); - if ( allRepairsUsed ) - { - io_sc.service_data->setServiceCall(); - - #ifdef __HOSTBOOT_RUNTIME - // No more repairs left so no point doing any more TPS procedures. 
- MemDbUtils::banTps<TYPE_MBA>( i_chip, i_rank ); - #endif - } - return o_rc; #undef PRDF_FUNC } +//------------------------------------------------------------------------------ + template<TARGETING::TYPE T> -uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, - STEP_CODE_DATA_STRUCT & io_sc ) +uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ) { - #define PRDF_FUNC "[chipMarkCleanup] " + #define PRDF_FUNC "[MarkStore::applyRasPolicies] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; + delete o_dsdEvent; o_dsdEvent = nullptr; // just in case + + bool allRepairsUsed = false; + do { // Get the chip mark. @@ -1080,8 +1078,7 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, if ( !chipMark.isValid() ) break; // Add the chip mark to the callout list. - MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() }; - io_sc.service_data->SetCallout( cm_mm ); + __addCallout( i_chip, i_rank, chipMark.getSymbol(), io_sc ); // Get the symbol mark. MemMark symMark; @@ -1095,7 +1092,8 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, // If both the chip and symbol mark are on the same DRAM, clear the // symbol mark. - if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() ) + if ( symMark.isValid() && + chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() ) { o_rc = clearSymbolMark<T>( i_chip, i_rank ); if ( SUCCESS != o_rc ) @@ -1110,11 +1108,7 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, } // Add the symbol mark to the callout list if it exists. 
- if ( symMark.isValid() ) - { - MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() }; - io_sc.service_data->SetCallout( sm_mm ); - } + __addCallout( i_chip, i_rank, symMark.getSymbol(), io_sc ); // Make the error log predictive and exit if DRAM repairs are disabled. if ( areDramRepairsDisabled() ) @@ -1123,25 +1117,96 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, break; // nothing else to do } - // Set the chip mark in the DRAM Repairs VPD. - o_rc = setDramInVpd<T>( i_chip, i_rank, chipMark.getSymbol() ); + // Apply type specific RAS policies. + o_rc = __applyRasPolicies<T>( i_chip, i_rank, io_sc, chipMark, symMark, + o_dsdEvent, allRepairsUsed ); + if ( SUCCESS != o_rc ) break; + + } while (0); + + if ( allRepairsUsed ) + { + io_sc.service_data->setServiceCall(); + + #ifdef __HOSTBOOT_RUNTIME + // No more repairs left so no point doing any more TPS procedures. + MemDbUtils::banTps<T>( i_chip, i_rank ); + #endif + } + + return o_rc; + + #undef PRDF_FUNC +} + +template +uint32_t applyRasPolicies<TYPE_MCA>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ); +template +uint32_t applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ); + +//------------------------------------------------------------------------------ + +template<TARGETING::TYPE T> +uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[chipMarkCleanup] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( T == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // It is possible this function was called and there is no chip mark. So + // first check if one exists. 
+ MemMark chipMark; + o_rc = readChipMark<T>( i_chip, i_rank, chipMark ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", + PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,0x%02x) failed", i_chip->getHuid(), i_rank.getKey() ); break; } - // Apply RAS policies. - o_rc = __applyRasPolicies<T>( i_chip, i_rank, io_sc, chipMark, - symMark ); + // There is nothing else to do if there is no chip mark. + if ( !chipMark.isValid() ) break; + + // Apply all RAS policies. + TdEntry * dsdEvent = nullptr; + o_rc = applyRasPolicies<T>( i_chip, i_rank, io_sc, dsdEvent ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__applyRasPolicies(0x%08x,0x%02x) failed", + PRDF_ERR( PRDF_FUNC "applyRasPolicies(0x%08x,0x%02x) failed", i_chip->getHuid(), i_rank.getKey() ); break; } + // Add the DRAM spare event to the queue if needed. + if ( nullptr != dsdEvent ) + { + MemDbUtils::pushToQueue<T>( i_chip, dsdEvent ); + } + + // Set the chip mark in the DRAM Repairs VPD. + if ( !areDramRepairsDisabled() ) + { + o_rc = setDramInVpd<T>( i_chip, i_rank, chipMark.getSymbol() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + } + } while (0); return o_rc; diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H index fca039258..d0f8c57ef 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H @@ -36,6 +36,10 @@ #include <prdfP9McaExtraSig.H> #include <prdfPlatServices.H> +#ifdef __HOSTBOOT_MODULE + #include <prdfMemTdQueue.H> +#endif + //############################################################################## // class MemMark //############################################################################## @@ -173,21 +177,44 @@ uint32_t writeSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank, template<TARGETING::TYPE T> uint32_t clearSymbolMark( 
ExtensibleChip * i_chip, const MemRank & i_rank ); +#ifdef __HOSTBOOT_MODULE // Not supported on FSP. + +/** + * @brief Applies RAS policies on a rank of memory based on the number + * of repairs available on that rank. + * + * Function details: + * - Removes the symbol mark if it is on the same DRAM as the chip mark. + * - Adds the following to the callout list if they exist: chip mark, symbol + * mark, DRAM spares, and ECC spare. + * - Makes the error log predictive and exits if DRAM repairs are disabled. + * - Makes the error log predictive and bans TPS on this rank if all available + * repairs have been used. + * - Returns a new DsdEvent if DRAM sparing is available. + * + * @param i_chip MBA or MCA chip. + * @param i_rank Target rank. + * @param io_sc The step code data struct. + * @param o_dsdEvent A new DsdEvent if DRAM sparing is available. Otherwise, + * nullptr. Note that this is not used in all cases so the + * event will need to be manually deleted if not added to the + * TD queue. + * @return Non-SUCCESS if an internal function fails. SUCCESS otherwise. + */ +template<TARGETING::TYPE T> +uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ); + /** * @brief If a chip mark has been verified or explicitly set due to other RAS * policies, this function does all the necessary cleanup. * * Function details: - * - Adds the chip mark to the callout list. - * - Removes the symbol mark if it is on the same DRAM as the chip mark. - * - Adds the symbol mark to the callout list if it exists on another DRAM. - * - If DRAM repairs are disabled: - * - Makes the error log predictive. - * - Otherwise: - * - Sets the DRAM in the DRAM Repair VPD. - * - Makes the error log predictive if RAS policies apply. - * - Adds a DSD procedure to the TD queue is a DRAM spare is available. - * - Bans TPS on the rank if all repairs are used. 
+ * - Calls applyRasPolicies() to make any necessary callouts. + * - If DRAM repairs are not disabled: + * - Sets the DRAM in the DRAM Repair VPD. + * - Adds a DSD procedure to the TD queue if a DRAM spare is available. * * @param i_chip MBA or MCA chip. * @param i_rank Target rank. @@ -198,6 +225,8 @@ template<TARGETING::TYPE T> uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc ); +#endif // Not supported on FSP. + } // end namespace MarkStore } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 29270ae34..dfd36e9be 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -1520,25 +1520,23 @@ uint32_t TpsEvent<TYPE_MBA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc ) // after the VCM procedure. if ( chipMark.isValid() ) { - /* TODO RTC 189221 DRAM sparing support - bool available; - o_rc = checkForAvailableSpares( iv_mark.getCM().getPortSlct(), - available ); - if ( SUCCESS != o_rc ) + TdEntry * dsdEvent = nullptr; + o_rc = MarkStore::applyRasPolicies<TYPE_MBA>( iv_chip, iv_rank, + io_sc, dsdEvent ); + if ( nullptr != dsdEvent ) { - PRDF_ERR( PRDF_FUNC "checkForAvailableSpares() failed" ); - break; + // We don't want to do the DRAM spare procedure at this time, + // because we haven't even run the VCM procedure yet. So just + // delete the procedure instead of adding it to the queue. + delete dsdEvent; dsdEvent = nullptr; } - if ( !available ) + if ( SUCCESS != o_rc ) { - // Spares have been used. Callout the mark. Make the error log - // predictive. - CalloutUtil::calloutMark( iv_mbaTrgt, iv_rank, iv_mark, io_sc ); - setTdSignature( io_sc, PRDFSIG_TpsCmAndSpare ); - io_sc.service_data->setServiceCall(); + PRDF_ERR( PRDF_FUNC "applyRasPolicies(0x%08x, 0x%02x) failed.", + iv_chip->getHuid(), iv_rank.getKey() ); + break; } - */ } } while (0); |