From 38666ab58f157b82c3dca2d782667cf071a75cb2 Mon Sep 17 00:00:00 2001 From: Zane Shelley Date: Sat, 2 Jun 2018 17:28:45 -0500 Subject: PRD: create MarkStore::applyRasPolicies() Change-Id: Ifd08172b960b5c526a014076e79d5c45df54ee45 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59819 Reviewed-by: Caleb N. Palmer Reviewed-by: Matt Derksen Reviewed-by: Brian J. Stegmiller Reviewed-by: Benjamin J. Weisenbeck Tested-by: Jenkins Server Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/60136 Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW --- src/usr/diag/prdf/common/plat/mem/prdfMemMark.C | 193 ++++++++++++++++-------- src/usr/diag/prdf/common/plat/mem/prdfMemMark.H | 49 ++++-- src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 26 ++-- 3 files changed, 180 insertions(+), 88 deletions(-) diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C index 9a94e3fbb..64193ea66 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C @@ -391,7 +391,7 @@ template<> uint32_t __clearFetchAttn( ExtensibleChip * i_chip, const MemRank & i_rank ) { - #define PRDF_FUNC "[__readMarks] " + #define PRDF_FUNC "[__clearFetchAttn] " uint32_t o_rc = SUCCESS; @@ -880,31 +880,43 @@ uint32_t writeSymbolMark( ExtensibleChip * i_chip, #ifdef __HOSTBOOT_MODULE // Not supported on FSP. 
+//------------------------------------------------------------------------------ + +void __addCallout( ExtensibleChip * i_chip, const MemRank & i_rank, + const MemSymbol & i_symbol, STEP_CODE_DATA_STRUCT & io_sc ) +{ + if ( i_symbol.isValid() ) + { + MemoryMru mm { i_chip->getTrgt(), i_rank, i_symbol }; + io_sc.service_data->SetCallout( mm ); + } +} + +//------------------------------------------------------------------------------ + template uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, const MemMark & i_chipMark, - const MemMark & i_symMark ); + const MemMark & i_symMark, + TdEntry * & o_dsdEvent, bool & o_allRepairsUsed ); template<> uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, const MemMark & i_chipMark, - const MemMark & i_symMark ) + const MemMark & i_symMark, + TdEntry * & o_dsdEvent, + bool & o_allRepairsUsed ) { // There is no DRAM sparing on Nimbus so simply check if both the chip and // symbol mark have been used. if ( i_chipMark.isValid() && i_symMark.isValid() ) { - io_sc.service_data->setServiceCall(); + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_AllDramRepairs ); - - #ifdef __HOSTBOOT_RUNTIME - // No more repairs left so no point doing any more TPS procedures. - MemDbUtils::banTps( i_chip, i_rank ); - #endif } return SUCCESS; @@ -915,14 +927,14 @@ uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc, const MemMark & i_chipMark, - const MemMark & i_symMark ) + const MemMark & i_symMark, + TdEntry * & o_dsdEvent, + bool & o_allRepairsUsed ) { #define PRDF_FUNC "[__applyRasPolicies] " uint32_t o_rc = SUCCESS; - bool allRepairsUsed = false; - do { const uint8_t ps = i_chipMark.getSymbol().getPortSlct(); @@ -964,21 +976,9 @@ uint32_t __applyRasPolicies( ExtensibleChip * i_chip, */ // Add the spares to the callout list if they exist. 
- if ( sp0.isValid() ) - { - MemoryMru mm { i_chip->getTrgt(), i_rank, sp0 }; - io_sc.service_data->SetCallout( mm ); - } - if ( sp1.isValid() ) - { - MemoryMru mm { i_chip->getTrgt(), i_rank, sp1 }; - io_sc.service_data->SetCallout( mm ); - } - if ( ecc.isValid() ) - { - MemoryMru mm { i_chip->getTrgt(), i_rank, ecc }; - io_sc.service_data->SetCallout( mm ); - } + __addCallout( i_chip, i_rank, sp0, io_sc ); + __addCallout( i_chip, i_rank, sp1, io_sc ); + __addCallout( i_chip, i_rank, ecc, io_sc ); // If the chip mark is on a spare then the spare is bad and hardware // can not steer it to another DRAM even if one is available (e.g. @@ -987,7 +987,7 @@ uint32_t __applyRasPolicies( ExtensibleChip * i_chip, ( (1 == ps) && sp1.isValid() && (dram == sp1.getDram()) ) || ( isX4 && ecc.isValid() && (dram == ecc.getDram()) ) ) { - allRepairsUsed = true; + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_VcmBadSpare ); break; // Nothing more to do. @@ -1011,21 +1011,19 @@ uint32_t __applyRasPolicies( ExtensibleChip * i_chip, (0 == ps ? !sp0.isValid() : !sp1.isValid()) ) { // A spare DRAM is available. - TdEntry * e = new DsdEvent{ i_chip, i_rank, - i_chipMark }; - MemDbUtils::pushToQueue( i_chip, e ); + o_dsdEvent = new DsdEvent{ i_chip, i_rank, + i_chipMark }; } else if ( eccSparePossible && !ecc.isValid() ) { // The ECC spare is available. - TdEntry * e = new DsdEvent{ i_chip, i_rank, - i_chipMark, true }; - MemDbUtils::pushToQueue( i_chip, e ); + o_dsdEvent = new DsdEvent{ i_chip, i_rank, + i_chipMark, true }; } else { // Chip mark is in place and sparing is not possible. - allRepairsUsed = true; + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_AllDramRepairs ); } @@ -1034,36 +1032,36 @@ uint32_t __applyRasPolicies( ExtensibleChip * i_chip, // mark have been used. 
else if ( i_chipMark.isValid() && i_symMark.isValid() ) { - allRepairsUsed = true; + o_allRepairsUsed = true; io_sc.service_data->setSignature( i_chip->getHuid(), PRDFSIG_AllDramRepairs ); } } while (0); - if ( allRepairsUsed ) - { - io_sc.service_data->setServiceCall(); - - #ifdef __HOSTBOOT_RUNTIME - // No more repairs left so no point doing any more TPS procedures. - MemDbUtils::banTps( i_chip, i_rank ); - #endif - } - return o_rc; #undef PRDF_FUNC } +//------------------------------------------------------------------------------ + template -uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, - STEP_CODE_DATA_STRUCT & io_sc ) +uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ) { - #define PRDF_FUNC "[chipMarkCleanup] " + #define PRDF_FUNC "[MarkStore::applyRasPolicies] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; + delete o_dsdEvent; o_dsdEvent = nullptr; // just in case + + bool allRepairsUsed = false; + do { // Get the chip mark. @@ -1080,8 +1078,7 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, if ( !chipMark.isValid() ) break; // Add the chip mark to the callout list. - MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() }; - io_sc.service_data->SetCallout( cm_mm ); + __addCallout( i_chip, i_rank, chipMark.getSymbol(), io_sc ); // Get the symbol mark. MemMark symMark; @@ -1095,7 +1092,8 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, // If both the chip and symbol mark are on the same DRAM, clear the // symbol mark. 
- if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() ) + if ( symMark.isValid() && + chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() ) { o_rc = clearSymbolMark( i_chip, i_rank ); if ( SUCCESS != o_rc ) @@ -1110,11 +1108,7 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, } // Add the symbol mark to the callout list if it exists. - if ( symMark.isValid() ) - { - MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() }; - io_sc.service_data->SetCallout( sm_mm ); - } + __addCallout( i_chip, i_rank, symMark.getSymbol(), io_sc ); // Make the error log predictive and exit if DRAM repairs are disabled. if ( areDramRepairsDisabled() ) @@ -1123,25 +1117,96 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, break; // nothing else to do } - // Set the chip mark in the DRAM Repairs VPD. - o_rc = setDramInVpd( i_chip, i_rank, chipMark.getSymbol() ); + // Apply type specific RAS policies. + o_rc = __applyRasPolicies( i_chip, i_rank, io_sc, chipMark, symMark, + o_dsdEvent, allRepairsUsed ); + if ( SUCCESS != o_rc ) break; + + } while (0); + + if ( allRepairsUsed ) + { + io_sc.service_data->setServiceCall(); + + #ifdef __HOSTBOOT_RUNTIME + // No more repairs left so no point doing any more TPS procedures. 
+ MemDbUtils::banTps( i_chip, i_rank ); + #endif + } + + return o_rc; + + #undef PRDF_FUNC +} + +template +uint32_t applyRasPolicies( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ); +template +uint32_t applyRasPolicies( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ); + +//------------------------------------------------------------------------------ + +template +uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[chipMarkCleanup] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( T == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // It is possible this function was called and there is no chip mark. So + // first check if one exists. + MemMark chipMark; + o_rc = readChipMark( i_chip, i_rank, chipMark ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", + PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,0x%02x) failed", i_chip->getHuid(), i_rank.getKey() ); break; } - // Apply RAS policies. - o_rc = __applyRasPolicies( i_chip, i_rank, io_sc, chipMark, - symMark ); + // There is nothing else to do if there is no chip mark. + if ( !chipMark.isValid() ) break; + + // Apply all RAS policies. + TdEntry * dsdEvent = nullptr; + o_rc = applyRasPolicies( i_chip, i_rank, io_sc, dsdEvent ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__applyRasPolicies(0x%08x,0x%02x) failed", + PRDF_ERR( PRDF_FUNC "applyRasPolicies(0x%08x,0x%02x) failed", i_chip->getHuid(), i_rank.getKey() ); break; } + // Add the DRAM spare event to the queue if needed. + if ( nullptr != dsdEvent ) + { + MemDbUtils::pushToQueue( i_chip, dsdEvent ); + } + + // Set the chip mark in the DRAM Repairs VPD. 
+ if ( !areDramRepairsDisabled() ) + { + o_rc = setDramInVpd( i_chip, i_rank, chipMark.getSymbol() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed", + i_chip->getHuid(), i_rank.getKey() ); + break; + } + } + } while (0); return o_rc; diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H index fca039258..d0f8c57ef 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H @@ -36,6 +36,10 @@ #include #include +#ifdef __HOSTBOOT_MODULE + #include +#endif + //############################################################################## // class MemMark //############################################################################## @@ -173,21 +177,44 @@ uint32_t writeSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank, template uint32_t clearSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank ); +#ifdef __HOSTBOOT_MODULE // Not supported on FSP. + +/** + * @brief Applies RAS policies on a rank of memory based on the number + * of repairs available on that rank. + * + * Function details: + * - Removes the symbol mark if it is on the same DRAM as the chip mark. + * - Adds the following to the callout list if they exist: chip mark, symbol + * mark, DRAM spares, and ECC spare. + * - Makes the error log predictive and exits if DRAM repairs are disabled. + * - Makes the error log predictive and bans TPS on this rank if all available + * repairs have been used. + * - Returns a new DsdEvent if DRAM sparing is available. + * + * @param i_chip MBA or MCA chip. + * @param i_rank Target rank. + * @param io_sc The step code data struct. + * @param o_dsdEvent A new DsdEvent if DRAM sparing is available. Otherwise, + * nullptr. Note that this is not used in all cases so the + * event will need to be manually deleted if not added to the + * TD queue. + * @return Non-SUCCESS if an internal function fails. 
SUCCESS otherwise. + */ +template +uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc, + TdEntry * & o_dsdEvent ); + /** * @brief If a chip mark has been verified or explicitly set due other RAS * policies, this function does all the necessary cleanup. * * Function details: - * - Adds the chip mark to the callout list. - * - Removes the symbol mark if it is on the same DRAM as the chip mark. - * - Adds the symbol mark to the callout list if it exists on another DRAM. - * - If DRAM repairs are disabled: - * - Makes the error log predictive. - * - Otherwise: - * - Sets the DRAM in the DRAM Repair VPD. - * - Makes the error log predictive if RAS policies apply. - * - Adds a DSD procedure to the TD queue is a DRAM spare is available. - * - Bans TPS on the rank if all repairs are used. + * - Calls applyRasPolicies() to make any necessary callouts. + * - If DRAM repairs are not disabled: + * - Sets the DRAM in the DRAM Repair VPD. + * - Adds a DSD procedure to the TD queue if a DRAM spare is available. * * @param i_chip MBA or MCA chip. * @param i_rank Target rank. @@ -198,6 +225,8 @@ template uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc ); +#endif // Not supported on FSP. + } // end namespace MarkStore } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 29270ae34..dfd36e9be 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -1520,25 +1520,23 @@ uint32_t TpsEvent::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc ) // after the VCM procedure. 
if ( chipMark.isValid() ) { - /* TODO RTC 189221 DRAM sparing support - bool available; - o_rc = checkForAvailableSpares( iv_mark.getCM().getPortSlct(), - available ); - if ( SUCCESS != o_rc ) + TdEntry * dsdEvent = nullptr; + o_rc = MarkStore::applyRasPolicies( iv_chip, iv_rank, + io_sc, dsdEvent ); + if ( nullptr != dsdEvent ) { - PRDF_ERR( PRDF_FUNC "checkForAvailableSpares() failed" ); - break; + // We don't want to do the DRAM spare procedure at this time, + // because we haven't even run the VCM procedure yet. So just + // delete the procedure instead of adding it to the queue. + delete dsdEvent; dsdEvent = nullptr; } - if ( !available ) + if ( SUCCESS != o_rc ) { - // Spares have been used. Callout the mark. Make the error log - // predictive. - CalloutUtil::calloutMark( iv_mbaTrgt, iv_rank, iv_mark, io_sc ); - setTdSignature( io_sc, PRDFSIG_TpsCmAndSpare ); - io_sc.service_data->setServiceCall(); + PRDF_ERR( PRDF_FUNC "applyRasPolicies(0x%08x, 0x%02x) failed.", + iv_chip->getHuid(), iv_rank.getKey() ); + break; } - */ } } while (0); -- cgit v1.2.1