From 574346780f244fc333f4fcc407214eac3c10b9f8 Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Thu, 14 Nov 2019 15:04:58 -0600 Subject: PRD: Update CE/UE flood threshold to reset on new ranks Change-Id: I89dce691642ebf4d753812bfae111d14c52753e3 CQ: SW480922 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/87032 Tested-by: Jenkins Server Reviewed-by: Zane C Shelley Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/88203 Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins --- .../resolution/prdfThresholdResolutions.H | 13 ++-- .../diag/prdf/common/plat/mem/prdfMemEccAnalysis.C | 28 +++++++++ .../diag/prdf/common/plat/mem/prdfOcmbDataBundle.H | 11 +++- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C | 2 +- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H | 4 +- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C | 69 +++++++++++++++++----- .../diag/prdf/plat/mem/prdfP9McbistDataBundle.H | 12 +++- src/usr/diag/prdf/plat/prdfPlatServices.C | 8 +-- src/usr/diag/prdf/plat/prdfPlatServices_rt.C | 33 +++++------ src/usr/diag/prdf/plat/prdfPlatServices_rt.H | 4 +- 10 files changed, 137 insertions(+), 47 deletions(-) (limited to 'src') diff --git a/src/usr/diag/prdf/common/framework/resolution/prdfThresholdResolutions.H b/src/usr/diag/prdf/common/framework/resolution/prdfThresholdResolutions.H index e412460dc..b61699159 100755 --- a/src/usr/diag/prdf/common/framework/resolution/prdfThresholdResolutions.H +++ b/src/usr/diag/prdf/common/framework/resolution/prdfThresholdResolutions.H @@ -5,7 +5,9 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* COPYRIGHT International Business Machines Corp. 2003,2014 */ +/* Contributors Listed Below - COPYRIGHT 2003,2019 */ +/* [+] International Business Machines Corp. */ +/* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. 
*/ @@ -100,10 +102,11 @@ class ThresholdResolution : public MaskResolution enum TimeBase { - ONE_SEC = 1, - ONE_MIN = ONE_SEC * 60, - ONE_HOUR = ONE_MIN * 60, - ONE_DAY = ONE_HOUR * 24, + ONE_SEC = 1, + ONE_MIN = ONE_SEC * 60, + ONE_HOUR = ONE_MIN * 60, + TEN_HOURS = ONE_HOUR * 10, + ONE_DAY = ONE_HOUR * 24, NONE = 0xffffffff, }; diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index e58e50e5c..f206a074e 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -127,6 +127,20 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr, i_chip->getHuid(), i_type ); break; } + + #ifdef __HOSTBOOT_RUNTIME + // Increment the UE counter and store the rank we're on, resetting + // the UE and CE counts if we have stopped on a new rank. + ExtensibleChip * mcb = getConnectedParent( i_chip, TYPE_MCBIST ); + McbistDataBundle * mcbdb = getMcbistDataBundle(mcb); + if ( mcbdb->iv_ceUeRank != i_addr.getRank() ) + { + mcbdb->iv_ceStopCounter.reset(); + mcbdb->iv_ueStopCounter.reset(); + } + mcbdb->iv_ueStopCounter.inc( io_sc ); + mcbdb->iv_ceUeRank = i_addr.getRank(); + #endif } } while (0); @@ -180,6 +194,20 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, i_chip->getHuid(), i_type ); break; } + + #ifdef __HOSTBOOT_RUNTIME + // Increment the UE counter and store the rank we're on, resetting + // the UE and CE counts if we have stopped on a new rank. 
+ OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip); + if ( ocmbdb->iv_ceUeRank != i_addr.getRank() ) + { + ocmbdb->iv_ceStopCounter.reset(); + ocmbdb->iv_ueStopCounter.reset(); + } + ocmbdb->iv_ueStopCounter.inc( io_sc ); + ocmbdb->iv_ceUeRank = i_addr.getRank(); + #endif + } } while (0); diff --git a/src/usr/diag/prdf/common/plat/mem/prdfOcmbDataBundle.H b/src/usr/diag/prdf/common/plat/mem/prdfOcmbDataBundle.H index 499baf00c..75d7dd53e 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfOcmbDataBundle.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfOcmbDataBundle.H @@ -210,8 +210,15 @@ class OcmbDataBundle : public DataBundle // These are used to limit the number of times a scrub command will stop // on a UE or CE on a rank. This is to prevent potential flooding of // maintenance UEs or CEs. The threshold will be 16 per rank for each. - ScrubResumeCounter iv_ueScrubStopCounter; - ScrubResumeCounter iv_ceScrubStopCounter; + TimeBasedThreshold iv_ueStopCounter = + TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS ); + TimeBasedThreshold iv_ceStopCounter = + TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS );; + + // If we stop on a UE or a CE, we will need to store the rank that the + // error is on so that we can clear our respective thresholds if the + // next error we stop on is on a different rank. + MemRank iv_ceUeRank; #else // IPL only diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C index fcd347793..564dafdf4 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.C @@ -384,7 +384,7 @@ uint32_t MemTdCtlr::analyzeCmdComplete( bool & o_errorsFound, // of in defaultStep() because a TD procedure could have been run // before defaultStep() and it is possible that canResumeBgScrub() // could give as a false positive in that case. 
- o_rc = canResumeBgScrub( iv_resumeBgScrub ); + o_rc = canResumeBgScrub( iv_resumeBgScrub, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "canResumeBgScrub(0x%08x) failed", diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H index f1c072eea..da969e2c1 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr.H @@ -294,9 +294,11 @@ class MemTdCtlr /** * @param o_canResume True, if background scrubbing can be resumed. False, * if a new background scrub command must be started. + * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ - uint32_t canResumeBgScrub( bool & o_canResume ); + uint32_t canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ); #endif diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index da18cea81..5565e217f 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -313,7 +313,7 @@ uint32_t MemTdCtlr::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) PRDF_TRAC( PRDF_FUNC "Calling resumeBgScrub(0x%08x)", iv_chip->getHuid() ); - o_rc = resumeBgScrub( iv_chip ); + o_rc = resumeBgScrub( iv_chip, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "resumeBgScrub(0x%08x) failed", @@ -388,10 +388,48 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip, uint32_t count = symData.size(); switch ( T ) { - case TYPE_MCA: PRDF_ASSERT( 1 <= count && count <= 2 ); break; - case TYPE_MBA: PRDF_ASSERT( 1 == count ); break; - case TYPE_OCMB_CHIP: PRDF_ASSERT( 1 <= count && count <= 2 ); break; - default: PRDF_ASSERT( false ); + case TYPE_MCA: + { + PRDF_ASSERT( 1 <= count && count <= 2 ); + // Increment the CE counter and store the rank we're on, + // reset the UE and CE counts if we have stopped on a new rank. 
+ ExtensibleChip * mcb = getConnectedParent(i_chip, TYPE_MCBIST); + McbistDataBundle * mcbdb = getMcbistDataBundle(mcb); + if ( mcbdb->iv_ceUeRank != i_addr.getRank() ) + { + mcbdb->iv_ceStopCounter.reset(); + mcbdb->iv_ueStopCounter.reset(); + } + mcbdb->iv_ceStopCounter.inc( io_sc ); + mcbdb->iv_ceUeRank = i_addr.getRank(); + + break; + } + case TYPE_MBA: + { + PRDF_ASSERT( 1 == count ); + break; + } + case TYPE_OCMB_CHIP: + { + PRDF_ASSERT( 1 <= count && count <= 2 ); + // Increment the CE counter and store the rank we're on, + // reset the UE and CE counts if we have stopped on a new rank. + OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip); + if ( ocmbdb->iv_ceUeRank != i_addr.getRank() ) + { + ocmbdb->iv_ceStopCounter.reset(); + ocmbdb->iv_ueStopCounter.reset(); + } + ocmbdb->iv_ceStopCounter.inc( io_sc ); + ocmbdb->iv_ceUeRank = i_addr.getRank(); + + break; + } + default: + { + PRDF_ASSERT( false ); + } } for ( auto & d : symData ) @@ -1607,7 +1645,8 @@ uint32_t MemTdCtlr::handleRrFo() //------------------------------------------------------------------------------ template<> -uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) +uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr::canResumeBgScrub] " @@ -1638,8 +1677,8 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) // of UEs or CEs that we have stopped on on a rank. // If we haven't hit CE or UE threshold, check the CE stop conditions - if ( !getMcbistDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() && - !getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() ) + if ( !getMcbistDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) && + !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) { // If the stop conditions aren't set, just break out. 
if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH @@ -1652,7 +1691,7 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) } // If we haven't hit UE threshold yet, check the UE stop condition - if ( !getMcbistDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() ) + if ( !getMcbistDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) { // If the stop condition isn't set, just break out if ( !reg->IsBitSet(35) ) // pause on UE @@ -1677,7 +1716,8 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) } template<> -uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) +uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr::canResumeBgScrub] " @@ -1708,8 +1748,8 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) // of UEs or CEs that we have stopped on on a rank. // If we haven't hit CE or UE threshold, check the CE stop conditions - if ( !getOcmbDataBundle(iv_chip)->iv_ceScrubStopCounter.atTh() && - !getOcmbDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() ) + if ( !getOcmbDataBundle(iv_chip)->iv_ceStopCounter.thReached(io_sc) && + !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) { // If the stop conditions aren't set, just break out. 
if ( !(0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH @@ -1722,7 +1762,7 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) } // If we haven't hit UE threshold yet, check the UE stop condition - if ( !getOcmbDataBundle(iv_chip)->iv_ueScrubStopCounter.atTh() ) + if ( !getOcmbDataBundle(iv_chip)->iv_ueStopCounter.thReached(io_sc) ) { // If the stop condition isn't set, just break out if ( !reg->IsBitSet(35) ) // pause on UE @@ -1747,7 +1787,8 @@ uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) } template<> -uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) +uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr::canResumeBgScrub] " diff --git a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H index 3883eb936..44ef77ec7 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H +++ b/src/usr/diag/prdf/plat/mem/prdfP9McbistDataBundle.H @@ -36,6 +36,7 @@ // Platform includes #include #include +#include namespace PRDF { @@ -88,8 +89,15 @@ class McbistDataBundle : public DataBundle // These are used to limit the number of times a scrub command will stop // on a UE or CE on a rank. This is to prevent potential flooding of // maintenance UEs or CEs. The threshold will be 16 per rank for each. - ScrubResumeCounter iv_ueScrubStopCounter; - ScrubResumeCounter iv_ceScrubStopCounter; + TimeBasedThreshold iv_ueStopCounter = + TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS ); + TimeBasedThreshold iv_ceStopCounter = + TimeBasedThreshold( 16, ThresholdResolution::TEN_HOURS ); + + // If we stop on a UE or a CE, we will need to store the rank that the + // error is on so that we can clear our respective thresholds if the + // next error we stop on is on a different rank. 
+ MemRank iv_ceUeRank; #endif }; diff --git a/src/usr/diag/prdf/plat/prdfPlatServices.C b/src/usr/diag/prdf/plat/prdfPlatServices.C index 55bc84657..a3112f54f 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices.C @@ -865,8 +865,8 @@ uint32_t startBgScrub( ExtensibleChip * i_mcaChip, #ifdef __HOSTBOOT_RUNTIME // Starting a new command. Clear the UE and CE scrub stop counters - getMcbistDataBundle( mcbChip )->iv_ueScrubStopCounter.reset(); - getMcbistDataBundle( mcbChip )->iv_ceScrubStopCounter.reset(); + getMcbistDataBundle( mcbChip )->iv_ueStopCounter.reset(); + getMcbistDataBundle( mcbChip )->iv_ceStopCounter.reset(); #endif // Get the stop conditions. @@ -1445,8 +1445,8 @@ uint32_t startBgScrub( ExtensibleChip * i_ocmb, #ifdef __HOSTBOOT_RUNTIME // Starting a new command. Clear the UE and CE scrub stop counters - getOcmbDataBundle( mcbChip )->iv_ueScrubStopCounter.reset(); - getOcmbDataBundle( mcbChip )->iv_ceScrubStopCounter.reset(); + getOcmbDataBundle( mcbChip )->iv_ueStopCounter.reset(); + getOcmbDataBundle( mcbChip )->iv_ceStopCounter.reset(); #endif // Get the stop conditions. 
diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C index 74ba41e30..09eff0b6e 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C @@ -152,7 +152,8 @@ uint32_t stopBgScrub( ExtensibleChip * i_chip ) //------------------------------------------------------------------------------ template<> -uint32_t resumeBgScrub( ExtensibleChip * i_chip ) +uint32_t resumeBgScrub( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[PlatServices::resumeBgScrub] " @@ -177,13 +178,13 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) // Check UE and CE stop counters to determine stop conditions mss::mcbist::stop_conditions<> stopCond; - if ( getMcbistDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() ) + if ( getMcbistDataBundle(i_chip)->iv_ueStopCounter.thReached(io_sc) ) { // If we've reached the limit of UEs we're allowed to stop on // per rank, only set the stop on mpe stop condition. stopCond.set_pause_on_mpe(mss::ON); } - else if ( getMcbistDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() ) + else if (getMcbistDataBundle(i_chip)->iv_ceStopCounter.thReached(io_sc)) { // If we've reached the limit of CEs we're allowed to stop on // per rank, set all the normal stop conditions except stop on CE @@ -208,12 +209,9 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) } // Resume the command on the next address. - // Note: we have to limit the number of times a command has been stopped - // because of a UE/CE. Therefore, we must always resume the command to - // the end of the current slave rank so we can reset the UE/CE counts. 
errlHndl_t errl; FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt, - mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond ); + mss::mcbist::end_boundary::DONT_CHANGE, stopCond ); if ( nullptr != errl ) { @@ -233,12 +231,14 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) //------------------------------------------------------------------------------ template<> -uint32_t resumeBgScrub( ExtensibleChip * i_chip ) +uint32_t resumeBgScrub( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); - return resumeBgScrub(getConnectedParent(i_chip, TYPE_MCBIST)); + return resumeBgScrub(getConnectedParent(i_chip, TYPE_MCBIST), + io_sc); } //############################################################################## @@ -378,7 +378,8 @@ uint32_t __resumeScrub( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ template<> -uint32_t resumeBgScrub( ExtensibleChip * i_chip ) +uint32_t resumeBgScrub( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); @@ -458,7 +459,8 @@ uint32_t stopBgScrub( ExtensibleChip * i_chip ) //------------------------------------------------------------------------------ template<> -uint32_t resumeBgScrub( ExtensibleChip * i_chip ) +uint32_t resumeBgScrub( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[PlatServices::resumeBgScrub] " @@ -485,13 +487,13 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) // Check UE and CE stop counters to determine stop conditions mss::mcbist::stop_conditions<> stopCond; - if ( getOcmbDataBundle(i_chip)->iv_ueScrubStopCounter.atTh() ) + if ( getOcmbDataBundle(i_chip)->iv_ueStopCounter.thReached(io_sc) ) { // If we've reached the limit of UEs we're allowed to stop on // per rank, only set the stop on mpe stop condition. 
stopCond.set_pause_on_mpe(mss::ON); } - else if ( getOcmbDataBundle(i_chip)->iv_ceScrubStopCounter.atTh() ) + else if ( getOcmbDataBundle(i_chip)->iv_ceStopCounter.thReached(io_sc) ) { // If we've reached the limit of CEs we're allowed to stop on // per rank, set all the normal stop conditions except stop on CE @@ -516,12 +518,9 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) } // Resume the command on the next address. - // Note: we have to limit the number of times a command has been stopped - // because of a UE/CE. Therefore, we must always resume the command to - // the end of the current slave rank so we can reset the UE/CE counts. errlHndl_t errl; FAPI_INVOKE_HWP( errl, mss::memdiags::continue_cmd, fapiTrgt, - mss::mcbist::STOP_AFTER_SLAVE_RANK, stopCond ); + mss::mcbist::end_boundary::DONT_CHANGE, stopCond ); if ( nullptr != errl ) { diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.H b/src/usr/diag/prdf/plat/prdfPlatServices_rt.H index a64aa8da6..49d4c0a73 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.H +++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.H @@ -90,10 +90,12 @@ uint32_t stopBgScrub( ExtensibleChip * i_chip ); * Diagnotics procedure. * * @param i_chip MCBIST, MCA, MBA, or OCMB chip. + * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ template -uint32_t resumeBgScrub( ExtensibleChip * i_chip ); +uint32_t resumeBgScrub( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); /** * @brief Resumes TD scrubbing after it has paused on error. -- cgit v1.2.1