From c0e2f1e9a7901c5409c25fa15351f5d7213e2be1 Mon Sep 17 00:00:00 2001 From: Zane Shelley Date: Wed, 11 Apr 2018 21:34:44 -0500 Subject: PRD: add full maint cmd support for all TPS procedures Change-Id: I18d5084eed24765a29e4b868c5f1caba58895110 RTC: 190428 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57152 Tested-by: Jenkins Server Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Matt Derksen Reviewed-by: Caleb N. Palmer Reviewed-by: Brian J. Stegmiller Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57133 CI-Ready: Zane C. Shelley Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW --- .../prdf/common/plat/mem/prdfCenMbaDataBundle.H | 48 ++++- .../prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C | 117 +---------- .../prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H | 10 +- src/usr/diag/prdf/plat/mem/prdfMemTps.H | 16 +- src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C | 190 ++++++++++++++--- src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 224 ++++++++++++++++++--- .../diag/prdf/plat/pegasus/prdfCenMbaTdCtlr_ipl.C | 26 +-- src/usr/diag/prdf/plat/prdfPlatServices.C | 57 ------ src/usr/diag/prdf/plat/prdfPlatServices.H | 29 --- src/usr/diag/prdf/plat/prdfPlatServices_ipl.C | 190 ----------------- 10 files changed, 428 insertions(+), 479 deletions(-) (limited to 'src') diff --git a/src/usr/diag/prdf/common/plat/mem/prdfCenMbaDataBundle.H b/src/usr/diag/prdf/common/plat/mem/prdfCenMbaDataBundle.H index f883653cf..07fa0fd07 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfCenMbaDataBundle.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfCenMbaDataBundle.H @@ -36,6 +36,7 @@ #ifdef __HOSTBOOT_MODULE #include + #include #ifndef __HOSTBOOT_RUNTIME #include @@ -62,12 +63,20 @@ class MbaDataBundle : public DataBundle ~MbaDataBundle() { #ifdef __HOSTBOOT_MODULE + delete iv_tdCtlr; iv_tdCtlr = nullptr; - #endif - #if defined(__HOSTBOOT_MODULE) && !defined(__HOSTBOOT_RUNTIME) + #ifdef __HOSTBOOT_RUNTIME + + delete iv_tpsFalseAlarmCounter; + + #else // IPL only + delete iv_sfCmd; iv_sfCmd = nullptr; delete iv_iplCeStats; iv_iplCeStats = nullptr; + + #endif + #endif } @@ -88,7 +97,22 @@ class MbaDataBundle : public DataBundle return iv_tdCtlr; } - #ifndef __HOSTBOOT_RUNTIME + #ifdef __HOSTBOOT_RUNTIME + + /** @return The TPS false alarm counter. */ + TpsFalseAlarm * getTpsFalseAlarmCounter() + { + if ( nullptr == iv_tpsFalseAlarmCounter ) + { + iv_tpsFalseAlarmCounter = new TpsFalseAlarm( + TimeBasedThreshold{ 3, ThresholdResolution::ONE_DAY } ); + } + + return iv_tpsFalseAlarmCounter; + } + + #else // IPL only + /** @return The IPL CE statistics object. */ MemIplCeStats * getIplCeStats() { @@ -99,9 +123,10 @@ class MbaDataBundle : public DataBundle return iv_iplCeStats; } - #endif - #endif + #endif // __HOSTBOOT_RUNTIME + + #endif // __HOSTBOOT_MODULE private: // instance variables @@ -113,12 +138,19 @@ class MbaDataBundle : public DataBundle /** The Targeted Diagnostics controller. */ MemTdCtlr * iv_tdCtlr = nullptr; - #ifndef __HOSTBOOT_RUNTIME + #ifdef __HOSTBOOT_RUNTIME + + /** TPS false alarm counter. */ + TpsFalseAlarm * iv_tpsFalseAlarmCounter = nullptr; + + #else // IPL only + /** MNFG IPL CE statistics. */ MemIplCeStats * iv_iplCeStats = nullptr; - #endif - #endif + #endif // __HOSTBOOT_RUNTIME + + #endif // __HOSTBOOT_MODULE public: // instance variables diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C index b4a67b690..5ff565ebc 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C @@ -1276,56 +1276,17 @@ int32_t CenMbaTdCtlr::startDsdPhase1( STEP_CODE_DATA_STRUCT & io_sc ) int32_t CenMbaTdCtlr::startTpsPhase1( STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[CenMbaTdCtlr::startTpsPhase1] " + // Initially true, until hardware error is found. + iv_tpsFalseAlarm = true; - int32_t o_rc = SUCCESS; - - io_sc.service_data->AddSignatureList( iv_mbaTrgt, PRDFSIG_StartTpsPhase1 ); - iv_tdState = TPS_PHASE_1; - - do - { - // Initially true, until hardware error is found. - iv_tpsFalseAlarm = true; - - // Starting a new TPS procedure. Reset the scrub resume counter. - iv_scrubResumeCounter.reset(); - - o_rc = prepareNextCmd( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "prepareNextCmd() failed" ); - break; - } - - // Set CE thresholds. - o_rc = setTpsThresholds(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "setTpsThresholds() failed" ); - break; - } - - // Set stop conditions based on CE count type. - uint32_t stopCond = COND_RT_TPS_HARD_CE; - if ( iv_tpsRankData.checkCeTypeTh(iv_rank) ) - { - stopCond = COND_RT_TPS_ALL_CE; - } - - // Start TPS phase 1. - o_rc = doTdScrubCmd( stopCond, mss_MaintCmdWrapper::SLAVE_RANK_ONLY ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "doTdScrubCmd() failed" ); - break; - } - - } while(0); + // Starting a new TPS procedure. Reset the scrub resume counter. + iv_scrubResumeCounter.reset(); - return o_rc; + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + // moved to TpsEvent class + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - #undef PRDF_FUNC + return SUCCESS; } //------------------------------------------------------------------------------ @@ -2545,68 +2506,6 @@ int32_t CenMbaTdCtlr::getTpsCeThr( uint16_t & o_thr ) //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::setTpsThresholds() -{ - #define PRDF_FUNC "[CenMbaTdCtlr::setTpsThresholds] " - - int32_t o_rc = SUCCESS; - - do - { - if ( TPS_PHASE_1 != iv_tdState ) - { - PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" ); - o_rc = FAIL; - break; - } - - const char * reg_str = (0 == iv_mbaPos) ? "MBA0_MBSTR" : "MBA1_MBSTR"; - SCAN_COMM_REGISTER_CLASS * mbstr = iv_membChip->getRegister( reg_str ); - - // MBSTR's content could be modified by cleanupCmd() so refresh cache. - o_rc = mbstr->ForceRead(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "ForceRead() failed on %s", reg_str ); - break; - } - - // Set all CE thresholds to the maximum value. The reason for this is if - // there are a lot of CEs we can stop the TPS scrub and place any marks, - // if needed. This will save time since the TPS scrub could take several - // hours. The threshold is set to the max value so that we can get - // enough data to place a mark. - mbstr->SetBitFieldJustified( 4, 12, 0xfff ); - mbstr->SetBitFieldJustified( 16, 12, 0xfff ); - mbstr->SetBitFieldJustified( 28, 12, 0xfff ); - - if ( !iv_tpsRankData.checkCeTypeTh(iv_rank) ) - { - // Set the per symbol counters to count only hard CEs. - mbstr->SetBitFieldJustified( 55, 3, 0x1 ); - } - else - { - // Set the per symbol counters to count all CE typs. - mbstr->SetBitFieldJustified( 55, 3, 0x7 ); - } - - o_rc = mbstr->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Write() failed on %s", reg_str ); - break; - } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::tpsChipMark( STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[CenMbaTdCtlr::tpsChipMark] " diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H index 7441fad5a..9fcaa4253 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016 */ +/* Contributors Listed Below - COPYRIGHT 2016,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -334,14 +334,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon */ int32_t getTpsCeThr( uint16_t & o_thr ); - /** - * @brief Sets the CE thresholds in hardware for a TPS procedure. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - * @note iv_tdState must be set to a valid TPS phase before calling this - * function. - */ - int32_t setTpsThresholds(); - /** * @brief Sets iv_mark in hardware and adds a VCM request to the TD queue. * @param io_sc The step code data struct. diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps.H b/src/usr/diag/prdf/plat/mem/prdfMemTps.H index 0f5e89d18..c06b759cf 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2017 */ +/* Contributors Listed Below - COPYRIGHT 2016,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -68,14 +68,22 @@ class TpsEvent : public TdEntry private: // functions - #ifdef __HOSTBOOT_RUNTIME + /** + * @brief Starts the appropriate maintenance command based on iv_phase. + * @pre iv_phase must be set appropriately before calling this function. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + uint32_t startCmd(); /** - * @brief Starts phase 1 of the procedure. + * @brief Starts the next phase of the procedure. * @param io_sc The step code data struct. + * @post iv_phase will be updated appropriately per design. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ - uint32_t startTpsPhase1_rt( STEP_CODE_DATA_STRUCT & io_sc ); + uint32_t startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ); + + #ifdef __HOSTBOOT_RUNTIME /** * @brief Analyzes phase 1 of the procedure. diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C index 339d3ac0a..da993d51e 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_ipl.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2017 */ +/* Contributors Listed Below - COPYRIGHT 2016,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -62,22 +62,13 @@ uint32_t TpsEvent::nextStep( STEP_CODE_DATA_STRUCT & io_sc, //phase 0 if ( TD_PHASE_0 == iv_phase ) { - // Start TPS phase 1 - io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), - PRDFSIG_StartTpsPhase1 ); - - PRDF_TRAC( PRDF_FUNC "Starting TPS Phase 1" ); - - o_rc = startTpsPhase1( iv_chip, iv_rank ); + o_rc = startNextPhase( io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "startTpsPhase1(0x%08x,m%ds%d) failed", - iv_chip->getHuid(), iv_rank.getMaster(), - iv_rank.getSlave() ); + PRDF_ERR( PRDF_FUNC "startNextPhase() failed on 0x%08x,0x%02x", + iv_chip->getHuid(), getKey() ); break; } - - iv_phase = TD_PHASE_1; } //phase 1/2 else @@ -183,22 +174,13 @@ uint32_t TpsEvent::nextStep( STEP_CODE_DATA_STRUCT & io_sc, //phase 1 if ( TD_PHASE_1 == iv_phase ) { - // Start TPS phase 2 - io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), - PRDFSIG_StartTpsPhase2 ); - - PRDF_TRAC( PRDF_FUNC "Starting TPS Phase 2" ); - - o_rc = startTpsPhase2( iv_chip, iv_rank ); + o_rc = startNextPhase( io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "startTpsPhase2(0x%08x,m%ds%d) " - "failed", iv_chip->getHuid(), - iv_rank.getMaster(), iv_rank.getSlave() ); + PRDF_ERR( PRDF_FUNC "startNextPhase() failed on 0x%08x," + "0x%02x", iv_chip->getHuid(), getKey() ); break; } - - iv_phase = TD_PHASE_2; } //phase 2 else @@ -236,6 +218,164 @@ uint32_t TpsEvent::nextStep( STEP_CODE_DATA_STRUCT & io_sc, #undef PRDF_FUNC } +//############################################################################## +// +// Generic template functions +// +//############################################################################## + +template +uint32_t TpsEvent::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) +{ + uint32_t signature = 0; + + switch ( iv_phase ) + { + case TD_PHASE_0: + iv_phase = TD_PHASE_1; + signature = PRDFSIG_StartTpsPhase1; + break; + + case TD_PHASE_1: + iv_phase = TD_PHASE_2; + signature = PRDFSIG_StartTpsPhase2; + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x", + iv_phase, iv_chip->getHuid(), getKey() ); + + io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature ); + + return startCmd(); +} + +//############################################################################## +// +// Specializations for MCA +// +//############################################################################## + +template<> +uint32_t TpsEvent::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs during Memory Diagnostics. The design is to let the + // command continue to the end of the rank and we do diagnostics on the + // CE counts found in the per-symbol counters. Therefore, all we need to do + // is tell the hardware which CE types to count. + + mss::mcbist::stop_conditions stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only soft/inter CEs. + stopCond.set_nce_soft_symbol_count_enable( mss::ON); + stopCond.set_nce_inter_symbol_count_enable(mss::ON); + break; + + case TD_PHASE_2: + // Set the per symbol counters to count only hard CEs. + stopCond.set_nce_hard_symbol_count_enable(mss::ON); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub( iv_chip, iv_rank, SLAVE_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//############################################################################## +// +// Specializations for MBA +// +//############################################################################## + +template<> +uint32_t TpsEvent::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + uint32_t stopCond = mss_MaintCmd::NO_STOP_CONDITIONS; + + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs during Memory Diagnostics. The design is to let the + // command continue to the end of the rank and we do diagnostics on the + // CE counts found in the per-symbol counters. Therefore, all we need to do + // is tell the hardware which CE types to count. + + do + { + ExtensibleChip * membChip = getConnectedParent( iv_chip, TYPE_MEMBUF ); + const char * reg_str = (0 == iv_chip->getPos()) ? "MBA0_MBSTR" + : "MBA1_MBSTR"; + SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); + o_rc = mbstr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s: 0x%08x", reg_str, + membChip->getHuid() ); + break; + } + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only soft/inter CEs. + mbstr->SetBitFieldJustified( 55, 3, 0x6 ); + break; + + case TD_PHASE_2: + // Set the per symbol counters to count only hard CEs. + mbstr->SetBitFieldJustified( 55, 3, 0x1 ); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + o_rc = mbstr->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s: 0x%08x", reg_str, + membChip->getHuid() ); + break; + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub( iv_chip, iv_rank, SLAVE_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 7067a0121..480d02839 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -26,6 +26,7 @@ /** @file prdfMemTps_rt.C */ // Platform includes +#include #include #include #include @@ -75,14 +76,10 @@ TpsFalseAlarm * __getTpsFalseAlarmCounter( ExtensibleChip * i_chip ) return getMcaDataBundle(i_chip)->getTpsFalseAlarmCounter(); } -//------------------------------------------------------------------------------ - template<> TpsFalseAlarm * __getTpsFalseAlarmCounter( ExtensibleChip * i_chip ) { - // TODO RTC 157888 - //return getMbaDataBundle(i_chip)->getTpsFalseAlarmCounter(); - return nullptr; + return getMbaDataBundle(i_chip)->getTpsFalseAlarmCounter(); } //------------------------------------------------------------------------------ @@ -306,24 +303,6 @@ uint32_t __updateVpdSumAboveOne( CeCount i_sumAboveOneCount, //------------------------------------------------------------------------------ -template -uint32_t TpsEvent::startTpsPhase1_rt( STEP_CODE_DATA_STRUCT & io_sc ) -{ - PRDF_TRAC( "[TpsEvent] Starting TPS Phase 1: 0x%08x,0x%02x", - iv_chip->getHuid(), getKey() ); - - iv_phase = TD_PHASE_1; - io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), - PRDFSIG_StartTpsPhase1 ); - bool countAllCes = false; - if ( __getTpsFalseAlarmCounter(iv_chip)->count(iv_rank, io_sc) >= 1 ) - countAllCes = true; - - return PlatServices::startTpsRuntime( iv_chip, iv_rank, countAllCes); -} - -//------------------------------------------------------------------------------ - template uint32_t TpsEvent::analyzeTpsPhase1_rt( STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ) @@ -1177,14 +1156,21 @@ uint32_t TpsEvent::nextStep( STEP_CODE_DATA_STRUCT & io_sc, break; } + // Runtime TPS is slightly different than IPL TPS or any other TD event. + // There really is only one phase, but we use two phases to help + // differentiate between the CE types that are collected. So only one of + // the two phases will be used during a TPS procedure, not both. + // - Phase 1 looks for hard CEs. This is always used first on any rank. + // - Phase 2 looks for all CE types. This phase is only used on a rank + // after phase 1 has exceeded a threshold of false alarms. + switch ( iv_phase ) { case TD_PHASE_0: - // Start TPS phase 1 - o_rc = startTpsPhase1_rt( io_sc ); + o_rc = startNextPhase( io_sc ); break; case TD_PHASE_1: - // Analyze TPS phase 1 + case TD_PHASE_2: o_rc = analyzeTpsPhase1_rt( io_sc, o_done ); break; default: PRDF_ASSERT( false ); // invalid phase @@ -1222,6 +1208,192 @@ uint32_t TpsEvent::nextStep( STEP_CODE_DATA_STRUCT & io_sc, #undef PRDF_FUNC } +//############################################################################## +// +// Generic template functions +// +//############################################################################## + +template +uint32_t TpsEvent::startNextPhase( STEP_CODE_DATA_STRUCT & io_sc ) +{ + uint32_t signature = 0; + + switch ( iv_phase ) + { + case TD_PHASE_0: + { + // Only use phase 2 if the false alarm counter has exceeded + // threshold. Otherwise, use phase 1. + TpsFalseAlarm * faCounter = __getTpsFalseAlarmCounter(iv_chip); + if ( faCounter->count(iv_rank, io_sc) >= 1 ) + { + iv_phase = TD_PHASE_2; + signature = PRDFSIG_StartTpsPhase2; + } + else + { + iv_phase = TD_PHASE_1; + signature = PRDFSIG_StartTpsPhase1; + } + break; + } + + default: PRDF_ASSERT( false ); // invalid phase + } + + PRDF_TRAC( "[TpsEvent] Starting TPS Phase %d: 0x%08x,0x%02x", + iv_phase, iv_chip->getHuid(), getKey() ); + + io_sc.service_data->AddSignatureList( iv_chip->getTrgt(), signature ); + + return startCmd(); +} + +//############################################################################## +// +// Specializations for MCA +// +//############################################################################## + +template<> +uint32_t TpsEvent::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + // We don't need to set any stop-on-error conditions or thresholds for + // soft/inter/hard CEs at runtime. The design is to let the command continue + // to the end of the rank and we do diagnostics on the CE counts found in + // the per-symbol counters. Therefore, all we need to do is tell the + // hardware which CE types to count. + + mss::mcbist::stop_conditions stopCond; + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only hard CEs. + stopCond.set_nce_hard_symbol_count_enable(mss::ON); + break; + + case TD_PHASE_2: + // Since there are not enough hard CEs to trigger a symbol mark, set + // the per symbol counters to count all CE types. + stopCond.set_nce_soft_symbol_count_enable( mss::ON); + stopCond.set_nce_inter_symbol_count_enable(mss::ON); + stopCond.set_nce_hard_symbol_count_enable( mss::ON); + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub( iv_chip, iv_rank, SLAVE_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//############################################################################## +// +// Specializations for MBA +// +//############################################################################## + +template<> +uint32_t TpsEvent::startCmd() +{ + #define PRDF_FUNC "[TpsEvent::startCmd] " + + uint32_t o_rc = SUCCESS; + + uint32_t stopCond = mss_MaintCmd::NO_STOP_CONDITIONS; + + // Due to a hardware bug in the Centaur, we must execute runtime maintenance + // commands at a very slow rate. Because of this, we decided that we should + // stop the command immediately on error if there is a UE or MPE so that we + // can respond quicker and send a DMD message to the hypervisor or do chip + // mark verification as soon as possible. + + stopCond |= mss_MaintCmd::STOP_ON_UE; + stopCond |= mss_MaintCmd::STOP_ON_MPE; + stopCond |= mss_MaintCmd::STOP_IMMEDIATE; + + do + { + ExtensibleChip * membChip = getConnectedParent( iv_chip, TYPE_MEMBUF ); + const char * reg_str = (0 == iv_chip->getPos()) ? "MBA0_MBSTR" + : "MBA1_MBSTR"; + SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); + o_rc = mbstr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s: 0x%08x", reg_str, + membChip->getHuid() ); + break; + } + + // Stopping on CE thresholds should save us time since the TPS scrub + // could take several hours (again, due to the Centaur bug). We want to + // set all of the CE thresholds to the maximum value so that we can get + // as much data as we can for analysis before stopping the command. + // Hopefully, we can use this data to place any marks, if needed. + mbstr->SetBitFieldJustified( 4, 12, 0xfff ); + mbstr->SetBitFieldJustified( 16, 12, 0xfff ); + mbstr->SetBitFieldJustified( 28, 12, 0xfff ); + + switch ( iv_phase ) + { + case TD_PHASE_1: + // Set the per symbol counters to count only hard CEs. + mbstr->SetBitFieldJustified( 55, 3, 0x1 ); + stopCond |= mss_MaintCmd::STOP_ON_HARD_NCE_ETE; + break; + + case TD_PHASE_2: + // Since there are not enough hard CEs to trigger a symbol mark, + // set the per symbol counters to count all CE types. + mbstr->SetBitFieldJustified( 55, 3, 0x7 ); + stopCond |= mss_MaintCmd::STOP_ON_SOFT_NCE_ETE; + stopCond |= mss_MaintCmd::STOP_ON_INT_NCE_ETE; + stopCond |= mss_MaintCmd::STOP_ON_HARD_NCE_ETE; + break; + + default: PRDF_ASSERT( false ); // invalid phase + } + + o_rc = mbstr->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s: 0x%08x", reg_str, + membChip->getHuid() ); + break; + } + + // Start the time based scrub procedure on this slave rank. + o_rc = startTdScrub( iv_chip, iv_rank, SLAVE_RANK, stopCond ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "startTdScrub(0x%08x,0x%2x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr_ipl.C b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr_ipl.C index 51975b934..c72acd460 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr_ipl.C +++ b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr_ipl.C @@ -803,40 +803,22 @@ int32_t CenMbaTdCtlr::startDsdPhase2( STEP_CODE_DATA_STRUCT & io_sc ) int32_t CenMbaTdCtlr::startTpsPhase1( STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[CenMbaTdCtlr::startTpsPhase1] " - - int32_t o_rc = SUCCESS; - - io_sc.service_data->AddSignatureList( iv_mbaTrgt, PRDFSIG_StartTpsPhase1 ); - iv_tdState = TPS_PHASE_1; - //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - // Moved to startTpsPhase1() in prdfPlatServices_ipl.C + // Moved to TpsEvent class //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - return o_rc; - - #undef PRDF_FUNC + return SUCCESS; } //------------------------------------------------------------------------------ int32_t CenMbaTdCtlr::startTpsPhase2( STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[CenMbaTdCtlr::startTpsPhase2] " - - int32_t o_rc = SUCCESS; - - io_sc.service_data->AddSignatureList( iv_mbaTrgt, PRDFSIG_StartTpsPhase2 ); - iv_tdState = TPS_PHASE_2; - //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - // Moved to startTpsPhase2() in prdfPlatServices_ipl.C + // Moved to TpsEvent class //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - return o_rc; - - #undef PRDF_FUNC + return SUCCESS; } //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/plat/prdfPlatServices.C b/src/usr/diag/prdf/plat/prdfPlatServices.C index b714427bb..b9024dc89 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices.C @@ -746,52 +746,6 @@ uint32_t __startTdScrub_mca( ExtensibleChip * i_mcaChip, const MemRank & i_rank, #undef PRDF_FUNC } -//------------------------------------------------------------------------------ - -template<> -uint32_t startTpsPhase1( ExtensibleChip * i_mcaChip, - const MemRank & i_rank ) -{ - mss::mcbist::stop_conditions stopCond; - stopCond.set_nce_soft_symbol_count_enable(mss::ON) - .set_nce_inter_symbol_count_enable(mss::ON); - - return __startTdScrub_mca( i_mcaChip, i_rank, stopCond, SLAVE_RANK ); -} - -//------------------------------------------------------------------------------ - -template<> -uint32_t startTpsPhase2( ExtensibleChip * i_mcaChip, - const MemRank & i_rank ) -{ - mss::mcbist::stop_conditions stopCond; - stopCond.set_nce_hard_symbol_count_enable(mss::ON); - - return __startTdScrub_mca( i_mcaChip, i_rank, stopCond, SLAVE_RANK ); -} - -//------------------------------------------------------------------------------ - -template<> -uint32_t startTpsRuntime( ExtensibleChip * i_mcaChip, - const MemRank & i_rank, - bool i_countAllCes ) -{ - mss::mcbist::stop_conditions stopCond; - stopCond.set_nce_hard_symbol_count_enable(mss::ON); - - // If the TPS false alarms count is one or more, enable per-symbol counters - // for soft and intermittent CEs. - if ( i_countAllCes ) - { - stopCond.set_nce_soft_symbol_count_enable(mss::ON) - .set_nce_inter_symbol_count_enable(mss::ON); - } - - return __startTdScrub_mca( i_mcaChip, i_rank, stopCond, SLAVE_RANK ); -} - //############################################################################## //## Centaur Maintenance Command wrappers //############################################################################## @@ -982,17 +936,6 @@ uint32_t startTdScrub( ExtensibleChip * i_chip, #undef PRDF_FUNC } -//------------------------------------------------------------------------------ - -template<> -uint32_t startTpsRuntime( ExtensibleChip * i_mbaChip, - const MemRank & i_rank, - bool i_countAllCes ) -{ - PRDF_ERR( "function not implemented yet" ); // TODO RTC 157888 - return SUCCESS; -} - //############################################################################## //## Core/cache trace array functions //############################################################################## diff --git a/src/usr/diag/prdf/plat/prdfPlatServices.H b/src/usr/diag/prdf/plat/prdfPlatServices.H index 2c4510506..9168aab82 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices.H +++ b/src/usr/diag/prdf/plat/prdfPlatServices.H @@ -172,35 +172,6 @@ template uint32_t startTdScrub( ExtensibleChip * i_chip, const MemRank & i_rank, AddrRangeType i_rangeType, SCT i_stopCond ); -/** - * @brief Starts a targeted scrub command on the target rank for TPS phase 1. - * @param i_chip MCA or MBA chip. - * @param i_rank The rank to target. - * @return Non-SUCCESS if an internal function fails, otherwise SUCCESS. - */ -template -uint32_t startTpsPhase1( ExtensibleChip * i_chip, const MemRank & i_rank ); - -/** - * @brief Starts a targeted scrub command on the target rank for TPS phase 2. - * @param i_chip MCA or MBA chip. - * @param i_rank The rank to target. - * @return Non-SUCCESS if an internal function fails, otherwise SUCCESS. - */ -template -uint32_t startTpsPhase2( ExtensibleChip * i_chip, const MemRank & i_rank ); - -/** - * @brief Starts a targeted scrub command on the target rank for runtime TPS. - * @param i_chip MCA or MBA chip. - * @param i_rank The rank to target. - * @param i_countAllCes True if we should count all CEs. - * @return Non-SUCCESS if an internal function fails, otherwise SUCCESS. - */ -template -uint32_t startTpsRuntime( ExtensibleChip * i_chip, const MemRank & i_rank, - bool i_countAllCes ); - //############################################################################## //## Core/cache trace array functions //############################################################################## diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_ipl.C b/src/usr/diag/prdf/plat/prdfPlatServices_ipl.C index 3944671a0..3982e17f5 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices_ipl.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices_ipl.C @@ -640,196 +640,6 @@ uint32_t startTdSteerCleanup( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ -template<> -uint32_t startTpsPhase1( ExtensibleChip * i_chip, - const MemRank & i_rank ) -{ - #define PRDF_FUNC "[PlatServices::startTpsPhase1] " - - PRDF_ASSERT( isInMdiaMode() ); // MDIA must be running. - - PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - - uint32_t o_rc = SUCCESS; - - // Get the MBA fapi target - fapi2::Target fapiTrgt ( i_chip->getTrgt() ); - - // Get the stop conditions. - uint32_t stopCond = mss_MaintCmd::STOP_ON_RETRY_CE_ETE | - mss_MaintCmd::STOP_ON_END_ADDRESS | - mss_MaintCmd::ENABLE_CMD_COMPLETE_ATTENTION; - - // Note that we set the stop on RCE ETE flag. This requires us to set a - // threshold in the MBSTR. Fortunately, MDIA sets the threshold for us when - // it starts the first command on this MBA. - - do - { - // Set up the per-symbol counters to capture soft CEs. - ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); - const char * reg_str = (0 == i_chip->getPos()) ? "MBA0_MBSTR" - : "MBA1_MBSTR"; - SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); - o_rc = mbstr->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str ); - break; - } - - // Enable per-symbol error counters to count soft CEs - mbstr->SetBit(55); - mbstr->SetBit(56); - // Disable per-symbol error counters to count hard CEs - mbstr->ClearBit(57); - - o_rc = mbstr->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Write() failed on %s", reg_str ); - break; - } - - // Get the address range of the master rank. - fapi2::buffer saddr, eaddr; - o_rc = getMemAddrRange( i_chip, i_rank, saddr, eaddr, - SLAVE_RANK ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "getMemAddrRange(0x%08x,0x%2x) failed", - i_chip->getHuid(), i_rank.getKey() ); - break; - } - - // Clear all of the counters and maintenance ECC attentions. - o_rc = prepareNextCmd( i_chip ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "prepareNextCmd(0x%08x) failed", - i_chip->getHuid() ); - break; - } - - // Start a steer cleanup command. - mss_TimeBaseScrub cmd { fapiTrgt, saddr, eaddr, - mss_MaintCmd::FAST_MAX_BW_IMPACT, - stopCond, false }; - errlHndl_t errl; - FAPI_INVOKE_HWP( errl, cmd.setupAndExecuteCmd ); - if ( nullptr != errl ) - { - PRDF_ERR( PRDF_FUNC "setupAndExecuteCmd() on 0x%08x,0x%02x failed", - i_chip->getHuid(), i_rank.getKey() ); - PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); - o_rc = FAIL; break; - } - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -template<> -uint32_t startTpsPhase2( ExtensibleChip * i_chip, - const MemRank & i_rank ) -{ - #define PRDF_FUNC "[PlatServices::startTpsPhase2] " - - PRDF_ASSERT( isInMdiaMode() ); // MDIA must be running. - - PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - - uint32_t o_rc = SUCCESS; - - // Get the MBA fapi target - fapi2::Target fapiTrgt ( i_chip->getTrgt() ); - - // Get the stop conditions. - uint32_t stopCond = mss_MaintCmd::STOP_ON_RETRY_CE_ETE | - mss_MaintCmd::STOP_ON_END_ADDRESS | - mss_MaintCmd::ENABLE_CMD_COMPLETE_ATTENTION; - - // Note that we set the stop on RCE ETE flag. This requires us to set a - // threshold in the MBSTR. Fortunately, MDIA sets the threshold for us when - // it starts the first command on this MBA. - - do - { - // Set up the per-symbol counters to capture soft CEs. - ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); - const char * reg_str = (0 == i_chip->getPos()) ? "MBA0_MBSTR" - : "MBA1_MBSTR"; - SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); - o_rc = mbstr->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str ); - break; - } - - // Disable per-symbol error counters to count soft CEs - mbstr->ClearBit(55); - mbstr->ClearBit(56); - // Enable per-symbol error counters to count hard CEs - mbstr->SetBit(57); - - o_rc = mbstr->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Write() failed on %s", reg_str ); - break; - } - - // Get the address range of the master rank. - fapi2::buffer saddr, eaddr; - o_rc = getMemAddrRange( i_chip, i_rank, saddr, eaddr, - SLAVE_RANK ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "getMemAddrRange(0x%08x,0x%2x) failed", - i_chip->getHuid(), i_rank.getKey() ); - break; - } - - // Clear all of the counters and maintenance ECC attentions. - o_rc = prepareNextCmd( i_chip ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "prepareNextCmd(0x%08x) failed", - i_chip->getHuid() ); - break; - } - - // Start a steer cleanup command. - mss_TimeBaseScrub cmd { fapiTrgt, saddr, eaddr, - mss_MaintCmd::FAST_MAX_BW_IMPACT, - stopCond, false }; - errlHndl_t errl; - FAPI_INVOKE_HWP( errl, cmd.setupAndExecuteCmd ); - if ( nullptr != errl ) - { - PRDF_ERR( PRDF_FUNC "setupAndExecuteCmd() on 0x%08x,0x%02x failed", - i_chip->getHuid(), i_rank.getKey() ); - PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); - o_rc = FAIL; break; - } - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - } // end namespace PlatServices } // end namespace PRDF -- cgit v1.2.1