From e940af9a779a680dd817b65f5bbc356ad91f4c59 Mon Sep 17 00:00:00 2001 From: Zane Shelley Date: Thu, 17 May 2018 12:01:34 -0500 Subject: PRD: Maint soft/inter/hard CE handling during background scrub for Centaur Change-Id: I9363812d7e3a7fcca46e481c6250d810bfcd970a RTC: 192638 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/58980 Tested-by: Jenkins Server Reviewed-by: Caleb N. Palmer Reviewed-by: Matt Derksen Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Brian J. Stegmiller Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59229 Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins --- .../prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C | 219 --------------------- .../prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H | 16 -- src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C | 52 ++++- 3 files changed, 42 insertions(+), 245 deletions(-) (limited to 'src/usr') diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C index 24d7b6c9e..6e69f8ac5 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C @@ -468,124 +468,6 @@ int32_t CenMbaTdCtlr::initialize() //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::analyzeCmdComplete( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_stopAddr, - const CenAddr & i_endAddr ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::analyzeCmdComplete] " - - int32_t o_rc = SUCCESS; - - do - { - if ( NO_OP != iv_tdState ) - { - PRDF_ERR( PRDF_FUNC "Invalid state machine configuration" ); - o_rc = FAIL; break; - } - - // Initialize iv_rank. This must be done before calling other - // functions as they require iv_rank to be accurate. 
- iv_rank = i_stopAddr.getRank(); - - // Background scrubbing was interrupted, most likely because of an ECC - // error, so set the interrupted rank in the rank list. - o_rc = iv_masterRanks.setInterruptedRank( iv_rank ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "setInterruptedRank() failed" ); - break; - } - - // Get all reported error conditions. - uint16_t eccErrorMask = NO_ERROR; - o_rc = checkEccErrors( eccErrorMask, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "checkEccErrors() failed" ); - break; - } - - // The order of the following checks is important. Each call to handle - // an error will set the PRD signature and override the previous - // signature. We want the highest priority error signature (memory UEs) - // to be displayed so these checks should be ordered from lowest to - // highest priority. - - if ( (eccErrorMask & SOFT_CTE) || (eccErrorMask & INTER_CTE) ) - { - o_rc = handleSoftIntCeEte_NonTd( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "handleSoftIntCeEte_NonTd() failed" ); - break; - } - } - - if ( eccErrorMask & HARD_CTE ) - { - o_rc = handleHardCeEte_NonTd( io_sc, i_stopAddr ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "handleHardCeEte_NonTd() failed" ); - break; - } - } - - if ( iv_queue.empty() ) - { - // No TD requests so resume background. If the scrub reached the end - // address, start background scrubbing on the next good rank. - // Otherwise, resume the current scrub. - - if ( i_endAddr == i_stopAddr ) - { - if ( (NO_ERROR == eccErrorMask) || (MCE == eccErrorMask) ) - { - // The scrub completed without an error (this function - // currently ignores MCEs). Don't commit the error log - // (reduces informational error logs). - io_sc.service_data->setDontCommitErrl(); - } - - o_rc = startBgScrub( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "startBgScrub() failed" ); - break; - } - } - else - { - // Restart the scrub on the next address. 
- o_rc = resumeScrub( io_sc, eccErrorMask ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "resumeScrub() failed" ); - break; - } - } - } - else - { - // A TD request was added to the queue, start the next TD request. - o_rc = startNextTd( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "startNextTd() failed" ); - break; - } - } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::analyzeVcmPhase1( STEP_CODE_DATA_STRUCT & io_sc, const CenAddr & i_stopAddr, const CenAddr & i_endAddr ) @@ -1926,107 +1808,6 @@ int32_t CenMbaTdCtlr::handleCeEte_Tps( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::handleHardCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_addr ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleHardCeEte_NonTd] " - - int32_t o_rc = SUCCESS; - - setTdSignature( io_sc, PRDFSIG_MaintHARD_CTE ); - - do - { - // Send page deallocation message to PHYP - o_rc = DEALLOC::pageGard( iv_mbaChip, i_addr ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "pageGard() failed" ); - break; - } - - // Get the failing symbol. Note that the hard CE threshold is 1 so there - // should only be one symbol with a non-zero per symbol count. - - MaintSymbols symData; CenSymbol junk; - o_rc = collectCeStats( iv_mbaChip, iv_rank, symData, junk ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "collectCeStats() failed." ); - break; - } - - if ( 1 != symData.size() ) - { - PRDF_ERR( PRDF_FUNC "collectCeStats() return size %d, but was " - "expecting size 1", symData.size() ); - o_rc = FAIL; - break; - } - - CenSymbol symbol = symData[0].symbol; - - // Callout the symbol. - MemoryMru memmru ( iv_mbaTrgt, iv_rank, symbol ); - io_sc.service_data->SetCallout( memmru ); - - // Add entry to CE table and add a TPS request to the queue, if needed. 
- CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); - if ( mbadb->iv_ceTable.addEntry(i_addr, symbol, true) ) - { - o_rc = addTdQueueEntryTPS( iv_rank, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "addTdQueueEntryTPS() failed" ); - break; - } - } - - // Any hard CEs in MNFG should be immediately reported. - if ( mfgMode() ) - io_sc.service_data->setServiceCall(); - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -int32_t CenMbaTdCtlr::handleSoftIntCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleSoftIntCeEte_NonTd] " - - int32_t o_rc = SUCCESS; - - setTdSignature( io_sc, PRDFSIG_MaintNCE_CTE ); - - do - { - // Callout the rank. Note that the per CE counters only capture hard CEs - // so it is not possible to isolate any further than a rank. - MemoryMru memmru ( iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK ); - io_sc.service_data->SetCallout( memmru ); - - // Add a TPS request to the queue. - o_rc = addTdQueueEntryTPS( iv_rank, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "addTdQueueEntryTPS() failed" ); - break; - } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::handleTpsFalseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[CenMbaTdCtlr::handleTpsFalseAlarm] " diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H index 2838d288a..6573636cd 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H @@ -261,22 +261,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon */ int32_t handleCeEte_Tps( STEP_CODE_DATA_STRUCT & io_sc ); - /** - * @brief Handles hard CE ETEs during background scrub. 
- * @param io_sc The step code data struct. - * @param i_addr The address in which the maintenance command stopped. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleHardCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_addr ); - - /** - * @brief Handles soft and intermittent CEs during background scrub. - * @param io_sc The step code data struct. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleSoftIntCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc ); - /** * @brief Handles TPS false alarms. * @param io_sc The step code data struct. diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index b42daa446..052d6e0da 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -523,6 +523,42 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip, TdQueue & io_queue, //------------------------------------------------------------------------------ +template<TARGETING::TYPE T> +uint32_t __handleSoftInterCeEte( ExtensibleChip * i_chip, TdQueue & io_queue, + const MemAddr & i_addr, + STEP_CODE_DATA_STRUCT & io_sc ); + +template<> +uint32_t __handleSoftInterCeEte<TYPE_MCA>( ExtensibleChip * i_chip, + TdQueue & io_queue, + const MemAddr & i_addr, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + return __handleNceEte<TYPE_MCA>( i_chip, io_queue, i_addr, io_sc ); +} + +template<> +uint32_t __handleSoftInterCeEte<TYPE_MBA>( ExtensibleChip * i_chip, + TdQueue & io_queue, + const MemAddr & i_addr, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + // Due to workarounds on the Centaur we are unable to stop on each + // occurrence of the soft or intermittent CEs like we do for Nimbus. + // Instead, the threshold is set much higher. If the threshold is hit we + // simply want to add the rank to the callout list and trigger TPS.
+ + MemoryMru mm { i_chip->getTrgt(), i_addr.getRank(), + MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); + + io_queue.push( new TpsEvent<TYPE_MBA>(i_chip, i_addr.getRank()) ); + + return SUCCESS; +} + +//------------------------------------------------------------------------------ + template<TARGETING::TYPE T> uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, const MemRank & i_rank, bool & o_errorsFound, @@ -673,10 +709,10 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintINTER_CTE); - o_rc = __handleNceEte<T>( i_chip, io_queue, i_addr, io_sc ); + o_rc = __handleSoftInterCeEte<T>( i_chip, io_queue, i_addr, io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__handleNceEte(0x%08x) failed", + PRDF_ERR( PRDF_FUNC "__handleSoftInterCeEte(0x%08x) failed", huid ); break; } @@ -687,10 +723,10 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintSOFT_CTE ); - o_rc = __handleNceEte<T>( i_chip, io_queue, i_addr, io_sc ); + o_rc = __handleSoftInterCeEte<T>( i_chip, io_queue, i_addr, io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__handleNceEte(0x%08x) failed", + PRDF_ERR( PRDF_FUNC "__handleSoftInterCeEte(0x%08x) failed", huid ); break; } @@ -781,14 +817,10 @@ template uint32_t __checkEcc<TYPE_MCA>( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); -template<> +template uint32_t __checkEcc<TYPE_MBA>( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, - STEP_CODE_DATA_STRUCT & io_sc ) -{ - // TODO: remove this once runtime support is abled for MBA. - return SUCCESS; -} + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ -- cgit v1.2.1