From 0d6b900a5d121f3cf9d67d0c2fdb91efd27d2a9b Mon Sep 17 00:00:00 2001
From: Zane Shelley
Date: Thu, 17 May 2018 11:36:17 -0500
Subject: PRD: Maintenance RCE handling during background scrub for Centaur

Change-Id: Ib146e7bead1f3b4bae4e36fd582360bdbd22afce
RTC: 192638
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/58979
Tested-by: Jenkins Server
Reviewed-by: Caleb N. Palmer
Reviewed-by: Matt Derksen
Reviewed-by: Benjamin J. Weisenbeck
Reviewed-by: Zane C. Shelley
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59228
Tested-by: Jenkins OP Build CI
Tested-by: Jenkins OP HW
Tested-by: FSP CI Jenkins
---
 .../prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C | 176 ---------------------
 .../prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H |  25 ---
 src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C      |  62 ++++++--
 3 files changed, 53 insertions(+), 210 deletions(-)

diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C index 4f8df05fa..24d7b6c9e 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.C @@ -532,36 +532,6 @@ int32_t CenMbaTdCtlr::analyzeCmdComplete( STEP_CODE_DATA_STRUCT & io_sc, } } - if ( eccErrorMask & RETRY_CTE ) - { - o_rc = handleRceEte_NonTd( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "handleRceEte_NonTd() failed" ); - break; - } - } - - if ( eccErrorMask & MPE ) - { - o_rc = handleMpe_NonTd( io_sc, i_stopAddr ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "handleMpe_NonTd() failed" ); - break; - } - } - - if ( eccErrorMask & UE ) - { - o_rc = handleUe_NonTd( io_sc, i_stopAddr ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "handleUe_NonTd() failed" ); - break; - } - } - if ( iv_queue.empty() ) { // No TD requests so resume background.
If the scrub reached the end @@ -1956,152 +1926,6 @@ int32_t CenMbaTdCtlr::handleCeEte_Tps( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::handleUe_NonTd( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_addr ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleUe_NonTd] " - - int32_t o_rc = SUCCESS; - - setTdSignature( io_sc, PRDFSIG_MaintUE ); - - do - { - // Add entry to UE table. - CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); - mbadb->iv_ueTable.addEntry( UE_TABLE::SCRUB_UE, i_addr ); - - // Callout the rank. - MemoryMru memmru ( iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK ); - io_sc.service_data->SetCallout( memmru ); - io_sc.service_data->setServiceCall(); - - // Add a TPS request to the queue and ban any future TPS requests. - o_rc = addTdQueueEntryTPS( iv_rank, io_sc, true ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "addTdQueueEntryTPS() failed" ); - break; - } - - // Send lmb gard message to PHYP. - o_rc = DEALLOC::lmbGard( iv_mbaChip, i_addr, false ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "lmbGard() failed" ); - break; - } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -int32_t CenMbaTdCtlr::handleMpe_NonTd( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_addr ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleMpe_NonTd] " - - int32_t o_rc = SUCCESS; - - setTdSignature( io_sc, PRDFSIG_MaintMPE ); - - do - { - // Add entry to UE table. - CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); - mbadb->iv_ueTable.addEntry( UE_TABLE::SCRUB_MPE, i_addr ); - - // Add a VCM request to the queue. - o_rc = addTdQueueEntryVCM( iv_rank ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "addTdQueueEntryVCM() failed" ); - break; - } - - // Get the current mark in hardware. 
- CenMark mark; - o_rc = mssGetMarkStore( iv_mbaTrgt, iv_rank, mark ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "mssGetMarkStore() failed"); - break; - } - - // Callout the mark. - CalloutUtil::calloutMark( iv_mbaTrgt, iv_rank, mark, io_sc ); - - } while( 0 ); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -int32_t CenMbaTdCtlr::handleRceEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleRceEte_NonTd] " - - int32_t o_rc = SUCCESS; - - setTdSignature( io_sc, PRDFSIG_MaintRETRY_CTE ); - - do - { - MemoryMru memmru ( iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK ); - io_sc.service_data->SetCallout( memmru ); - - bool doTps = true; - - if ( mfgMode() ) - { - // Get RCE count. - const char * reg_str = (0 == iv_mbaPos) ? "MBA0_MBSEC1" - : "MBA1_MBSEC1"; - SCAN_COMM_REGISTER_CLASS * mbsec1 - = iv_membChip->getRegister( reg_str ); - o_rc = mbsec1->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str ); - break; - } - - uint16_t count = mbsec1->GetBitFieldJustified( 0, 12 ); - - // Add count to RCE table - CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); - doTps = mbadb->iv_rceTable.addEntry( iv_rank, io_sc, count ); - } - else - io_sc.service_data->setServiceCall(); - - if ( doTps ) - { - o_rc = addTdQueueEntryTPS( iv_rank, io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "addTdQueueEntryTPS() failed" ); - break; - } - } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::handleHardCeEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc, const CenAddr & i_addr ) { diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H index 40f67247d..2838d288a 100755 --- 
a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_rt.H @@ -261,31 +261,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon */ int32_t handleCeEte_Tps( STEP_CODE_DATA_STRUCT & io_sc ); - /** - * @brief Handles UEs during background scrub. - * @param io_sc The step code data struct. - * @param i_addr The address in which the maintenance command stopped. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleUe_NonTd( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_addr ); - - /** - * @brief Handles MPEs during background scrub. - * @param io_sc The step code data struct. - * @param i_addr The address in which the maintenance command stopped. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleMpe_NonTd( STEP_CODE_DATA_STRUCT & io_sc, - const CenAddr & i_addr ); - - /** - * @brief Handles RCE ETEs during background scrub. - * @param io_sc The step code data struct. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleRceEte_NonTd( STEP_CODE_DATA_STRUCT & io_sc ); - /** * @brief Handles hard CE ETEs during background scrub. * @param io_sc The step code data struct. 
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C index e0835df7e..b42daa446 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C @@ -524,12 +524,13 @@ uint32_t __handleNceEte( ExtensibleChip * i_chip, TdQueue & io_queue, //------------------------------------------------------------------------------ template -uint32_t __handleRceEte( ExtensibleChip * i_chip, bool & o_errorsFound, +uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, + const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); template<> -uint32_t __handleRceEte( ExtensibleChip * i_chip, - bool & o_errorsFound, +uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, + const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[__handleRceEte] " @@ -557,7 +558,7 @@ uint32_t __handleRceEte( ExtensibleChip * i_chip, o_errorsFound = true; io_sc.service_data->AddSignatureList( i_chip->getTrgt(), PRDFSIG_MaintIUE ); - o_rc = MemEcc::analyzeMaintIue(i_chip, io_sc); + o_rc = MemEcc::handleMemIue( i_chip, i_rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "analyzeMaintIue(0x%08x) failed", @@ -572,18 +573,61 @@ uint32_t __handleRceEte( ExtensibleChip * i_chip, #undef PRDF_FUNC } -/* TODO RTC 157888 template<> -uint32_t __handleRceEte( ExtensibleChip * i_chip, - bool & o_errorsFound, +uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, + const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[__handleRceEte] " uint32_t o_rc = SUCCESS; + TargetHandle_t trgt = i_chip->getTrgt(); + + o_errorsFound = true; + io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintRETRY_CTE ); + + // Add the rank to the callout list. 
+ MemoryMru mm { trgt, i_rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); + do { + bool doTps = true; + + if ( mfgMode() ) + { + ExtensibleChip * membChip = getConnectedParent(i_chip, TYPE_MEMBUF); + + // Get the current RCE count from hardware. + const char * reg_str = (0 == i_chip->getPos()) ? "MBA0_MBSEC1" + : "MBA1_MBSEC1"; + SCAN_COMM_REGISTER_CLASS * reg = membChip->getRegister( reg_str ); + o_rc = reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str ); + break; + } + uint16_t count = reg->GetBitFieldJustified( 0, 12 ); + + // Add the count to RCE table. + doTps = getMbaDataBundle(i_chip)->iv_rceTable.addEntry( i_rank, + io_sc, + count ); + } + else + { + // The RCE threshold was set to the maximum. If we hit this then + // there is definitely a problem. + io_sc.service_data->setServiceCall(); + } + + // Add a TPS procedure to the queue, if needed. + if ( doTps ) + { + io_queue.push( new TpsEvent(i_chip, i_rank) ); + } } while (0); @@ -591,7 +635,6 @@ uint32_t __handleRceEte( ExtensibleChip * i_chip, #undef PRDF_FUNC } -*/ //------------------------------------------------------------------------------ @@ -694,7 +737,8 @@ uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, if ( 0 != (eccAttns & MAINT_RCE_ETE) ) { - o_rc = __handleRceEte( i_chip, o_errorsFound, io_sc ); + o_rc = __handleRceEte( i_chip, io_queue, rank, o_errorsFound, + io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleRceEte(0x%08x) failed", huid ); -- cgit v1.2.1