diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2013-10-30 17:29:34 -0500 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2013-11-13 10:21:53 -0600 |
commit | 4e0d9f538d8c37dd8965e9a5744a78deda359714 (patch) | |
tree | 126b2c3612f4e0d5053486201ca0fe901af95bcb | |
parent | 1ad8af50954d5eb8785a2dd0803db4245c01f396 (diff) | |
download | talos-hostboot-4e0d9f538d8c37dd8965e9a5744a78deda359714.tar.gz talos-hostboot-4e0d9f538d8c37dd8965e9a5744a78deda359714.zip |
PRD: bad path in RT/IPL TD controllers
Also moved handleMCE_VCM2(), handleMCE_DSD2(), checkEccErrors(),
and prepareNextCmd() from Hostboot-only code to the common TD
controller code. No changes were made to these functions; the move
simply prepares for future code.
Change-Id: Id0c46f6963f66b22d603b7345d95b323b5c4b02b
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/6993
Tested-by: Jenkins Server
Reviewed-by: Christopher T. Phan <cphan@us.ibm.com>
Reviewed-by: Prem Shanker Jha <premjha2@in.ibm.com>
Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Reviewed-by: Zane Shelley <zshelle@us.ibm.com>
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/7211
4 files changed, 538 insertions, 520 deletions
diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C index 65628e581..b6bc34e73 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C @@ -23,6 +23,14 @@ #include <prdfCenMbaTdCtlr_common.H> +// Framework includes +#include <prdfRegisterCache.H> + +// Pegasus includes +#include <prdfCalloutUtil.H> +#include <prdfCenDqBitmap.H> +#include <prdfCenMbaDataBundle.H> + using namespace TARGETING; namespace PRDF @@ -32,6 +40,13 @@ using namespace PlatServices; //------------------------------------------------------------------------------ +bool CenMbaTdCtlrCommon::isInTdMode() +{ + return ( (NO_OP != iv_tdState) && (MAX_TD_STATE > iv_tdState) ); +} + +//------------------------------------------------------------------------------ + int32_t CenMbaTdCtlrCommon::cleanupPrevCmd() { #define PRDF_FUNC "[CenMbaTdCtlrCommon::cleanupPrevCmd] " @@ -71,6 +86,107 @@ int32_t CenMbaTdCtlrCommon::cleanupPrevCmd() //------------------------------------------------------------------------------ +int32_t CenMbaTdCtlrCommon::prepareNextCmd() +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::prepareNextCmd] " + + int32_t o_rc = SUCCESS; + + do + { + CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); + ExtensibleChip * membChip = mbadb->getMembChip(); + if ( NULL == membChip ) + { + PRDF_ERR( PRDF_FUNC"getMembChip() failed" ); + o_rc = FAIL; break; + } + + uint32_t mbaPos = getTargetPosition( iv_mbaChip->GetChipHandle() ); + + //---------------------------------------------------------------------- + // Clean up previous command + //---------------------------------------------------------------------- + + o_rc = cleanupPrevCmd(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" ); + break; + } + + //---------------------------------------------------------------------- + // Clear 
ECC counters + //---------------------------------------------------------------------- + + const char * reg_str = ( 0 == mbaPos ) ? "MBA0_MBSTR" : "MBA1_MBSTR"; + SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); + o_rc = mbstr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); + break; + } + + mbstr->SetBit(53); // Setting this bit clears all counters. + + o_rc = mbstr->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); + break; + } + + // Hardware automatically clears bit 53, so flush this register out of + // the register cache to avoid clearing the counters again with a write + // from the out-of-date cached copy. + RegDataCache & cache = RegDataCache::getCachedRegisters(); + cache.flush( membChip, mbstr ); + + //---------------------------------------------------------------------- + // Clear ECC FIRs + //---------------------------------------------------------------------- + + reg_str = ( 0 == mbaPos ) ? "MBA0_MBSECCFIR_AND" : "MBA1_MBSECCFIR_AND"; + SCAN_COMM_REGISTER_CLASS * firand = membChip->getRegister( reg_str ); + firand->setAllBits(); + + // Clear MPE bit for this rank. 
+ firand->ClearBit( 20 + iv_rank.getMaster() ); + + // Clear NCE, SCE, MCE, RCE, SUE, UE bits (36-41) + firand->SetBitFieldJustified( 36, 6, 0 ); + + o_rc = firand->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); + break; + } + + SCAN_COMM_REGISTER_CLASS * spaAnd = + iv_mbaChip->getRegister("MBASPA_AND"); + spaAnd->setAllBits(); + + // clear threshold exceeded attentions + spaAnd->SetBitFieldJustified( 1, 4, 0 ); + + o_rc = spaAnd->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on MBASPA_AND" ); + o_rc = FAIL; break; + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + int32_t CenMbaTdCtlrCommon::chipMarkCleanup() { #define PRDF_FUNC "[CenMbaTdCtlrCommon::chipMarkCleanup] " @@ -102,9 +218,351 @@ int32_t CenMbaTdCtlrCommon::chipMarkCleanup() //------------------------------------------------------------------------------ -bool CenMbaTdCtlrCommon::isInTdMode() +int32_t CenMbaTdCtlrCommon::checkEccErrors( uint16_t & o_eccErrorMask ) { - return ( (NO_OP != iv_tdState) && (MAX_TD_STATE > iv_tdState) ); + #define PRDF_FUNC "[CenMbaTdCtlrCommon::checkEccErrors] " + + int32_t o_rc = SUCCESS; + + o_eccErrorMask = NO_ERROR; + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + do + { + CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); + ExtensibleChip * membChip = mbadb->getMembChip(); + if ( NULL == membChip ) + { + PRDF_ERR( PRDF_FUNC"getMembChip() failed: MBA=0x%08x", + getHuid(mba) ); + o_rc = FAIL; break; + } + + const char * reg_str = ( 0 == getTargetPosition(mba) ) + ? 
"MBA0_MBSECCFIR" : "MBA1_MBSECCFIR"; + SCAN_COMM_REGISTER_CLASS * mbsEccFir = membChip->getRegister( reg_str ); + + o_rc = mbsEccFir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); + break; + } + + if ( mbsEccFir->IsBitSet(20 + iv_rank.getMaster()) ) + { + o_eccErrorMask |= MPE; + + // Clean up side-effect FIRs that may be set due to the chip mark. + o_rc = chipMarkCleanup(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"chipMarkCleanup() failed" ); + break; + } + } + + if ( mbsEccFir->IsBitSet(38) ) o_eccErrorMask |= MCE; + if ( mbsEccFir->IsBitSet(41) ) o_eccErrorMask |= UE; + + SCAN_COMM_REGISTER_CLASS * mbaSpaFir = + iv_mbaChip->getRegister("MBASPA"); + o_rc = mbaSpaFir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Failed to read MBASPA Regsiter"); + break; + } + + if ( mbaSpaFir->IsBitSet(1) ) o_eccErrorMask |= HARD_CTE; + if ( mbaSpaFir->IsBitSet(2) ) o_eccErrorMask |= SOFT_CTE; + if ( mbaSpaFir->IsBitSet(3) ) o_eccErrorMask |= INTER_CTE; + if ( mbaSpaFir->IsBitSet(4) ) o_eccErrorMask |= RETRY_CTE; + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_VCM2] " + + int32_t o_rc = SUCCESS; + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + do + { + if ( VCM_PHASE_2 != iv_tdState ) + { + PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); + o_rc = FAIL; break; + } + + io_sc.service_data->SetErrorSig( PRDFSIG_VcmVerified ); + + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + + if ( areDramRepairsDisabled() ) + { + iv_tdState = NO_OP; // The TD procedure is complete. + + io_sc.service_data->SetServiceCall(); + + break; // nothing else to do. + } + + bool startDsdProcedure = false; + + // Read VPD. 
+ CenDqBitmap bitmap; + o_rc = getBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" ); + break; + } + + // The chip mark is considered verified, so set it in VPD. + // NOTE: If this chip mark was placed on the spare, the original failing + // DRAM will have already been set in VPD so this will be + // redundant but it simplifies the rest of the logic below. + o_rc = bitmap.setDram( iv_mark.getCM().getSymbol() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setDram() failed" ); + break; + } + + // RAS callout policies can be determined by the DIMM type. We can + // assume IS DIMMs are on low end systems and Centaur DIMMs are on + // mid/high end systems. + bool isCenDimm = false; + o_rc = isMembufOnDimm( mba, isCenDimm ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"isMembufOnDimm() failed" ); + break; + } + + if ( isCenDimm ) // Medium/high end systems + { + uint8_t ps = iv_mark.getCM().getPortSlct(); + + // It is possible that a Centaur DIMM does not have spare DRAMs. + // Check the VPD for available spares. Note that a x4 DIMM may have + // one or two spare DRAMs so check for availability on both. + // TODO: RTC 68096 Add support for x4 DRAMs. + bool dramSparePossible = false; + o_rc = bitmap.isDramSpareAvailable( ps, dramSparePossible ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"isDramSpareAvailable() failed" ); + break; + } + + if ( dramSparePossible ) + { + // Verify the spare is not already used. + CenSymbol sp0, sp1, ecc; + // TODO: RTC 68096 need to support ECC spare. + o_rc = mssGetSteerMux( mba, iv_rank, sp0, sp1, ecc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed" ); + break; + } + + if ( ((0 == ps) && !sp0.isValid()) || + ((1 == ps) && !sp1.isValid()) ) + { + // A spare DRAM is available. + startDsdProcedure = true; + } + else if ( iv_mark.getCM().getDram() == + (0 == ps ? 
sp0.getDram() : sp1.getDram()) ) + { + io_sc.service_data->SetErrorSig( PRDFSIG_VcmBadSpare ); + + // The chip mark was on the spare DRAM and it is bad, so + // call it out and set it in VPD. + + MemoryMru memmru ( mba, iv_rank, iv_mark.getCM() ); + memmru.setDramSpared(); + io_sc.service_data->SetCallout( memmru ); + io_sc.service_data->SetServiceCall(); + + o_rc = bitmap.setDramSpare( ps ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setDramSpare() failed" ); + break; + } + } + else + { + // Chip mark and DRAM spare are both used. + io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); + io_sc.service_data->SetServiceCall(); + } + } + else + { + // Chip mark is in place and sparing is not possible. + io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); + io_sc.service_data->SetServiceCall(); + } + } + else // Low end systems + { + // Not able to do dram sparing. If there is a symbol mark, there are + // no repairs available so call it out and set the error log to + // predictive. + if ( iv_mark.getSM().isValid() ) + { + io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); + io_sc.service_data->SetServiceCall(); + } + } + + // Write VPD. + o_rc = setBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" ); + break; + } + + // Start DSD Phase 1, if possible. + if ( startDsdProcedure ) + { + o_rc = startDsdPhase1( io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"startDsdPhase1() failed" ); + break; + } + } + else + { + iv_tdState = NO_OP; // The TD procedure is complete. 
+ } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +int32_t CenMbaTdCtlrCommon::handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_DSD2] " + + int32_t o_rc = SUCCESS; + + io_sc.service_data->SetErrorSig( PRDFSIG_DsdBadSpare ); + io_sc.service_data->SetServiceCall(); + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + do + { + if ( DSD_PHASE_2 != iv_tdState ) + { + PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); + o_rc = FAIL; break; + } + + // Callout mark and spare DRAM. + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + + MemoryMru memmru ( mba, iv_rank, iv_mark.getCM() ); + memmru.setDramSpared(); + io_sc.service_data->SetCallout( memmru ); + + // The spare DRAM is bad, so set it in VPD. At this point, the chip mark + // should have already been set in the VPD because it was recently + // verified. + + CenDqBitmap bitmap; + o_rc = getBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" ); + break; + } + + o_rc = bitmap.setDramSpare( iv_mark.getCM().getPortSlct() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setDramSpare() failed" ); + break; + } + + o_rc = setBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" ); + break; + } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +void CenMbaTdCtlrCommon::badPathErrorHandling( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::badPathErrorHandling] " + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + PRDF_ERR( PRDF_FUNC"iv_mbaChip:0x%08x iv_initialized:%c iv_tdState:%d " + "iv_rank:M%dS%d iv_mark:%2d %2d", getHuid(mba), + iv_initialized ? 
'T' : 'F', iv_tdState, iv_rank.getMaster(), + iv_rank.getSlave(), iv_mark.getCM().getSymbol(), + iv_mark.getSM().getSymbol() ); + + iv_tdState = NO_OP; + + int32_t l_rc = cleanupPrevCmd(); // Just in case. + if ( SUCCESS != l_rc ) + PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" ); + + io_sc.service_data->SetErrorSig( PRDFSIG_MaintCmdComplete_ERROR ); + io_sc.service_data->SetServiceCall(); + + // There may have been a code bug, callout 2nd level support. + io_sc.service_data->SetCallout( NextLevelSupport_ENUM, MRU_HIGH ); + + // Callout the rank if no other callouts have been made (besides 2nd + // Level Support). Note that iv_mark is not always guaranteed to be + // valid for every error scenario. For simplicity, callout the rank that + // was targeted with low priority. + if ( 1 == io_sc.service_data->GetMruList().size() ) + { + MemoryMru memmru ( mba, iv_rank, MemoryMruData::CALLOUT_RANK ); + io_sc.service_data->SetCallout( memmru, MRU_LOW ); + } + + // Just in case it was a legitimate maintenance command complete (error + // log not committed) but something else failed. + io_sc.service_data->ClearFlag(ServiceDataCollector::DONT_COMMIT_ERRL); + + #undef PRDF_FUNC } } // end namespace PRDF diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H index 6e5644e00..50af6b034 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H @@ -83,6 +83,18 @@ class CenMbaTdCtlrCommon MAX_TD_STATE ///< The maximum number of TD states. 
}; + enum EccErrorMask + { + NO_ERROR = 0, ///< No ECC errors found + UE = 0x01, ///< UE + MPE = 0x02, ///< Chip mark placed + MCE = 0x04, ///< CE on chip mark + HARD_CTE = 0x08, ///< Hard CE threshold exceeed + SOFT_CTE = 0x10, ///< Soft CE threshold exceeed + INTER_CTE = 0x20, ///< Intermittent CE threshold exceeed + RETRY_CTE = 0x40, ///< Retry CE threshold exceeed + }; + public: // functions /** @@ -269,12 +281,54 @@ class CenMbaTdCtlrCommon virtual int32_t cleanupPrevCmd(); /** + * @brief Preforms cleanup tasks that need to be done before starting the + * next maintenance command (i.e. clear scrub counter). + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + virtual int32_t prepareNextCmd(); + + /** * @brief Clears FIR bits that may have been a side-effect of a chip mark * placed by hardware. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ virtual int32_t chipMarkCleanup(); + /** + * @brief Checks if ECC errors have occurred during a maintenance command. + * @param o_eccErrorMask Bitwise mask indicating which ECC errors have + * occurred. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + virtual int32_t checkEccErrors( uint16_t & o_eccErrorMask ); + + /** + * @brief Handle MCE event during VCM Phase 2 + * @param io_sc Service data collector. + * @note This will update bad bits information in VPD, set callouts, and + * start the DRAM sparing procedure, if possible. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + virtual int32_t handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ); + + /** + * @brief Handle MCE event during DSD Phase 2 + * @param io_sc Service data collector. + * @note This will update bad bits information in VPD and set callouts. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. 
+ */ + virtual int32_t handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc ); + + /** + * @brief This class is designed such that all functions will eventually + * return any bad error code to the top level public functions such + * as handleCmdCompleteEvent() and handleTdEvent(). This is a common + * function to handle everything needed to that the TD controller + * can hopefully fail gracefully. + * @param io_sc The step code data struct. + */ + virtual void badPathErrorHandling( STEP_CODE_DATA_STRUCT & io_sc ); + protected: // instance variables /** The memory controller chip that this TD controller acts on. */ diff --git a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C index d26a75214..569a07fd2 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C +++ b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C @@ -29,7 +29,6 @@ #include <prdfExtensibleChip.H> #include <prdfGlobal.H> #include <prdfPlatServices.H> -#include <prdfRegisterCache.H> #include <prdfTrace.H> // Pegasus includes @@ -50,18 +49,6 @@ namespace PRDF using namespace PlatServices; -enum EccErrorMask -{ - NO_ERROR = 0, ///< No ECC errors found - UE = 0x01, ///< UE - MPE = 0x02, ///< Chip mark placed - MCE = 0x04, ///< CE on chip mark - HARD_CTE = 0x08, ///< Hard CE threshold exceeed - SOFT_CTE = 0x10, ///< Soft CE threshold exceeed - INTER_CTE = 0x20, ///< Intermittent CE threshold exceeed - RETRY_CTE = 0x40, ///< Retry CE threshold exceeed -}; - //------------------------------------------------------------------------------ // Class Variables //------------------------------------------------------------------------------ @@ -144,39 +131,13 @@ int32_t CenMbaTdCtlr::handleCmdCompleteEvent( STEP_CODE_DATA_STRUCT & io_sc ) if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC"iv_mbaChip:0x%08x iv_initialized:%c iv_tdState:%d " - "iv_rank:M%dS%d iv_mark:%2d %2d", getHuid(mba), - iv_initialized ? 
'T' : 'F', iv_tdState, iv_rank.getMaster(), - iv_rank.getSlave(), iv_mark.getCM().getSymbol(), - iv_mark.getSM().getSymbol() ); + PRDF_ERR( PRDF_FUNC"Failed." ); + badPathErrorHandling( io_sc ); - int32_t l_rc = cleanupPrevCmd(); // Just in case. - if ( SUCCESS != l_rc ) - PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" ); - - l_rc = mdiaSendEventMsg( mba, MDIA::SKIP_MBA ); + // Tell MDIA to skip further analysis on this MBA. + int32_t l_rc = mdiaSendEventMsg( mba, MDIA::SKIP_MBA ); if ( SUCCESS != l_rc ) PRDF_ERR( PRDF_FUNC"mdiaSendEventMsg(SKIP_MBA) failed" ); - - io_sc.service_data->SetErrorSig( PRDFSIG_MaintCmdComplete_ERROR ); - io_sc.service_data->SetServiceCall(); - - // There may have been a code bug, callout 2nd level support. - io_sc.service_data->SetCallout( NextLevelSupport_ENUM, MRU_HIGH ); - - // Callout the rank if no other callouts have been made (besides 2nd - // Level Support). Note that iv_mark is not always guaranteed to be - // valid for every error scenario. For simplicity, callout the rank that - // was targeted with low priority. - if ( 1 == io_sc.service_data->GetMruList().size() ) - { - MemoryMru memmru ( mba, iv_rank, MemoryMruData::CALLOUT_RANK ); - io_sc.service_data->SetCallout( memmru, MRU_LOW ); - } - - // Just in case it was a legitimate maintenance command complete (error - // log not committed) but something else failed. - io_sc.service_data->ClearFlag(ServiceDataCollector::DONT_COMMIT_ERRL); } return o_rc; @@ -192,35 +153,15 @@ int32_t CenMbaTdCtlr::handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc, { #define PRDF_FUNC "[CenMbaTdCtlr::handleTdEvent] " - int32_t o_rc = SUCCESS; - - TargetHandle_t mba = iv_mbaChip->GetChipHandle(); - - do - { - o_rc = initialize(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"initialize() failed" ); - break; - } - - // This is a no-op in Hostboot. Instead, print a trace statement - // indicating the intended request. 
- PRDF_INF( PRDF_FUNC"TD request found during Hostboot: " - "iv_mbaChip=0x%08x i_rank=M%dS%d i_event=%d", - getHuid(mba), i_rank.getMaster(), i_rank.getSlave(), - i_event ); - - } while(0); - - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"iv_mbaChip:0x%08x iv_initialized:%c iv_tdState:%d", - getHuid(mba), iv_initialized ? 'T' : 'F', iv_tdState ); - } + // This is a no-op in Hostboot because we can't support Targeted Diagnostics + // at this time. Instead, print a trace statement indicating the intended + // request. Note that any VCM request will eventually be found during the + // initialization of the runtime TD controller. + PRDF_INF( PRDF_FUNC"TD request found during Hostboot: iv_mbaChip=0x%08x " + "i_rank=M%dS%d i_event=%d", iv_mbaChip->GetId(), + i_rank.getMaster(), i_rank.getSlave(), i_event ); - return o_rc; + return SUCCESS; #undef PRDF_FUNC } @@ -356,6 +297,9 @@ int32_t CenMbaTdCtlr::analyzeVcmPhase1( STEP_CODE_DATA_STRUCT & io_sc ) o_rc = FAIL; break; } + // Add the mark to the callout list. + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + // Get error condition which caused command to stop uint16_t eccErrorMask = NO_ERROR; o_rc = checkEccErrors( eccErrorMask ); @@ -377,8 +321,6 @@ int32_t CenMbaTdCtlr::analyzeVcmPhase1( STEP_CODE_DATA_STRUCT & io_sc ) } else { - CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); - // Start VCM Phase 2 o_rc = startVcmPhase2( io_sc ); if ( SUCCESS != o_rc ) @@ -413,6 +355,9 @@ int32_t CenMbaTdCtlr::analyzeVcmPhase2( STEP_CODE_DATA_STRUCT & io_sc ) o_rc = FAIL; break; } + // Add the mark to the callout list. 
+ CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + // Get error condition which caused command to stop uint16_t eccErrorMask = NO_ERROR; o_rc = checkEccErrors( eccErrorMask ); @@ -452,8 +397,6 @@ int32_t CenMbaTdCtlr::analyzeVcmPhase2( STEP_CODE_DATA_STRUCT & io_sc ) io_sc.service_data->SetErrorSig( PRDFSIG_VcmFalseAlarm ); - CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); - // In the field, this error log will be recoverable for now, but we // may have to add thresholding later if they become a problem. In // manufacturing, this error log will be predictive. @@ -497,6 +440,9 @@ int32_t CenMbaTdCtlr::analyzeDsdPhase1( STEP_CODE_DATA_STRUCT & io_sc ) o_rc = FAIL; break; } + // Add the mark to the callout list. + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + // Get error condition which caused command to stop uint16_t eccErrorMask = NO_ERROR; o_rc = checkEccErrors( eccErrorMask ); @@ -518,8 +464,6 @@ int32_t CenMbaTdCtlr::analyzeDsdPhase1( STEP_CODE_DATA_STRUCT & io_sc ) } else { - CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); - // Start DSD Phase 2 o_rc = startDsdPhase2( io_sc ); if ( SUCCESS != o_rc ) @@ -554,6 +498,9 @@ int32_t CenMbaTdCtlr::analyzeDsdPhase2( STEP_CODE_DATA_STRUCT & io_sc ) o_rc = FAIL; break; } + // Add the mark to the callout list. + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + // Get error condition which caused command to stop uint16_t eccErrorMask = NO_ERROR; o_rc = checkEccErrors( eccErrorMask ); @@ -591,8 +538,6 @@ int32_t CenMbaTdCtlr::analyzeDsdPhase2( STEP_CODE_DATA_STRUCT & io_sc ) io_sc.service_data->SetErrorSig( PRDFSIG_DsdDramSpared ); - CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); - // Remove chip mark from hardware. 
iv_mark.clearCM(); bool junk; @@ -1066,77 +1011,6 @@ int32_t CenMbaTdCtlr::startTpsPhase2( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::checkEccErrors( uint16_t & o_eccErrorMask ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::checkEccErrors] " - - int32_t o_rc = SUCCESS; - - o_eccErrorMask = NO_ERROR; - - TargetHandle_t mba = iv_mbaChip->GetChipHandle(); - - do - { - CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); - ExtensibleChip * membChip = mbadb->getMembChip(); - if ( NULL == membChip ) - { - PRDF_ERR( PRDF_FUNC"getMembChip() failed: MBA=0x%08x", - getHuid(mba) ); - o_rc = FAIL; break; - } - - const char * reg_str = ( 0 == getTargetPosition(mba) ) - ? "MBA0_MBSECCFIR" : "MBA1_MBSECCFIR"; - SCAN_COMM_REGISTER_CLASS * mbsEccFir = membChip->getRegister( reg_str ); - - o_rc = mbsEccFir->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); - break; - } - - if ( mbsEccFir->IsBitSet(20 + iv_rank.getMaster()) ) - { - o_eccErrorMask |= MPE; - - // Clean up side-effect FIRs that may be set due to the chip mark. 
- o_rc = chipMarkCleanup(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"chipMarkCleanup() failed" ); - break; - } - } - - if ( mbsEccFir->IsBitSet(38) ) o_eccErrorMask |= MCE; - if ( mbsEccFir->IsBitSet(41) ) o_eccErrorMask |= UE; - - SCAN_COMM_REGISTER_CLASS * mbaSpaFir = - iv_mbaChip->getRegister("MBASPA"); - o_rc = mbaSpaFir->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Failed to read MBASPA Regsiter"); - break; - } - - if ( mbaSpaFir->IsBitSet(1) ) o_eccErrorMask |= HARD_CTE; - if ( mbaSpaFir->IsBitSet(2) ) o_eccErrorMask |= SOFT_CTE; - if ( mbaSpaFir->IsBitSet(3) ) o_eccErrorMask |= INTER_CTE; - if ( mbaSpaFir->IsBitSet(4) ) o_eccErrorMask |= RETRY_CTE; - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::handleUE( STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[CenMbaTdCtlr::handleUE] " @@ -1289,241 +1163,6 @@ int32_t CenMbaTdCtlr::handleMPE( STEP_CODE_DATA_STRUCT & io_sc ) } //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleMCE_VCM2] " - - int32_t o_rc = SUCCESS; - - TargetHandle_t mba = iv_mbaChip->GetChipHandle(); - - do - { - if ( VCM_PHASE_2 != iv_tdState ) - { - PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); - o_rc = FAIL; break; - } - - io_sc.service_data->SetErrorSig( PRDFSIG_VcmVerified ); - - CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); - - if ( areDramRepairsDisabled() ) - { - iv_tdState = NO_OP; // The TD procedure is complete. - - io_sc.service_data->SetServiceCall(); - - break; // nothing else to do. - } - - bool startDsdProcedure = false; - - // Read VPD. 
- CenDqBitmap bitmap; - o_rc = getBadDqBitmap( mba, iv_rank, bitmap ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" ); - break; - } - - // The chip mark is considered verified, so set it in VPD. - // NOTE: If this chip mark was placed on the spare, the original failing - // DRAM will have already been set in VPD so this will be - // redundant but it simplifies the rest of the logic below. - o_rc = bitmap.setDram( iv_mark.getCM().getSymbol() ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"setDram() failed" ); - break; - } - - // RAS callout policies can be determined by the DIMM type. We can - // assume IS DIMMs are on low end systems and Centaur DIMMs are on - // mid/high end systems. - bool isCenDimm = false; - o_rc = isMembufOnDimm( mba, isCenDimm ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"isMembufOnDimm() failed" ); - break; - } - - if ( isCenDimm ) // Medium/high end systems - { - uint8_t ps = iv_mark.getCM().getPortSlct(); - - // It is possible that a Centaur DIMM does not have spare DRAMs. - // Check the VPD for available spares. Note that a x4 DIMM may have - // one or two spare DRAMs so check for availability on both. - // TODO: RTC 68096 Add support for x4 DRAMs. - bool dramSparePossible = false; - o_rc = bitmap.isDramSpareAvailable( ps, dramSparePossible ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"isDramSpareAvailable() failed" ); - break; - } - - if ( dramSparePossible ) - { - // Verify the spare is not already used. - CenSymbol sp0, sp1, ecc; - // TODO: RTC 68096 need to support ECC spare. - o_rc = mssGetSteerMux( mba, iv_rank, sp0, sp1, ecc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed" ); - break; - } - - if ( ((0 == ps) && !sp0.isValid()) || - ((1 == ps) && !sp1.isValid()) ) - { - // A spare DRAM is available. - startDsdProcedure = true; - } - else if ( iv_mark.getCM().getDram() == - (0 == ps ? 
sp0.getDram() : sp1.getDram()) ) - { - io_sc.service_data->SetErrorSig( PRDFSIG_VcmBadSpare ); - - // The chip mark was on the spare DRAM and it is bad, so - // call it out and set it in VPD. - - MemoryMru memmru ( mba, iv_rank, iv_mark.getCM() ); - memmru.setDramSpared(); - io_sc.service_data->SetCallout( memmru ); - io_sc.service_data->SetServiceCall(); - - o_rc = bitmap.setDramSpare( ps ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"setDramSpare() failed" ); - break; - } - } - else - { - // Chip mark and DRAM spare are both used. - io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); - io_sc.service_data->SetServiceCall(); - } - } - else - { - // Chip mark is in place and sparing is not possible. - io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); - io_sc.service_data->SetServiceCall(); - } - } - else // Low end systems - { - // Not able to do dram sparing. If there is a symbol mark, there are - // no repairs available so call it out and set the error log to - // predictive. - if ( iv_mark.getSM().isValid() ) - { - io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); - io_sc.service_data->SetServiceCall(); - } - } - - // Write VPD. - o_rc = setBadDqBitmap( mba, iv_rank, bitmap ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" ); - break; - } - - // Start DSD Phase 1, if possible. - if ( startDsdProcedure ) - { - o_rc = startDsdPhase1( io_sc ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"startDsdPhase1() failed" ); - break; - } - } - else - { - iv_tdState = NO_OP; // The TD procedure is complete. 
- } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - -int32_t CenMbaTdCtlr::handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[CenMbaTdCtlr::handleMCE_DSD2] " - - int32_t o_rc = SUCCESS; - - io_sc.service_data->SetErrorSig( PRDFSIG_DsdBadSpare ); - io_sc.service_data->SetServiceCall(); - - TargetHandle_t mba = iv_mbaChip->GetChipHandle(); - - do - { - if ( DSD_PHASE_2 != iv_tdState ) - { - PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); - o_rc = FAIL; break; - } - - // Callout mark and spare DRAM. - CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); - - MemoryMru memmru ( mba, iv_rank, iv_mark.getCM() ); - memmru.setDramSpared(); - io_sc.service_data->SetCallout( memmru ); - - // The spare DRAM is bad, so set it in VPD. At this point, the chip mark - // should have already been set in the VPD because it was recently - // verified. - - CenDqBitmap bitmap; - o_rc = getBadDqBitmap( mba, iv_rank, bitmap ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" ); - break; - } - - o_rc = bitmap.setDramSpare( iv_mark.getCM().getPortSlct() ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"setDramSpare() failed" ); - break; - } - - o_rc = setBadDqBitmap( mba, iv_rank, bitmap ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" ); - break; - } - - } while(0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::exitTdSequence() { #define PRDF_FUNC "[CenMbaTdCtlr::exitTdSequence] " @@ -1567,107 +1206,6 @@ int32_t CenMbaTdCtlr::exitTdSequence() //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlr::prepareNextCmd() -{ - #define PRDF_FUNC "[CenMbaTdCtlr::prepareNextCmd] " - - int32_t o_rc = SUCCESS; - - do - { - CenMbaDataBundle * mbadb = 
getMbaDataBundle( iv_mbaChip ); - ExtensibleChip * membChip = mbadb->getMembChip(); - if ( NULL == membChip ) - { - PRDF_ERR( PRDF_FUNC"getMembChip() failed" ); - o_rc = FAIL; break; - } - - uint32_t mbaPos = getTargetPosition( iv_mbaChip->GetChipHandle() ); - - //---------------------------------------------------------------------- - // Clean up previous command - //---------------------------------------------------------------------- - - o_rc = cleanupPrevCmd(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" ); - break; - } - - //---------------------------------------------------------------------- - // Clear ECC counters - //---------------------------------------------------------------------- - - const char * reg_str = ( 0 == mbaPos ) ? "MBA0_MBSTR" : "MBA1_MBSTR"; - SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); - o_rc = mbstr->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); - break; - } - - mbstr->SetBit(53); // Setting this bit clears all counters. - - o_rc = mbstr->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); - break; - } - - // Hardware automatically clears bit 53, so flush this register out of - // the register cache to avoid clearing the counters again with a write - // from the out-of-date cached copy. - RegDataCache & cache = RegDataCache::getCachedRegisters(); - cache.flush( membChip, mbstr ); - - //---------------------------------------------------------------------- - // Clear ECC FIRs - //---------------------------------------------------------------------- - - reg_str = ( 0 == mbaPos ) ? "MBA0_MBSECCFIR_AND" : "MBA1_MBSECCFIR_AND"; - SCAN_COMM_REGISTER_CLASS * firand = membChip->getRegister( reg_str ); - firand->setAllBits(); - - // Clear MPE bit for this rank. 
- firand->ClearBit( 20 + iv_rank.getMaster() ); - - // Clear NCE, SCE, MCE, RCE, SUE, UE bits (36-41) - firand->SetBitFieldJustified( 36, 6, 0 ); - - o_rc = firand->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); - break; - } - - SCAN_COMM_REGISTER_CLASS * spaAnd = - iv_mbaChip->getRegister("MBASPA_AND"); - spaAnd->setAllBits(); - - // clear threshold exceeded attentions - spaAnd->SetBitFieldJustified( 1, 4, 0 ); - - o_rc = spaAnd->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Write() failed on MBASPA_AND" ); - o_rc = FAIL; break; - } - - } while (0); - - return o_rc; - - #undef PRDF_FUNC -} - -//------------------------------------------------------------------------------ - int32_t CenMbaTdCtlr::signalMdiaCmdComplete() { #define PRDF_FUNC "[CenMbaTdCtlr::signalMdiaCmdComplete] " diff --git a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H index b1118e009..0fdb3e47b 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H +++ b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H @@ -87,14 +87,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon private: // functions /** - * @brief Checks if ECC errors have occurred during a maintenance command. - * @param o_eccErrorMask Bitwise mask indicating which ECC errors have - * occurred. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t checkEccErrors( uint16_t & o_eccErrorMask ); - - /** * @brief Handle UEs during TD analysis. * @param io_sc Service data collector. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. @@ -110,23 +102,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon int32_t handleMPE( STEP_CODE_DATA_STRUCT & io_sc ); /** - * @brief Handle MCE event during VCM Phase 2 - * @param io_sc Service data collector. - * @note This will update bad bits information in VPD, set callouts, and - * start the DRAM sparing procedure, if possible. 
- * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ); - - /** - * @brief Handle MCE event during DSD Phase 2 - * @param io_sc Service data collector. - * @note This will update bad bits information in VPD and set callouts. - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc ); - - /** * @brief Handle cleanup when TD sequence is complete and TD state machine * will reset. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. @@ -134,13 +109,6 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon int32_t exitTdSequence(); /** - * @brief Preforms cleanup tasks that need to be done before starting the - * next maintenance command (i.e. clear scrub counter). - * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. - */ - int32_t prepareNextCmd(); - - /** * @brief Sends a message to MDIA that a maintenance command has completed. * @note If for some reason PRD needed to do some targeted diagnotics and * on a rank that was not the last rank behind the MBA, this |