diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2013-10-30 17:29:34 -0500 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2013-11-13 10:21:53 -0600 |
commit | 4e0d9f538d8c37dd8965e9a5744a78deda359714 (patch) | |
tree | 126b2c3612f4e0d5053486201ca0fe901af95bcb /src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C | |
parent | 1ad8af50954d5eb8785a2dd0803db4245c01f396 (diff) | |
download | talos-hostboot-4e0d9f538d8c37dd8965e9a5744a78deda359714.tar.gz talos-hostboot-4e0d9f538d8c37dd8965e9a5744a78deda359714.zip |
PRD: bad path in RT/IPL TD controllers
Also moved handleMCE_VCM2(), handleMCE_DSD2(), checkEccErrors(),
and prepareNextCmd() from Hostboot-only code to the common TD
controller code. No changes were made to these functions; the move
simply prepares for future code.
Change-Id: Id0c46f6963f66b22d603b7345d95b323b5c4b02b
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/6993
Tested-by: Jenkins Server
Reviewed-by: Christopher T. Phan <cphan@us.ibm.com>
Reviewed-by: Prem Shanker Jha <premjha2@in.ibm.com>
Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Reviewed-by: Zane Shelley <zshelle@us.ibm.com>
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/7211
Diffstat (limited to 'src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C')
-rw-r--r-- | src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C | 462 |
1 files changed, 460 insertions, 2 deletions
diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C index 65628e581..b6bc34e73 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C @@ -23,6 +23,14 @@ #include <prdfCenMbaTdCtlr_common.H> +// Framework includes +#include <prdfRegisterCache.H> + +// Pegasus includes +#include <prdfCalloutUtil.H> +#include <prdfCenDqBitmap.H> +#include <prdfCenMbaDataBundle.H> + using namespace TARGETING; namespace PRDF @@ -32,6 +40,13 @@ using namespace PlatServices; //------------------------------------------------------------------------------ +bool CenMbaTdCtlrCommon::isInTdMode() +{ + return ( (NO_OP != iv_tdState) && (MAX_TD_STATE > iv_tdState) ); +} + +//------------------------------------------------------------------------------ + int32_t CenMbaTdCtlrCommon::cleanupPrevCmd() { #define PRDF_FUNC "[CenMbaTdCtlrCommon::cleanupPrevCmd] " @@ -71,6 +86,107 @@ int32_t CenMbaTdCtlrCommon::cleanupPrevCmd() //------------------------------------------------------------------------------ +int32_t CenMbaTdCtlrCommon::prepareNextCmd() +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::prepareNextCmd] " + + int32_t o_rc = SUCCESS; + + do + { + CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); + ExtensibleChip * membChip = mbadb->getMembChip(); + if ( NULL == membChip ) + { + PRDF_ERR( PRDF_FUNC"getMembChip() failed" ); + o_rc = FAIL; break; + } + + uint32_t mbaPos = getTargetPosition( iv_mbaChip->GetChipHandle() ); + + //---------------------------------------------------------------------- + // Clean up previous command + //---------------------------------------------------------------------- + + o_rc = cleanupPrevCmd(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" ); + break; + } + + //---------------------------------------------------------------------- + // Clear 
ECC counters + //---------------------------------------------------------------------- + + const char * reg_str = ( 0 == mbaPos ) ? "MBA0_MBSTR" : "MBA1_MBSTR"; + SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( reg_str ); + o_rc = mbstr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); + break; + } + + mbstr->SetBit(53); // Setting this bit clears all counters. + + o_rc = mbstr->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); + break; + } + + // Hardware automatically clears bit 53, so flush this register out of + // the register cache to avoid clearing the counters again with a write + // from the out-of-date cached copy. + RegDataCache & cache = RegDataCache::getCachedRegisters(); + cache.flush( membChip, mbstr ); + + //---------------------------------------------------------------------- + // Clear ECC FIRs + //---------------------------------------------------------------------- + + reg_str = ( 0 == mbaPos ) ? "MBA0_MBSECCFIR_AND" : "MBA1_MBSECCFIR_AND"; + SCAN_COMM_REGISTER_CLASS * firand = membChip->getRegister( reg_str ); + firand->setAllBits(); + + // Clear MPE bit for this rank. 
+ firand->ClearBit( 20 + iv_rank.getMaster() ); + + // Clear NCE, SCE, MCE, RCE, SUE, UE bits (36-41) + firand->SetBitFieldJustified( 36, 6, 0 ); + + o_rc = firand->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); + break; + } + + SCAN_COMM_REGISTER_CLASS * spaAnd = + iv_mbaChip->getRegister("MBASPA_AND"); + spaAnd->setAllBits(); + + // clear threshold exceeded attentions + spaAnd->SetBitFieldJustified( 1, 4, 0 ); + + o_rc = spaAnd->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on MBASPA_AND" ); + o_rc = FAIL; break; + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + int32_t CenMbaTdCtlrCommon::chipMarkCleanup() { #define PRDF_FUNC "[CenMbaTdCtlrCommon::chipMarkCleanup] " @@ -102,9 +218,351 @@ int32_t CenMbaTdCtlrCommon::chipMarkCleanup() //------------------------------------------------------------------------------ -bool CenMbaTdCtlrCommon::isInTdMode() +int32_t CenMbaTdCtlrCommon::checkEccErrors( uint16_t & o_eccErrorMask ) { - return ( (NO_OP != iv_tdState) && (MAX_TD_STATE > iv_tdState) ); + #define PRDF_FUNC "[CenMbaTdCtlrCommon::checkEccErrors] " + + int32_t o_rc = SUCCESS; + + o_eccErrorMask = NO_ERROR; + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + do + { + CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip ); + ExtensibleChip * membChip = mbadb->getMembChip(); + if ( NULL == membChip ) + { + PRDF_ERR( PRDF_FUNC"getMembChip() failed: MBA=0x%08x", + getHuid(mba) ); + o_rc = FAIL; break; + } + + const char * reg_str = ( 0 == getTargetPosition(mba) ) + ? 
"MBA0_MBSECCFIR" : "MBA1_MBSECCFIR"; + SCAN_COMM_REGISTER_CLASS * mbsEccFir = membChip->getRegister( reg_str ); + + o_rc = mbsEccFir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); + break; + } + + if ( mbsEccFir->IsBitSet(20 + iv_rank.getMaster()) ) + { + o_eccErrorMask |= MPE; + + // Clean up side-effect FIRs that may be set due to the chip mark. + o_rc = chipMarkCleanup(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"chipMarkCleanup() failed" ); + break; + } + } + + if ( mbsEccFir->IsBitSet(38) ) o_eccErrorMask |= MCE; + if ( mbsEccFir->IsBitSet(41) ) o_eccErrorMask |= UE; + + SCAN_COMM_REGISTER_CLASS * mbaSpaFir = + iv_mbaChip->getRegister("MBASPA"); + o_rc = mbaSpaFir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Failed to read MBASPA Regsiter"); + break; + } + + if ( mbaSpaFir->IsBitSet(1) ) o_eccErrorMask |= HARD_CTE; + if ( mbaSpaFir->IsBitSet(2) ) o_eccErrorMask |= SOFT_CTE; + if ( mbaSpaFir->IsBitSet(3) ) o_eccErrorMask |= INTER_CTE; + if ( mbaSpaFir->IsBitSet(4) ) o_eccErrorMask |= RETRY_CTE; + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_VCM2] " + + int32_t o_rc = SUCCESS; + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + do + { + if ( VCM_PHASE_2 != iv_tdState ) + { + PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); + o_rc = FAIL; break; + } + + io_sc.service_data->SetErrorSig( PRDFSIG_VcmVerified ); + + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + + if ( areDramRepairsDisabled() ) + { + iv_tdState = NO_OP; // The TD procedure is complete. + + io_sc.service_data->SetServiceCall(); + + break; // nothing else to do. + } + + bool startDsdProcedure = false; + + // Read VPD. 
+ CenDqBitmap bitmap; + o_rc = getBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" ); + break; + } + + // The chip mark is considered verified, so set it in VPD. + // NOTE: If this chip mark was placed on the spare, the original failing + // DRAM will have already been set in VPD so this will be + // redundant but it simplifies the rest of the logic below. + o_rc = bitmap.setDram( iv_mark.getCM().getSymbol() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setDram() failed" ); + break; + } + + // RAS callout policies can be determined by the DIMM type. We can + // assume IS DIMMs are on low end systems and Centaur DIMMs are on + // mid/high end systems. + bool isCenDimm = false; + o_rc = isMembufOnDimm( mba, isCenDimm ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"isMembufOnDimm() failed" ); + break; + } + + if ( isCenDimm ) // Medium/high end systems + { + uint8_t ps = iv_mark.getCM().getPortSlct(); + + // It is possible that a Centaur DIMM does not have spare DRAMs. + // Check the VPD for available spares. Note that a x4 DIMM may have + // one or two spare DRAMs so check for availability on both. + // TODO: RTC 68096 Add support for x4 DRAMs. + bool dramSparePossible = false; + o_rc = bitmap.isDramSpareAvailable( ps, dramSparePossible ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"isDramSpareAvailable() failed" ); + break; + } + + if ( dramSparePossible ) + { + // Verify the spare is not already used. + CenSymbol sp0, sp1, ecc; + // TODO: RTC 68096 need to support ECC spare. + o_rc = mssGetSteerMux( mba, iv_rank, sp0, sp1, ecc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed" ); + break; + } + + if ( ((0 == ps) && !sp0.isValid()) || + ((1 == ps) && !sp1.isValid()) ) + { + // A spare DRAM is available. + startDsdProcedure = true; + } + else if ( iv_mark.getCM().getDram() == + (0 == ps ? 
sp0.getDram() : sp1.getDram()) ) + { + io_sc.service_data->SetErrorSig( PRDFSIG_VcmBadSpare ); + + // The chip mark was on the spare DRAM and it is bad, so + // call it out and set it in VPD. + + MemoryMru memmru ( mba, iv_rank, iv_mark.getCM() ); + memmru.setDramSpared(); + io_sc.service_data->SetCallout( memmru ); + io_sc.service_data->SetServiceCall(); + + o_rc = bitmap.setDramSpare( ps ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setDramSpare() failed" ); + break; + } + } + else + { + // Chip mark and DRAM spare are both used. + io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); + io_sc.service_data->SetServiceCall(); + } + } + else + { + // Chip mark is in place and sparing is not possible. + io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); + io_sc.service_data->SetServiceCall(); + } + } + else // Low end systems + { + // Not able to do dram sparing. If there is a symbol mark, there are + // no repairs available so call it out and set the error log to + // predictive. + if ( iv_mark.getSM().isValid() ) + { + io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail ); + io_sc.service_data->SetServiceCall(); + } + } + + // Write VPD. + o_rc = setBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" ); + break; + } + + // Start DSD Phase 1, if possible. + if ( startDsdProcedure ) + { + o_rc = startDsdPhase1( io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"startDsdPhase1() failed" ); + break; + } + } + else + { + iv_tdState = NO_OP; // The TD procedure is complete. 
+ } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +int32_t CenMbaTdCtlrCommon::handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_DSD2] " + + int32_t o_rc = SUCCESS; + + io_sc.service_data->SetErrorSig( PRDFSIG_DsdBadSpare ); + io_sc.service_data->SetServiceCall(); + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + do + { + if ( DSD_PHASE_2 != iv_tdState ) + { + PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" ); + o_rc = FAIL; break; + } + + // Callout mark and spare DRAM. + CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc ); + + MemoryMru memmru ( mba, iv_rank, iv_mark.getCM() ); + memmru.setDramSpared(); + io_sc.service_data->SetCallout( memmru ); + + // The spare DRAM is bad, so set it in VPD. At this point, the chip mark + // should have already been set in the VPD because it was recently + // verified. + + CenDqBitmap bitmap; + o_rc = getBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" ); + break; + } + + o_rc = bitmap.setDramSpare( iv_mark.getCM().getPortSlct() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setDramSpare() failed" ); + break; + } + + o_rc = setBadDqBitmap( mba, iv_rank, bitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" ); + break; + } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +void CenMbaTdCtlrCommon::badPathErrorHandling( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[CenMbaTdCtlrCommon::badPathErrorHandling] " + + TargetHandle_t mba = iv_mbaChip->GetChipHandle(); + + PRDF_ERR( PRDF_FUNC"iv_mbaChip:0x%08x iv_initialized:%c iv_tdState:%d " + "iv_rank:M%dS%d iv_mark:%2d %2d", getHuid(mba), + iv_initialized ? 
'T' : 'F', iv_tdState, iv_rank.getMaster(), + iv_rank.getSlave(), iv_mark.getCM().getSymbol(), + iv_mark.getSM().getSymbol() ); + + iv_tdState = NO_OP; + + int32_t l_rc = cleanupPrevCmd(); // Just in case. + if ( SUCCESS != l_rc ) + PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" ); + + io_sc.service_data->SetErrorSig( PRDFSIG_MaintCmdComplete_ERROR ); + io_sc.service_data->SetServiceCall(); + + // There may have been a code bug, callout 2nd level support. + io_sc.service_data->SetCallout( NextLevelSupport_ENUM, MRU_HIGH ); + + // Callout the rank if no other callouts have been made (besides 2nd + // Level Support). Note that iv_mark is not always guaranteed to be + // valid for every error scenario. For simplicity, callout the rank that + // was targeted with low priority. + if ( 1 == io_sc.service_data->GetMruList().size() ) + { + MemoryMru memmru ( mba, iv_rank, MemoryMruData::CALLOUT_RANK ); + io_sc.service_data->SetCallout( memmru, MRU_LOW ); + } + + // Just in case it was a legitimate maintenance command complete (error + // log not committed) but something else failed. + io_sc.service_data->ClearFlag(ServiceDataCollector::DONT_COMMIT_ERRL); + + #undef PRDF_FUNC } } // end namespace PRDF |