summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C
diff options
context:
space:
mode:
authorZane Shelley <zshelle@us.ibm.com>2013-10-30 17:29:34 -0500
committerA. Patrick Williams III <iawillia@us.ibm.com>2013-11-13 10:21:53 -0600
commit4e0d9f538d8c37dd8965e9a5744a78deda359714 (patch)
tree126b2c3612f4e0d5053486201ca0fe901af95bcb /src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C
parent1ad8af50954d5eb8785a2dd0803db4245c01f396 (diff)
downloadtalos-hostboot-4e0d9f538d8c37dd8965e9a5744a78deda359714.tar.gz
talos-hostboot-4e0d9f538d8c37dd8965e9a5744a78deda359714.zip
PRD: bad path in RT/IPL TD controllers
Also moved handleMCE_VCM2(), handleMCE_DSD2(), checkEccErrors(), and prepareNextCmd() from Hostboot only code to the common TD controller code. No changes were made to these functions, simply preparing for future code. Change-Id: Id0c46f6963f66b22d603b7345d95b323b5c4b02b Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/6993 Tested-by: Jenkins Server Reviewed-by: Christopher T. Phan <cphan@us.ibm.com> Reviewed-by: Prem Shanker Jha <premjha2@in.ibm.com> Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com> Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com> Reviewed-by: Zane Shelley <zshelle@us.ibm.com> Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/7211
Diffstat (limited to 'src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C')
-rw-r--r--src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C462
1 files changed, 460 insertions, 2 deletions
diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C
index 65628e581..b6bc34e73 100644
--- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C
+++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C
@@ -23,6 +23,14 @@
#include <prdfCenMbaTdCtlr_common.H>
+// Framework includes
+#include <prdfRegisterCache.H>
+
+// Pegasus includes
+#include <prdfCalloutUtil.H>
+#include <prdfCenDqBitmap.H>
+#include <prdfCenMbaDataBundle.H>
+
using namespace TARGETING;
namespace PRDF
@@ -32,6 +40,13 @@ using namespace PlatServices;
//------------------------------------------------------------------------------
+bool CenMbaTdCtlrCommon::isInTdMode()
+{
+    // The controller is in TD mode when the state machine is in any state
+    // other than NO_OP that is still below MAX_TD_STATE.
+    const bool noProcedure = ( NO_OP == iv_tdState );
+    const bool outOfRange  = !( MAX_TD_STATE > iv_tdState );
+
+    return !( noProcedure || outOfRange );
+}
+
+//------------------------------------------------------------------------------
+
int32_t CenMbaTdCtlrCommon::cleanupPrevCmd()
{
#define PRDF_FUNC "[CenMbaTdCtlrCommon::cleanupPrevCmd] "
@@ -71,6 +86,107 @@ int32_t CenMbaTdCtlrCommon::cleanupPrevCmd()
//------------------------------------------------------------------------------
+int32_t CenMbaTdCtlrCommon::prepareNextCmd()
+{
+    #define PRDF_FUNC "[CenMbaTdCtlrCommon::prepareNextCmd] "
+
+    // Gets the hardware ready for the next command. In order: cleans up the
+    // previous command, clears the ECC counters, clears the ECC FIR bits for
+    // this rank, and clears the threshold exceeded attentions in MBASPA.
+    // Returns SUCCESS, or a non-SUCCESS code from the first step that failed.
+
+    ExtensibleChip * membChip = getMbaDataBundle( iv_mbaChip )->getMembChip();
+    if ( NULL == membChip )
+    {
+        PRDF_ERR( PRDF_FUNC"getMembChip() failed" );
+        return FAIL;
+    }
+
+    // Register names on the membuf are selected by the position of this MBA.
+    const uint32_t position = getTargetPosition( iv_mbaChip->GetChipHandle() );
+    const bool     isMba0   = ( 0 == position );
+
+    //--------------------------------------------------------------------------
+    // Previous command
+    //--------------------------------------------------------------------------
+
+    int32_t rc = cleanupPrevCmd();
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" );
+        return rc;
+    }
+
+    //--------------------------------------------------------------------------
+    // ECC counters
+    //--------------------------------------------------------------------------
+
+    const char * mbstrName = isMba0 ? "MBA0_MBSTR" : "MBA1_MBSTR";
+    SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( mbstrName );
+
+    rc = mbstr->Read();
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"Read() failed on %s", mbstrName );
+        return rc;
+    }
+
+    // Bit 53 is the "clear all counters" control.
+    mbstr->SetBit(53);
+
+    rc = mbstr->Write();
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"Write() failed on %s", mbstrName );
+        return rc;
+    }
+
+    // The hardware resets bit 53 by itself, which leaves the cached copy of
+    // this register stale. Drop it from the register cache so that a later
+    // write of the cached value cannot clear the counters a second time.
+    RegDataCache::getCachedRegisters().flush( membChip, mbstr );
+
+    //--------------------------------------------------------------------------
+    // ECC FIRs
+    //--------------------------------------------------------------------------
+
+    const char * eccFirAndName = isMba0 ? "MBA0_MBSECCFIR_AND"
+                                        : "MBA1_MBSECCFIR_AND";
+    SCAN_COMM_REGISTER_CLASS * eccFirAnd =
+                                    membChip->getRegister( eccFirAndName );
+
+    // Build the AND mask: every bit set, with zeros only at the positions
+    // that are to be cleared.
+    eccFirAnd->setAllBits();
+    eccFirAnd->ClearBit( 20 + iv_rank.getMaster() );  // MPE bit for this rank
+    eccFirAnd->SetBitFieldJustified( 36, 6, 0 );      // NCE, SCE, MCE, RCE,
+                                                      // SUE, UE (bits 36-41)
+
+    rc = eccFirAnd->Write();
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"Write() failed on %s", eccFirAndName );
+        return rc;
+    }
+
+    //--------------------------------------------------------------------------
+    // Threshold exceeded attentions
+    //--------------------------------------------------------------------------
+
+    SCAN_COMM_REGISTER_CLASS * mbaSpaAnd =
+                                    iv_mbaChip->getRegister("MBASPA_AND");
+    mbaSpaAnd->setAllBits();
+    mbaSpaAnd->SetBitFieldJustified( 1, 4, 0 );  // bits 1-4
+
+    if ( SUCCESS != mbaSpaAnd->Write() )
+    {
+        PRDF_ERR( PRDF_FUNC"Write() failed on MBASPA_AND" );
+        return FAIL;
+    }
+
+    return SUCCESS;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
int32_t CenMbaTdCtlrCommon::chipMarkCleanup()
{
#define PRDF_FUNC "[CenMbaTdCtlrCommon::chipMarkCleanup] "
@@ -102,9 +218,351 @@ int32_t CenMbaTdCtlrCommon::chipMarkCleanup()
//------------------------------------------------------------------------------
-bool CenMbaTdCtlrCommon::isInTdMode()
+int32_t CenMbaTdCtlrCommon::checkEccErrors( uint16_t & o_eccErrorMask )
{
- return ( (NO_OP != iv_tdState) && (MAX_TD_STATE > iv_tdState) );
+    #define PRDF_FUNC "[CenMbaTdCtlrCommon::checkEccErrors] "
+
+    // Reads the ECC FIR for this MBA and the MBASPA register and reports, as
+    // a bit mask, which error conditions are currently set.
+    //
+    // o_eccErrorMask: reset to NO_ERROR on entry, then OR'ed with MPE, MCE,
+    //                 UE, HARD_CTE, SOFT_CTE, INTER_CTE and RETRY_CTE as the
+    //                 matching bits are found. On a failing return the mask
+    //                 holds only what was collected before the failure.
+    // Returns SUCCESS, or a non-SUCCESS code if a register read or the chip
+    // mark cleanup failed.
+
+    int32_t o_rc = SUCCESS;
+
+    o_eccErrorMask = NO_ERROR;
+
+    TargetHandle_t mba = iv_mbaChip->GetChipHandle();
+
+    do
+    {
+        CenMbaDataBundle * mbadb = getMbaDataBundle( iv_mbaChip );
+        ExtensibleChip * membChip = mbadb->getMembChip();
+        if ( NULL == membChip )
+        {
+            PRDF_ERR( PRDF_FUNC"getMembChip() failed: MBA=0x%08x",
+                      getHuid(mba) );
+            o_rc = FAIL; break;
+        }
+
+        // The ECC FIR lives on the membuf; its name depends on the position
+        // of this MBA.
+        const char * reg_str = ( 0 == getTargetPosition(mba) )
+                                    ? "MBA0_MBSECCFIR" : "MBA1_MBSECCFIR";
+        SCAN_COMM_REGISTER_CLASS * mbsEccFir = membChip->getRegister( reg_str );
+
+        o_rc = mbsEccFir->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str );
+            break;
+        }
+
+        // The MPE bits are per master rank, starting at bit 20.
+        if ( mbsEccFir->IsBitSet(20 + iv_rank.getMaster()) )
+        {
+            o_eccErrorMask |= MPE;
+
+            // Clean up side-effect FIRs that may be set due to the chip mark.
+            o_rc = chipMarkCleanup();
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC"chipMarkCleanup() failed" );
+                break;
+            }
+        }
+
+        if ( mbsEccFir->IsBitSet(38) ) o_eccErrorMask |= MCE;
+        if ( mbsEccFir->IsBitSet(41) ) o_eccErrorMask |= UE;
+
+        // The threshold exceeded attentions are reported in MBASPA on the
+        // MBA chip itself.
+        SCAN_COMM_REGISTER_CLASS * mbaSpaFir =
+                                        iv_mbaChip->getRegister("MBASPA");
+        o_rc = mbaSpaFir->Read();
+        if ( SUCCESS != o_rc )
+        {
+            // Spelling of "Register" corrected so that log searches find it.
+            PRDF_ERR( PRDF_FUNC"Failed to read MBASPA Register");
+            break;
+        }
+
+        if ( mbaSpaFir->IsBitSet(1) ) o_eccErrorMask |= HARD_CTE;
+        if ( mbaSpaFir->IsBitSet(2) ) o_eccErrorMask |= SOFT_CTE;
+        if ( mbaSpaFir->IsBitSet(3) ) o_eccErrorMask |= INTER_CTE;
+        if ( mbaSpaFir->IsBitSet(4) ) o_eccErrorMask |= RETRY_CTE;
+
+    } while(0);
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_VCM2] "
+
+    // Handles an MCE while the state machine is in VCM_PHASE_2. The chip mark
+    // is treated as verified: it is called out and recorded in VPD, and then
+    // either DSD phase 1 is started (when an unused spare DRAM exists) or the
+    // TD procedure is ended by returning the state machine to NO_OP.
+    // Returns SUCCESS, or a non-SUCCESS code from the first step that failed.
+
+    TargetHandle_t mba = iv_mbaChip->GetChipHandle();
+
+    if ( VCM_PHASE_2 != iv_tdState )
+    {
+        PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" );
+        return FAIL;
+    }
+
+    io_sc.service_data->SetErrorSig( PRDFSIG_VcmVerified );
+    CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc );
+
+    if ( areDramRepairsDisabled() )
+    {
+        // No repairs will be attempted, so the procedure ends here with a
+        // service call.
+        iv_tdState = NO_OP;
+        io_sc.service_data->SetServiceCall();
+        return SUCCESS;
+    }
+
+    //--------------------------------------------------------------------------
+    // Read VPD and record the verified chip mark in it.
+    //--------------------------------------------------------------------------
+
+    CenDqBitmap vpdBitmap;
+    int32_t rc = getBadDqBitmap( mba, iv_rank, vpdBitmap );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" );
+        return rc;
+    }
+
+    // NOTE: If this chip mark was placed on the spare, the original failing
+    //       DRAM is already set in VPD, which makes this call redundant.
+    //       Doing it unconditionally keeps the logic below simpler.
+    rc = vpdBitmap.setDram( iv_mark.getCM().getSymbol() );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"setDram() failed" );
+        return rc;
+    }
+
+    //--------------------------------------------------------------------------
+    // Decide on the repair/callout policy.
+    //--------------------------------------------------------------------------
+
+    // The policy depends on the DIMM type: IS DIMMs are assumed to be on low
+    // end systems and Centaur DIMMs on mid/high end systems.
+    bool onCenDimm = false;
+    rc = isMembufOnDimm( mba, onCenDimm );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"isMembufOnDimm() failed" );
+        return rc;
+    }
+
+    bool beginDsd = false;
+
+    if ( !onCenDimm )
+    {
+        // Low end systems: DRAM sparing is not an option. If a symbol mark is
+        // also in use there are no repairs left, so make the error log
+        // predictive.
+        if ( iv_mark.getSM().isValid() )
+        {
+            io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail );
+            io_sc.service_data->SetServiceCall();
+        }
+    }
+    else
+    {
+        // Medium/high end systems.
+        uint8_t portSlct = iv_mark.getCM().getPortSlct();
+
+        // A Centaur DIMM is not guaranteed to have spare DRAMs, so ask VPD.
+        // Note that a x4 DIMM may have one or two spare DRAMs, so both would
+        // need to be checked.
+        // TODO: RTC 68096 Add support for x4 DRAMs.
+        bool spareInVpd = false;
+        rc = vpdBitmap.isDramSpareAvailable( portSlct, spareInVpd );
+        if ( SUCCESS != rc )
+        {
+            PRDF_ERR( PRDF_FUNC"isDramSpareAvailable() failed" );
+            return rc;
+        }
+
+        if ( !spareInVpd )
+        {
+            // The chip mark is in place and sparing is not possible.
+            io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail );
+            io_sc.service_data->SetServiceCall();
+        }
+        else
+        {
+            // Find out whether the spare on this port is already steered.
+            // TODO: RTC 68096 need to support ECC spare.
+            CenSymbol sp0, sp1, ecc;
+            rc = mssGetSteerMux( mba, iv_rank, sp0, sp1, ecc );
+            if ( SUCCESS != rc )
+            {
+                PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed" );
+                return rc;
+            }
+
+            const bool spareUnused =
+                            ( (0 == portSlct) && !sp0.isValid() ) ||
+                            ( (1 == portSlct) && !sp1.isValid() );
+
+            if ( spareUnused )
+            {
+                // A spare DRAM is available, so DSD can be attempted.
+                beginDsd = true;
+            }
+            else
+            {
+                CenSymbol & steered = ( 0 == portSlct ) ? sp0 : sp1;
+
+                if ( iv_mark.getCM().getDram() == steered.getDram() )
+                {
+                    // The chip mark sits on the spare DRAM, which means the
+                    // spare itself is bad: call it out and flag it in VPD.
+                    io_sc.service_data->SetErrorSig( PRDFSIG_VcmBadSpare );
+
+                    MemoryMru spareMru ( mba, iv_rank, iv_mark.getCM() );
+                    spareMru.setDramSpared();
+                    io_sc.service_data->SetCallout( spareMru );
+                    io_sc.service_data->SetServiceCall();
+
+                    rc = vpdBitmap.setDramSpare( portSlct );
+                    if ( SUCCESS != rc )
+                    {
+                        PRDF_ERR( PRDF_FUNC"setDramSpare() failed" );
+                        return rc;
+                    }
+                }
+                else
+                {
+                    // Both the chip mark and the DRAM spare are in use.
+                    io_sc.service_data->SetErrorSig( PRDFSIG_VcmMarksUnavail );
+                    io_sc.service_data->SetServiceCall();
+                }
+            }
+        }
+    }
+
+    //--------------------------------------------------------------------------
+    // Write VPD, then either move on to DSD or finish.
+    //--------------------------------------------------------------------------
+
+    rc = setBadDqBitmap( mba, iv_rank, vpdBitmap );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" );
+        return rc;
+    }
+
+    if ( !beginDsd )
+    {
+        iv_tdState = NO_OP; // The TD procedure is complete.
+        return rc;
+    }
+
+    rc = startDsdPhase1( io_sc );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"startDsdPhase1() failed" );
+    }
+
+    return rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+int32_t CenMbaTdCtlrCommon::handleMCE_DSD2( STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[CenMbaTdCtlrCommon::handleMCE_DSD2] "
+
+    // Handles an MCE while the state machine is in DSD_PHASE_2. The spare
+    // DRAM is treated as bad: the mark and the spare are called out and the
+    // spare is flagged in VPD.
+    // Returns SUCCESS, or a non-SUCCESS code from the first step that failed.
+
+    // The signature and the service call are set before anything else, so
+    // they are in place even if a later step fails.
+    io_sc.service_data->SetErrorSig( PRDFSIG_DsdBadSpare );
+    io_sc.service_data->SetServiceCall();
+
+    TargetHandle_t mba = iv_mbaChip->GetChipHandle();
+
+    if ( DSD_PHASE_2 != iv_tdState )
+    {
+        PRDF_ERR( PRDF_FUNC"Invalid state machine configuration" );
+        return FAIL;
+    }
+
+    // Call out the mark, then the spare DRAM.
+    CalloutUtil::calloutMark( mba, iv_rank, iv_mark, io_sc );
+
+    MemoryMru spareMru ( mba, iv_rank, iv_mark.getCM() );
+    spareMru.setDramSpared();
+    io_sc.service_data->SetCallout( spareMru );
+
+    // Flag the bad spare in VPD. The chip mark itself should already be in
+    // VPD at this point because it was recently verified.
+    CenDqBitmap vpdBitmap;
+    int32_t rc = getBadDqBitmap( mba, iv_rank, vpdBitmap );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed" );
+        return rc;
+    }
+
+    rc = vpdBitmap.setDramSpare( iv_mark.getCM().getPortSlct() );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"setDramSpare() failed" );
+        return rc;
+    }
+
+    rc = setBadDqBitmap( mba, iv_rank, vpdBitmap );
+    if ( SUCCESS != rc )
+    {
+        PRDF_ERR( PRDF_FUNC"setBadDqBitmap() failed" );
+    }
+
+    return rc;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
+void CenMbaTdCtlrCommon::badPathErrorHandling( STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[CenMbaTdCtlrCommon::badPathErrorHandling] "
+
+    // Common recovery for a failed TD step: trace the controller state,
+    // return the state machine to NO_OP, and make sure a committed error log
+    // with a service call and callouts is produced.
+
+    TargetHandle_t mba = iv_mbaChip->GetChipHandle();
+
+    // Capture the controller state in the trace before it is reset below.
+    PRDF_ERR( PRDF_FUNC"iv_mbaChip:0x%08x iv_initialized:%c iv_tdState:%d "
+              "iv_rank:M%dS%d iv_mark:%2d %2d", getHuid(mba),
+              iv_initialized ? 'T' : 'F', iv_tdState, iv_rank.getMaster(),
+              iv_rank.getSlave(), iv_mark.getCM().getSymbol(),
+              iv_mark.getSM().getSymbol() );
+
+    iv_tdState = NO_OP;
+
+    // Just in case a command is still outstanding. A failure here is only
+    // traced; the rest of the handling continues regardless.
+    int32_t cleanupRc = cleanupPrevCmd();
+    if ( SUCCESS != cleanupRc )
+    {
+        PRDF_ERR( PRDF_FUNC"cleanupPrevCmd() failed" );
+    }
+
+    io_sc.service_data->SetErrorSig( PRDFSIG_MaintCmdComplete_ERROR );
+    io_sc.service_data->SetServiceCall();
+
+    // This path may have been reached because of a code bug, so 2nd level
+    // support is called out.
+    io_sc.service_data->SetCallout( NextLevelSupport_ENUM, MRU_HIGH );
+
+    // If 2nd level support is the only callout so far, add the targeted rank
+    // at low priority. iv_mark is not guaranteed to be valid in every error
+    // scenario, so for simplicity the rank is used instead of the mark.
+    const bool onlySupportCalledOut =
+                        ( 1 == io_sc.service_data->GetMruList().size() );
+    if ( onlySupportCalledOut )
+    {
+        MemoryMru rankMru ( mba, iv_rank, MemoryMruData::CALLOUT_RANK );
+        io_sc.service_data->SetCallout( rankMru, MRU_LOW );
+    }
+
+    // The attention may have been a legitimate maintenance command complete
+    // (error log not committed) with something else failing afterwards, so
+    // make sure the error log is committed.
+    io_sc.service_data->ClearFlag(ServiceDataCollector::DONT_COMMIT_ERRL);
+
+    #undef PRDF_FUNC
}
} // end namespace PRDF
OpenPOWER on IntegriCloud