diff options
Diffstat (limited to 'src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C')
-rw-r--r-- | src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C | 467 |
1 files changed, 218 insertions, 249 deletions
diff --git a/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C b/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C index cfcf67afa..2f2d88ea5 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C +++ b/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C @@ -34,9 +34,11 @@ #include "common/plat/pegasus/prdfCalloutUtil.H" #include "common/plat/pegasus/prdfCenDqBitmap.H" #include "common/plat/pegasus/prdfCenMarkstore.H" +#include "common/plat/pegasus/prdfCenMbaExtraSig.H" #include "common/plat/pegasus/prdfCenSymbol.H" #include "common/plat/pegasus/prdfMemoryMru.H" #include "framework/service/prdfPlatServices.H" +#include "plat/pegasus/prdfPlatCalloutUtil.H" using namespace HWAS; using namespace std; @@ -45,111 +47,76 @@ using namespace TARGETING; namespace PRDF { -static const uint8_t INVALID_SYMBOL = 0xff; +using namespace PlatServices; -bool validSymbol(uint8_t i_symbol) +namespace RDR // local utility functions to support PRDF::restoreDramRepairs() { - return i_symbol != INVALID_SYMBOL; -} -void commitRestoreCallout( void (*i_func)(errlHndl_t &, void *), void * i_data, - TargetHandle_t i_mba ) +// Creates and returns an error log. +errlHndl_t createErrl( uint32_t i_reasonCode, TargetHandle_t i_mba, + uint32_t i_signature ) { - PRDF_DENTER("commitRestoreCallout"); - - errlHndl_t err = NULL; - - PRDF_HW_CREATE_ERRL( - err, - ERRL_SEV_PREDICTIVE, - ERRL_ETYPE_NOT_APPLICABLE, - SRCI_MACH_CHECK, - SRCI_NO_ATTR, - PRDF_RESTORE_DRAM_REPAIR, - FSP_DEFAULT_REFCODE, - PRDF_DETECTED_FAIL_HARDWARE_PROBABLE, - 0, 0, 0, 0, // user data - HWSV_SYS_NO_TERMINATE, - false); // no pld check - - // add the callout - - (*i_func)(err, i_data); - - bool term = false; - - CenMbaCaptureData::addDramRepairsData( i_mba, err ); + uint64_t userdata12 = PRDF_GET_UINT64_FROM_UINT32( getHuid(i_mba), 0 ); + uint64_t userdata34 = PRDF_GET_UINT64_FROM_UINT32( i_signature, 0 ); + + // Note that the error log tags are not needed because PRD uses its own + // signature parser. + + return new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, // severity + PRDF_RESTORE_DRAM_REPAIR, // module ID + i_reasonCode, // reason code + userdata12, // user data 1 & 2 + userdata34 ); // user data 3 & 4 +} - PRDF_HW_COMMIT_ERRL( - term, - err, - HWSV::HWSV_DECONFIG_DEFER, - ERRL_ACTION_REPORT, - HWSV_CONTINUE); +//------------------------------------------------------------------------------ - if(term) +// If an error log is given, will add DRAM repairs FFDC and traces to error log, +// then commit the error log. +void commitErrl( errlHndl_t i_errl, TargetHandle_t i_mba ) +{ + if ( NULL != i_errl ) { - // FIXME...this is a little goofy. - // Should be scrubbed with RTC 51552 + // Add capture data + CenMbaCaptureData::addMemEccData( i_mba, i_errl ); - PRDF_COMMIT_ERRL(err, ERRL_ACTION_REPORT); + // Add traces + i_errl->collectTrace( PRDF_COMP_NAME, 512 ); + + // Commit the error log + ERRORLOG::errlCommit( i_errl, PRDF_COMP_ID ); } } -void addMemMruCallout(errlHndl_t & io_log, void * i_memMru) -{ - PRDF_DENTER("addMemMruCallout"); +//------------------------------------------------------------------------------ - if ( NULL != i_memMru ) +// If there were analysis errors, will create and commit an error log with 2nd +// level support callout. +void commitSoftError( uint32_t i_reasonCode, TargetHandle_t i_mba, + uint32_t i_signature, bool i_analysisErrors ) +{ + if ( i_analysisErrors ) { - MemoryMru *memMru = static_cast<MemoryMru *>(i_memMru); - - TargetHandleList partList = memMru->getCalloutList(); - for ( TargetHandleList::iterator it = partList.begin(); - it != partList.end(); it++ ) - { - PRDF_HW_ADD_CALLOUT( - *it, - SRCI_PRIORITY_HIGH, - HWSV::HWSV_DECONFIG, - HWSV::HWSV_DECONFIG_GARD, - io_log, - false, // don't write src to vpd - GARD_Predictive, - ERRL_SEV_PREDICTIVE, - false); // don't update hcdb - } + errlHndl_t errl = createErrl( i_reasonCode, i_mba, i_signature ); + errl->addProcedureCallout( EPUB_PRC_LVL_SUPP, SRCI_PRIORITY_HIGH ); + commitErrl( errl, i_mba ); } } -void addDimmCallout(errlHndl_t & io_log, void * i_dimm) -{ - PRDF_DENTER("addDimmCallout"); - - PRDF_HW_ADD_CALLOUT( - static_cast<TargetHandle_t>(i_dimm), - SRCI_PRIORITY_HIGH, - HWSV::HWSV_DECONFIG, - HWSV::HWSV_DECONFIG_GARD, - io_log, - false, // don't write src to vpd - GARD_Predictive, - ERRL_SEV_PREDICTIVE, - false); // don't update hcdb -} +//------------------------------------------------------------------------------ bool processRepairedRanks( TargetHandle_t i_mba, uint8_t i_repairedRankMask ) { - PRDF_DENTER("processRepairedRanks: %p, 0x%02x", - i_mba, i_repairedRankMask); + #define PRDF_FUNC "[processRepairedRanks] " - // check the argument ranks for repairs - // that violate RAS policy + // The bits in i_repairedRankMask represent ranks that have repairs. Query + // hardware and compare against RAS policies. - bool calloutMade = false; + bool o_calloutMade = false; + bool analysisErrors = false; - // check each rank for repairs - // that violate RAS policy + errlHndl_t errl = NULL; // Initially NULL, will create if needed. for ( uint8_t r = 0; r < MAX_RANKS_PER_MBA; ++r ) { @@ -161,15 +128,21 @@ bool processRepairedRanks( TargetHandle_t i_mba, uint8_t i_repairedRankMask ) CenRank rank ( r ); CenMark mark; - if ( SUCCESS != PlatServices::mssGetMarkStore(i_mba, rank, mark) ) + if ( SUCCESS != mssGetMarkStore(i_mba, rank, mark) ) { + PRDF_ERR( PRDF_FUNC"mssGetMarkStore() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), rank.flatten() ); + analysisErrors = true; continue; // skip this rank } CenSymbol sp0, sp1, sp; - if ( SUCCESS != PlatServices::mssGetSteerMux(i_mba, rank, sp0, sp1, sp)) + if ( SUCCESS != mssGetSteerMux(i_mba, rank, sp0, sp1, sp)) { + PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), rank.flatten() ); + analysisErrors = true; continue; // skip this rank } @@ -179,112 +152,110 @@ bool processRepairedRanks( TargetHandle_t i_mba, uint8_t i_repairedRankMask ) // This rank has both a steer and a chip mark. Call out the DIMM // with the chip mark. - MemoryMru memoryMru( i_mba, rank, mark.getCM() ); - - commitRestoreCallout( &addMemMruCallout, &memoryMru, i_mba ); + if ( NULL == errl ) + { + errl = createErrl( PRDF_DETECTED_FAIL_HARDWARE, i_mba, + PRDFSIG_RdrRepairsUsed ); + } - calloutMade = true; + MemoryMru memoryMru( i_mba, rank, mark.getCM() ); + CalloutUtil::calloutMemoryMru( errl, memoryMru, + SRCI_PRIORITY_HIGH, + HWAS::DELAYED_DECONFIG, + HWAS::GARD_Predictive ); + o_calloutMade = true; } } - PRDF_DEXIT("processRepairedRanks"); + // Commit the error log, if needed. + commitErrl( errl, i_mba ); - return calloutMade; -} + // Commit an additional error log indicating something failed in the + // analysis, if needed. + commitSoftError( PRDF_DETECTED_FAIL_SOFTWARE, i_mba, + PRDFSIG_RdrInternalFail, analysisErrors ); -bool processBadDimms(TargetHandle_t i_mba, uint8_t i_badDimmMask) -{ - PRDF_DENTER("processBadDimms: %p, 0x%02x", i_mba, i_badDimmMask); + return o_calloutMade; - const struct DimmPortAssoc - { - uint8_t port; - uint8_t dimm; - uint8_t enc; - - } dimmPortAssoc[] = { - - {0, 0, 0x8}, - {0, 1, 0x4}, - {1, 0, 0x2}, - {1, 1, 0x1}, - }; - - uint64_t calloutCount = 0; + #undef PRDF_FUNC +} - // callout the argument dimms +//------------------------------------------------------------------------------ - // get all the dimms connected to this MBA +bool processBadDimms( TargetHandle_t i_mba, uint8_t i_badDimmMask ) +{ + #define PRDF_FUNC "[processBadDimms] " - TARGETING::TargetHandleList dimms = PlatServices::getConnected( - i_mba, TARGETING::TYPE_DIMM); + // The bits in i_badDimmMask represent DIMMs that have exceeded the + // available repairs. Callout these DIMMs. - // convert the encoded dimms that had too many repairs to - // dimm targets + bool o_calloutMade = false; + bool analysisErrors = false; - TargetHandleList::iterator dit = dimms.end(); + errlHndl_t errl = NULL; // Initially NULL, will create if needed. - while(dit-- != dimms.begin()) + // Iterate the list of all DIMMs be + TargetHandleList dimms = getConnected( i_mba, TYPE_DIMM ); + for ( TargetHandleList::iterator i = dimms.begin(); i < dimms.end(); i++ ) { uint8_t port = 0, dimm = 0; - if(SUCCESS != PlatServices::getMbaPort(*dit, port)) + if ( SUCCESS != getMbaPort(*i, port) ) { - // skip this dimm - continue; + PRDF_ERR( PRDF_FUNC"getMbaPort() failed: DIMM=0x%08x", getHuid(*i)); + analysisErrors = true; + continue; // skip this dimm } - if(SUCCESS != PlatServices::getMbaDimm(*dit, dimm)) + if ( SUCCESS != getMbaDimm(*i, dimm) ) { - // skip this dimm - continue; + PRDF_ERR( PRDF_FUNC"getMbaDimm() failed: DIMM=0x%08x", getHuid(*i)); + analysisErrors = true; + continue; // skip this dimm } - // see if the passed in dimm - // was flagged as bad by the restore procedure - - bool match = false; + // The 4 bits of i_badDimmMask is defined as p0d0, p0d1, p1d0, and p1d1. + uint8_t mask = 0x8 >> (port * PORT_SLCT_PER_MBA + dimm); - const DimmPortAssoc * it = dimmPortAssoc - + sizeof(dimmPortAssoc)/sizeof(*dimmPortAssoc); - - while(!match && it-- != dimmPortAssoc) + if ( 0 != (i_badDimmMask & mask) ) { - if(i_badDimmMask & it->enc - && port == it->port - && dimm == it->dimm) + if ( NULL == errl ) { - // this dimm is a match - - match = true; + errl = createErrl( PRDF_DETECTED_FAIL_HARDWARE, i_mba, + PRDFSIG_RdrRepairUnavail ); } - } - // call them out - - if(match) - { - ++calloutCount; - commitRestoreCallout( &addDimmCallout, *dit, i_mba ); + o_calloutMade = true; + errl->addHwCallout( *i, SRCI_PRIORITY_HIGH, HWAS::DELAYED_DECONFIG, + HWAS::GARD_Predictive ); } } - PRDF_DEXIT("processBadDimms: bad dimm count: %d", calloutCount); + // Commit the error log, if needed. + commitErrl( errl, i_mba ); - return 0 != calloutCount; + // Commit an additional error log indicating something failed in the + // analysis, if needed. + commitSoftError( PRDF_DETECTED_FAIL_SOFTWARE, i_mba, + PRDFSIG_RdrInternalFail, analysisErrors ); + + return o_calloutMade; + + #undef PRDF_FUNC } -bool processDq(TargetHandle_t i_mba) +//------------------------------------------------------------------------------ + +bool screenBadDqs( TargetHandle_t i_mba ) { - using namespace TARGETING; - using namespace PlatServices; + #define PRDF_FUNC "[screenBadDqs] " - PRDF_DENTER("processDq: %p", i_mba); + // Callout any attached DIMMs that have any bad DQs. - // callout any dimms on the argument MBA - // that have any bad dq + bool o_calloutMade = false; + bool analysisErrors = false; - uint64_t calloutCount = 0; + errlHndl_t errl = NULL; // Initially NULL, will create if needed. for ( uint32_t r = 0; r < MAX_RANKS_PER_MBA; r++ ) { @@ -293,6 +264,9 @@ bool processDq(TargetHandle_t i_mba) if ( SUCCESS != getBadDqBitmap(i_mba, rank, bitmap, true) ) { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), rank.flatten() ); + analysisErrors = true; continue; // skip this rank } @@ -301,175 +275,170 @@ bool processDq(TargetHandle_t i_mba) bool badDqs = false; if ( SUCCESS != bitmap.badDqs(p, badDqs) ) { + PRDF_ERR( PRDF_FUNC"badDqs() failed: MBA=0x%08x rank=%d " + "port=%d", getHuid(i_mba), rank.flatten(), p ); + analysisErrors = true; continue; // skip this DIMM } if ( !badDqs ) { - continue; // skip this DIMM + continue; // nothing to do, skip this DIMM } TargetHandleList list = CalloutUtil::getConnectedDimms( i_mba, rank, p ); if ( 0 == list.size() ) { - PRDF_ERR( "[processDq] bad bits present but no connected " - "DIMM: MBA=0x%08x rank=%d port=%d", getHuid(i_mba), + PRDF_ERR( PRDF_FUNC"bad bits present but no connected DIMM: " + "MBA=0x%08x rank=%d port=%d", getHuid(i_mba), rank.flatten(), p ); - continue; + analysisErrors = true; + continue; // skip this DIMM } for ( TargetHandleList::iterator i = list.begin(); i < list.end(); i++ ) { - ++calloutCount; - commitRestoreCallout( &addDimmCallout, *i, i_mba ); + if ( NULL == errl ) + { + errl = createErrl( PRDF_DETECTED_FAIL_HARDWARE, i_mba, + PRDFSIG_RdrScreenBadDqs ); + } + + o_calloutMade = true; + errl->addHwCallout( *i, SRCI_PRIORITY_HIGH, + HWAS::DELAYED_DECONFIG, + HWAS::GARD_Predictive ); } } } - PRDF_DEXIT("processDq: bad dq dimm count: %d", calloutCount); + // Commit the error log, if needed. + commitErrl( errl, i_mba ); + + // Commit an additional error log indicating something failed in the + // analysis, if needed. + commitSoftError( PRDF_DETECTED_FAIL_SOFTWARE, i_mba, + PRDFSIG_RdrInternalFail, analysisErrors ); + + return o_calloutMade; - return 0 != calloutCount; + #undef PRDF_FUNC } -void deployDramSpares(TargetHandle_t i_mba) -{ - using namespace fapi; +//------------------------------------------------------------------------------ - bool x4 = PlatServices::isDramWidthX4(i_mba); +void deployDramSpares( TargetHandle_t i_mba ) +{ + bool x4 = isDramWidthX4(i_mba); for ( uint32_t r = 0; r < MAX_RANKS_PER_MBA; r++ ) { CenRank rank ( r ); - CenSymbol symbol = CenSymbol::fromSymbol( i_mba, rank, 0 ); - // ignore errors from putSteerMux + // Doesn't matter which DRAM is spared as long as they are all spared. + // Also, make sure the ECC spare is on a different DRAM than the spare + // DRAM. + CenSymbol symPort0 = CenSymbol::fromDimmDq( i_mba, rank, 0, 0 ); + CenSymbol symPort1 = CenSymbol::fromDimmDq( i_mba, rank, 0, 1 ); + CenSymbol symEccSp = CenSymbol::fromDimmDq( i_mba, rank, 8, 0 ); + + int32_t l_rc = SUCCESS; - static_cast<void>( - PlatServices::mssSetSteerMux(i_mba, rank, symbol, false) ); + l_rc = mssSetSteerMux( i_mba, rank, symPort0, false ); + l_rc |= mssSetSteerMux( i_mba, rank, symPort1, false ); - if( x4 ) + if ( x4 ) + l_rc |= mssSetSteerMux( i_mba, rank, symEccSp, true ); + + if ( SUCCESS != l_rc ) { - static_cast<void>( - PlatServices::mssSetSteerMux(i_mba, rank, symbol, true) ); + // mssSetSteerMux() will print a trace and commit the error log, + // however, we need to handle the return code or we get a compile + // warning in Hostboot. + continue; } } } +} // end namespace RDR + //------------------------------------------------------------------------------ // External functions - declared in prdfMain.H //------------------------------------------------------------------------------ int32_t restoreDramRepairs( TargetHandle_t i_mba ) { - PRDF_ENTER( "restoreDramRepairs(0x%08x)", PlatServices::getHuid(i_mba) ); - - bool calloutMade = false; + #define PRDF_FUNC "PRDF::restoreDramRepairs" - uint8_t repairedRankMask = 0, badDimmMask = 0; + PRDF_ENTER( PRDF_FUNC"(0x%08x)", getHuid(i_mba) ); - do { + bool calloutMade = false; - if(PlatServices::isMemoryPreservingIpl()) + do + { + if ( isMemoryPreservingIpl() ) { - // nothing to do in MPIPL - + // Power is preserved on a Centaur for a MPIPL. So the marks and + // spares will not need to be restored. break; } - bool spareDramDeploy = PlatServices::mnfgSpareDramDeploy(); + bool spareDramDeploy = mnfgSpareDramDeploy(); - if(spareDramDeploy) + if ( spareDramDeploy ) { - deployDramSpares(i_mba); + // Deploy all spares for MNFG corner tests. + RDR::deployDramSpares(i_mba); } - // in mfg mode, check dq and don't restore anything - - if(PlatServices::areDramRepairsDisabled()) + if ( areDramRepairsDisabled() ) { - if(processDq(i_mba)) - { - calloutMade = true; - } + // DRAM Repairs are disabled in MNFG mode, so screen all DIMMs with + // VPD information. + if ( RDR::screenBadDqs(i_mba) ) calloutMade = true; + // No need to continue because there will not be anything to + // restore. break; } - if(spareDramDeploy) + if ( spareDramDeploy ) { - // this is an error...the spare dram - // deploy bit was set but we weren't - // in mfg mode...log an error for MFG - - errlHndl_t err = NULL; - - PRDF_ERR( "[restoreDramRepairs] " - "The specified combination of mfg policy flags is invalid"); - - /*@ - * @errortype - * @reasoncode PRDF_INVALID_CONFIG - * @subsys EPUB_FIRMWARE_SUBSYS - * @moduleid PRDF_RESTORE_DRAM_REPAIR - * @devdesc The specified combination of policy flags is invalid. - */ - PRDF_CREATE_ERRL( - err, - ERRL_SEV_PREDICTIVE, - ERRL_ETYPE_NOT_APPLICABLE, - SRCI_MACH_CHECK, - SRCI_NO_ATTR, - PRDF_RESTORE_DRAM_REPAIR, - FSP_DEFAULT_REFCODE, - PRDF_INVALID_CONFIG, - 0, 0, 0, 0); - PRDF_COMMIT_ERRL(err, ERRL_ACTION_REPORT); - - // assume mfg mode (no repairs) ... + // This is an error. The MNFG spare DRAM deply bit is set, but DRAM + // Repairs have not been disabled. - break; - } - - if(SUCCESS != PlatServices::mssRestoreDramRepairs( - i_mba, - repairedRankMask, - badDimmMask)) - { - // can't check anything if - // this doesn't work + PRDF_ERR( "["PRDF_FUNC"] MNFG spare deploy enabled, but DRAM " + "repairs are not disabled" ); - PRDF_ERR( "[restoreDramRepairs] " - "PlatServices::mssRestoreDramRepairs failed" ); + RDR::commitSoftError( PRDF_INVALID_CONFIG, i_mba, + PRDFSIG_RdrInvalidConfig, true ); - break; + break; // Assume user meant to disable DRAM repairs. } - // callout bad dimms - - if(processBadDimms( - i_mba, - badDimmMask)) + uint8_t rankMask = 0, dimmMask = 0; + if ( SUCCESS != mssRestoreDramRepairs(i_mba, rankMask, dimmMask) ) { - calloutMade = true; + // Can't check anything if this doesn't work. + PRDF_ERR( "["PRDF_FUNC"] mssRestoreDramRepairs() failed" ); + break; } - // check repaired ranks for - // RAS policy violations + // Callout DIMMs with too many bad bits and not enough repairs available + if ( RDR::processBadDimms(i_mba, dimmMask) ) calloutMade = true; - if(processRepairedRanks( - i_mba, - repairedRankMask)) - { - calloutMade = true; - } + // Check repaired ranks for RAS policy violations. + if ( RDR::processRepairedRanks(i_mba, rankMask) ) calloutMade = true; } while(0); - PRDF_EXIT( "restoreDramRepairs(0x%08x)", PlatServices::getHuid(i_mba) ); + PRDF_EXIT( PRDF_FUNC"(0x%08x)", getHuid(i_mba) ); return calloutMade ? FAIL : SUCCESS; + + #undef PRDF_FUNC } } // end namespace PRDF |