From 5c9a8bfc67a350c5caaed4d2cc9c7adae55c027e Mon Sep 17 00:00:00 2001 From: Zane Shelley Date: Wed, 5 Feb 2014 14:28:52 -0600 Subject: PRD: TPS support Change-Id: I63465b70623184d4bd6b509db656931057710579 Squashed: I69d487e7c5965d817cb4b86fbeb3fc08f3f0a8c8 Squashed: I1d3486b83ec1817af1c0ed71c7d19c249bca4c79 Squashed: Ia1dccf1d1418d26a049e223cd491e79687b1d04f Squashed: I41ed4ba671361abe9749344bb34e5d699fa5a718 Squashed: I37e82f9604c5a20552e9c94e910583ed094c68f5 Squashed: Ic3125d70979e029c0d4f2bd9f9647db2bc94ed5d Squashed: I2cbdf7eb254ce9055586f494ae859032e5759336 Squashed: I4f00d81457c392919547dd4569233c06d7bfc131 RTC: 87720 Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/7471 Tested-by: Jenkins Server Reviewed-by: Prem Shanker Jha Reviewed-by: Bilicon Patil Reviewed-by: Christopher T. Phan Reviewed-by: Sachin Gupta Reviewed-by: Zane Shelley Reviewed-by: A. Patrick Williams III Reviewed-by: BENJAMIN J. WEISENBECK Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/9140 Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/9154 --- .../framework/service/prdfPlatServices_common.C | 41 +++-- .../framework/service/prdfPlatServices_common.H | 27 ++-- .../prdf/common/plat/pegasus/prdfCalloutUtil.C | 30 +++- .../prdf/common/plat/pegasus/prdfCalloutUtil.H | 20 ++- .../prdf/common/plat/pegasus/prdfCenMbaCeTable.C | 13 +- .../prdf/common/plat/pegasus/prdfCenMbaCeTable.H | 15 +- .../prdf/common/plat/pegasus/prdfCenMbaExtraSig.H | 26 ++-- .../prdf/common/plat/pegasus/prdfCenMbaRceTable.C | 22 +-- .../common/plat/pegasus/prdfCenMbaTdCtlr_common.C | 108 +++++++------ .../common/plat/pegasus/prdfCenMbaTdCtlr_common.H | 22 ++- .../plat/pegasus/prdfCenMbaThresholds_common.C | 46 +++++- .../plat/pegasus/prdfCenMbaThresholds_common.H | 12 +- .../prdf/common/plat/pegasus/prdfCenMemUtils.C | 167 +++++++++++++++++---- .../prdf/common/plat/pegasus/prdfCenMemUtils.H | 68 ++++++++- .../diag/prdf/common/plat/pegasus/prdfCenMembuf.C | 66 ++++---- src/usr/diag/prdf/common/plugins/prdfCenLogParse.C | 10 +- .../diag/prdf/plat/pegasus/prdfCenMbaIplCeStats.C | 10 +- src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C | 24 ++- src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H | 5 +- 19 files changed, 537 insertions(+), 195 deletions(-) (limited to 'src/usr/diag') diff --git a/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.C b/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.C index ab837246c..4489bb0d7 100755 --- a/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.C +++ b/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.C @@ -1,11 +1,11 @@ /* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ -/* $Source: ./common/framework/service/prdfPlatServices_common.C $ */ +/* $Source: src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.C $ */ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2013 */ +/* COPYRIGHT International Business Machines Corp. 2012,2014 */ /* */ /* p1 */ /* */ @@ -439,7 +439,7 @@ int32_t mssGetMarkStore( TargetHandle_t i_mba, const CenRank & i_rank, //------------------------------------------------------------------------------ int32_t mssSetMarkStore( TargetHandle_t i_mba, const CenRank & i_rank, - const CenMark & i_mark, bool & o_writeBlocked, + CenMark & io_mark, bool & o_writeBlocked, bool i_allowWriteBlocked ) { #define PRDF_FUNC "[PlatServices::mssSetMarkStore] " @@ -448,19 +448,31 @@ int32_t mssSetMarkStore( TargetHandle_t i_mba, const CenRank & i_rank, errlHndl_t errl = NULL; - uint8_t symbolMark = i_mark.getSM().isValid() ? i_mark.getSM().getSymbol() - : MSS_INVALID_SYMBOL; - uint8_t chipMark = i_mark.getCM().isValid() ? i_mark.getCM().getDramSymbol() - : MSS_INVALID_SYMBOL; + uint8_t sm = io_mark.getSM().isValid() ? io_mark.getSM().getSymbol() + : MSS_INVALID_SYMBOL; + uint8_t cm = io_mark.getCM().isValid() ? io_mark.getCM().getDramSymbol() + : MSS_INVALID_SYMBOL; fapi::ReturnCode l_rc = mss_put_mark_store( getFapiTarget(i_mba), - i_rank.getMaster(), symbolMark, - chipMark ); + i_rank.getMaster(), sm, cm ); if ( i_allowWriteBlocked && fapi::RC_MSS_MAINT_MARKSTORE_WRITE_BLOCKED == l_rc ) { o_writeBlocked = true; + + // Read hardware and get the new chip mark. + CenMark hwMark; + o_rc = mssGetMarkStore( i_mba, i_rank, hwMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"mssGetMarkStore() failed." ); + } + else + { + // Update io_mark with the new chip mark. + io_mark.setCM( hwMark.getCM() ); + } } else { @@ -469,7 +481,7 @@ int32_t mssSetMarkStore( TargetHandle_t i_mba, const CenRank & i_rank, { PRDF_ERR( PRDF_FUNC"mss_put_mark_store() failed. HUID: 0x%08x " "rank: %d sm: %d cm: %d", getHuid(i_mba), - i_rank.getMaster(), symbolMark, chipMark ); + i_rank.getMaster(), sm, cm ); PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); o_rc = FAIL; } @@ -749,7 +761,8 @@ mss_MaintCmdWrapper * createMssCmd( mss_MaintCmdWrapper::CmdType i_cmdType, mss_MaintCmdWrapper * createMssCmd( mss_MaintCmdWrapper::CmdType i_cmdType, TargetHandle_t i_mba, const CenRank & i_rank, uint32_t i_stopCond, - uint32_t i_flags ) + uint32_t i_flags, + const CenAddr * i_sAddrOverride ) { mss_MaintCmdWrapper * o_cmd = NULL; @@ -768,6 +781,12 @@ mss_MaintCmdWrapper * createMssCmd( mss_MaintCmdWrapper::CmdType i_cmdType, i_rank.getSlave(), slaveOnly ); if ( SUCCESS != l_rc ) break; + // Override the start address, if needed. + if ( NULL != i_sAddrOverride ) + { + sAddr.setDoubleWord( 0, i_sAddrOverride->toReadAddr() ); + } + // Get the last address in memory, if needed. if ( allMemory ) { diff --git a/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.H b/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.H index 02e0c8ca5..2586e54c2 100755 --- a/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.H +++ b/src/usr/diag/prdf/common/framework/service/prdfPlatServices_common.H @@ -234,7 +234,10 @@ int32_t mssGetMarkStore( TARGETING::TargetHandle_t i_mba, * @brief Invokes the set mark store hardware procedure. * @param i_mba Target MBA. * @param i_rank Target rank. - * @param i_mark The mark to write. + * @param io_mark The mark to write. If hardware blocks the write + * to markstore and the block is allowed, io_mark + * will be updated with the new chip mark set by + * hardware. * @param o_writeBlocked TRUE if a blocke write is allowed and hardware * blocked the write to markstore. * @param i_allowWriteBlocked TRUE if a blocked write is allowed. This means @@ -248,7 +251,7 @@ int32_t mssGetMarkStore( TARGETING::TargetHandle_t i_mba, * @return Non-SUCCESS in internal function fails, SUCCESS otherwise. */ int32_t mssSetMarkStore( TARGETING::TargetHandle_t i_mba, - const CenRank & i_rank, const CenMark & i_mark, + const CenRank & i_rank, CenMark & io_mark, bool & o_writeBlocked, bool i_allowWriteBlocked = false ); @@ -417,20 +420,26 @@ class mss_MaintCmdWrapper /** * @brief Create a maintenance command object. - * @param i_cmdType Maintenance command type which we want to create. - * @param i_mba An MBA target. - * @param i_rank The first rank to start with (see enum CtrlFlags for - * more details). - * @param i_stopCond Bit mask for conditions in which to stop command. - * @param i_flags See enum CtrlFlags for details. + * @param i_cmdType Maintenance command type which we want to create. + * @param i_mba An MBA target. + * @param i_rank The first rank to start with (see enum CtrlFlags for + * more details). + * @param i_stopCond Bit mask for conditions in which to stop command. + * @param i_flags See enum CtrlFlags for details. + * @param i_sAddrOverride A non-NULL value indicates to use this start address + * and not the start address of i_rank. * @return A mss_MaintCmdWrapper object, NULL if an internal function failed. * @note This function allocates memory on heap for mss_MaintCmdWrapper * object. It is the caller's responsibilty to delete this object. + * @note By default this maintenance command will operate on the address range + * that contains i_rank, but the target address range can be modified + * with i_flags and/or i_sAddrOverride. */ mss_MaintCmdWrapper * createMssCmd( mss_MaintCmdWrapper::CmdType i_cmdType, TARGETING::TargetHandle_t i_mba, const CenRank & i_rank, uint32_t i_stopCond, - uint32_t i_flags = mss_MaintCmdWrapper::NO_FLAGS ); + uint32_t i_flags = mss_MaintCmdWrapper::NO_FLAGS, + const CenAddr * i_sAddrOverride = NULL ); } // end namespace PlatServices diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.C index 22557bcb1..10b069d25 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.C @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2013 */ +/* COPYRIGHT International Business Machines Corp. 2012,2014 */ /* */ /* p1 */ /* */ @@ -69,6 +69,34 @@ void calloutMark( TargetHandle_t i_mba, const CenRank & i_rank, //------------------------------------------------------------------------------ +void calloutSymbolData( TargetHandle_t i_mba, const CenRank & i_rank, + const MemUtils::MaintSymbols & i_symData, + STEP_CODE_DATA_STRUCT & io_sc, PRDpriority i_priority ) +{ + bool dimmsBad[PORT_SLCT_PER_MBA] = { false, false }; + + for ( MemUtils::MaintSymbols::const_iterator it = i_symData.begin(); + it != i_symData.end(); it++ ) + { + dimmsBad[it->symbol.getPortSlct()] = true; + } + + for ( uint32_t port = 0; port < PORT_SLCT_PER_MBA; port++ ) + { + if ( dimmsBad[port] ) + { + TargetHandleList list = getConnectedDimms( i_mba, i_rank, port ); + for ( TargetHandleList::iterator it = list.begin(); + it != list.end(); it++ ) + { + io_sc.service_data->SetCallout( *it, i_priority ); + } + } + } +} + +//------------------------------------------------------------------------------ + TargetHandleList getConnectedDimms( TargetHandle_t i_mba, const CenRank & i_rank ) { diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.H index 8d7540c46..0a58cc926 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCalloutUtil.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2013 */ +/* COPYRIGHT International Business Machines Corp. 2012,2014 */ /* */ /* p1 */ /* */ @@ -31,6 +31,7 @@ #include #include +#include namespace PRDF { @@ -63,6 +64,23 @@ void calloutMark( TARGETING::TargetHandle_t i_mba, const CenRank & i_rank, const CenMark & i_mark, STEP_CODE_DATA_STRUCT & io_sc, PRDpriority i_priority = MRU_MED ); +/** + * @brief Will add all DIMMs with symbols that exist in the given list to the + * callout list. + * @note We cannot just callout a MemoryMru for each symbol because there could + * be a lot more symbols in the list than we have room for in the PFA + * data section in the error log. + * @param i_mba Target MBA. + * @param i_rank Target rank. + * @param i_symData The list of symbols. + * @param io_sc The step code data struct. + * @param i_priority Callout priority (default MRU_MED). + */ +void calloutSymbolData( TARGETING::TargetHandle_t i_mba, const CenRank & i_rank, + const MemUtils::MaintSymbols & i_symData, + STEP_CODE_DATA_STRUCT & io_sc, + PRDpriority i_priority = MRU_MED ); + /** * @param i_mba The target MBA. * @param i_rank The target rank. diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.C index e58abb65b..311d94628 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.C @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -42,12 +42,13 @@ using namespace CE_TABLE; //------------------------------------------------------------------------------ bool CenMbaCeTable::addEntry( const CenAddr & i_addr, - const CenSymbol & i_symbol ) + const CenSymbol & i_symbol, bool i_isHard ) { bool o_doTps = false; TableData data ( i_addr, i_symbol.getDram(), i_symbol.getDramPins(), - i_symbol.getPortSlct(), i_symbol.getWiringType() ); + i_symbol.getPortSlct(), i_symbol.getWiringType(), + i_isHard ); // First, check if the entry already exists. If so, increment its count and // move it to the end of the queue. @@ -60,6 +61,9 @@ bool CenMbaCeTable::addEntry( const CenAddr & i_addr, // Update the DRAM pins data.dramPins |= it->dramPins; + // Check the hard CE status + if ( it->isHard ) data.isHard = true; + // Remove the old entry iv_table.erase( it ); } @@ -206,10 +210,11 @@ void CenMbaCeTable::addCapData( TargetHandle_t i_mbaTrgt, CaptureData & io_cd ) uint8_t col4_11 = col & 0x0ff; uint8_t active = it->active ? 1 : 0; + uint8_t isHard = it->isHard ? 1 : 0; data[sz_actData ] = it->count; data[sz_actData+1] = it->type << 4; // 4 spare bits - data[sz_actData+2] = (active << 6) | (it->dram & 0x3f); // 1 spare bit + data[sz_actData+2] = (isHard << 7) | (active << 6) | (it->dram & 0x3f); data[sz_actData+3] = it->dramPins; data[sz_actData+4] = (mrnk << 5) | (srnk << 2) | (svld << 1) | row0; data[sz_actData+5] = row1_8; diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.H index 64b312ce9..6bd9bc335 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCeTable.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -77,9 +77,11 @@ class CenMbaCeTable * * @param i_addr The address reporting the CE. * @param i_symbol The symbol reporting the CE. + * @param i_isHard TRUE if a hard CE was detected on this address/symbol. * @return TRUE if TPS is required, FALSE otherwise. */ - bool addEntry( const CenAddr & i_addr, const CenSymbol & i_symbol ); + bool addEntry( const CenAddr & i_addr, const CenSymbol & i_symbol, + bool i_isHard = false ); /** * @brief Deactivates all entries in the table. @@ -125,11 +127,12 @@ class CenMbaCeTable uint8_t dramPins; ///< The failing pins of the DRAM uint8_t portSlct; ///< The port select of the DRAM CenSymbol::WiringType type; ///< The wiring type + bool isHard; ///< TRUE if a hard CE was detected /** @brief Default constructor. */ TableData() : active(false), count(0), addr(), dram(0), dramPins(0), portSlct(0), - type(CenSymbol::WIRING_INVALID) + type(CenSymbol::WIRING_INVALID), isHard(false) {} /** @@ -140,9 +143,11 @@ class CenMbaCeTable * @param i_type The wiring type (for DRAM site locations). */ TableData( const CenAddr & i_addr, uint8_t i_dram, uint8_t i_dramPins, - uint8_t i_portSlct, CenSymbol::WiringType i_type ) : + uint8_t i_portSlct, CenSymbol::WiringType i_type, + bool i_isHard ) : active(true), count(1), addr(i_addr), dram(i_dram), - dramPins(i_dramPins), portSlct(i_portSlct), type(i_type) + dramPins(i_dramPins), portSlct(i_portSlct), type(i_type), + isHard(i_isHard) {} /** An entry is equivalent if the address and DRAM match. */ diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H index 3bc185574..d7a571250 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -37,16 +37,14 @@ PRDR_ERROR_SIGNATURE(StartTpsPhase1, 0xffff0004, "", "Starting TPS phase 1"); PRDR_ERROR_SIGNATURE(StartTpsPhase2, 0xffff0005, "", "Starting TPS phase 2"); PRDR_ERROR_SIGNATURE(EndTpsPhase2, 0xffff0006, "", "TPS Phase 2 completed"); PRDR_ERROR_SIGNATURE(StartRankScrub, 0xffff0007, "", "Starting rank scrub"); -PRDR_ERROR_SIGNATURE(EndRankScrub, 0xffff0008, "", "Rank scrub completed"); -PRDR_ERROR_SIGNATURE(MaintUE, 0xffff0010, "", "Maintenance UE"); -PRDR_ERROR_SIGNATURE(MaintMPE, 0xffff0011, "", "Maintenance MPE"); -PRDR_ERROR_SIGNATURE(MaintHARD_CTE, 0xffff0012, "", "Maintenance HARD CTE"); -PRDR_ERROR_SIGNATURE(MaintSOFT_CTE, 0xffff0013, "", "Maintenance SOFT CTE"); -PRDR_ERROR_SIGNATURE(MaintINTER_CTE, 0xffff0014, "", "Maintenance INTER CTE"); -PRDR_ERROR_SIGNATURE(MaintRETRY_CTE, 0xffff0015, "", "Maintenance RETRY CTE"); -PRDR_ERROR_SIGNATURE(MaintNCE_CTE, 0xffff0016, "", - "Maintenance SOFT/HARD/INTER CTE"); +PRDR_ERROR_SIGNATURE(MaintUE, 0xffff0010, "", "Maintenance UE"); +PRDR_ERROR_SIGNATURE(MaintMPE, 0xffff0011, "", "Maintenance MPE"); +PRDR_ERROR_SIGNATURE(MaintHARD_CTE, 0xffff0012, "", "Maintenance HARD CTE"); +PRDR_ERROR_SIGNATURE(MaintSOFT_CTE, 0xffff0013, "", "Maintenance SOFT CTE"); +PRDR_ERROR_SIGNATURE(MaintINTER_CTE, 0xffff0014, "", "Maintenance INTER CTE"); +PRDR_ERROR_SIGNATURE(MaintRETRY_CTE, 0xffff0015, "", "Maintenance RETRY CTE"); +PRDR_ERROR_SIGNATURE(MaintNCE_CTE, 0xffff0016, "", "Maint SOFT/INTER CTE"); PRDR_ERROR_SIGNATURE(VcmVerified, 0xffff0020, "", "VCM: verified"); PRDR_ERROR_SIGNATURE(VcmFalseAlarm, 0xffff0021, "", "VCM: false alarm"); @@ -75,4 +73,12 @@ PRDR_ERROR_SIGNATURE(MnfgIplDramCTE, 0xffff0052, "", "MNFG IPL DRAM CTE"); PRDR_ERROR_SIGNATURE(MnfgIplRankCTE, 0xffff0053, "", "MNFG IPL half-rank CTE"); PRDR_ERROR_SIGNATURE(MnfgIplDsCTE, 0xffff0054, "", "MNFG IPL DIMM CTE"); +PRDR_ERROR_SIGNATURE(TpsFalseAlarm, 0xffff0061, "", "TPS: false alarm"); +PRDR_ERROR_SIGNATURE(TpsFalseAlarmExceeded, 0xffff0062, "", + "TPS: false alarm threshold exceeded"); +PRDR_ERROR_SIGNATURE(TpsSymbolMark, 0xffff0063, "", "TPS: symbol mark placed"); +PRDR_ERROR_SIGNATURE(TpsChipMark, 0xffff0064, "", "TPS: chip mark placed"); +PRDR_ERROR_SIGNATURE(TpsMarksUnavail, 0xffff0065, "", + "TPS: No more marks available"); + #endif // __prdfCenMbaExtraSig_H diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaRceTable.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaRceTable.C index a53c7f178..8585de592 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaRceTable.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaRceTable.C @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -48,20 +48,22 @@ bool CenMbaRceTable::addEntry( const CenRank & i_rank , bool o_doTps = false; RceTable::iterator it = iv_table.find( i_rank ); - if( iv_table.end() == it) + if ( iv_table.end() == it ) { - // TODO via RTC 89386 - // PrdfCacheCETable implementation is not quite efficient. Need to - // find better way. + // TODO via RTC 89386 PrdfCacheCETable implementation is not very + // efficient. Need to find a better way. + PrdfCacheCETable entry( getRceThreshold() ); - // Insert the element and get the iterator - it = iv_table.insert( std::make_pair( i_rank, entry)).first; + + // Add a new rank entry to the table and get the iterator. + it = iv_table.insert( std::make_pair(i_rank, entry) ).first; } - for( uint32_t i = 0; i < i_count; i++ ) + + for ( uint32_t i = 0; i < i_count; i++ ) { // Insert all entries even if threshold is crossed // for better FFDC. - o_doTps |= it->second.addAddress(0 , i_sc ); + o_doTps |= it->second.addAddress( 0, i_sc ); } return o_doTps; @@ -72,7 +74,7 @@ bool CenMbaRceTable::addEntry( const CenRank & i_rank , void CenMbaRceTable::flushEntry( const CenRank & i_rank ) { RceTable::iterator it = iv_table.find( i_rank ); - if( iv_table.end() != it) + if ( iv_table.end() != it ) it->second.flushTable(); } //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C index 5e61aa9c7..ff116a44a 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.C @@ -30,6 +30,7 @@ #include #include #include +#include using namespace TARGETING; @@ -75,6 +76,9 @@ int32_t CenMbaTdCtlrCommon::initialize() o_rc = FAIL; break; } + // Set iv_x4Dimm. + iv_x4Dimm = isDramWidthX4(iv_mbaTrgt); + } while (0); return o_rc; @@ -129,7 +133,7 @@ int32_t CenMbaTdCtlrCommon::cleanupPrevCmd() //------------------------------------------------------------------------------ -int32_t CenMbaTdCtlrCommon::prepareNextCmd() +int32_t CenMbaTdCtlrCommon::prepareNextCmd( bool i_clearStats ) { #define PRDF_FUNC "[CenMbaTdCtlrCommon::prepareNextCmd] " @@ -152,32 +156,38 @@ int32_t CenMbaTdCtlrCommon::prepareNextCmd() // Clear ECC counters //---------------------------------------------------------------------- - const char * reg_str = (0 == iv_mbaPos) ? "MBA0_MBSTR" : "MBA1_MBSTR"; - SCAN_COMM_REGISTER_CLASS * mbstr = iv_membChip->getRegister( reg_str ); + const char * reg_str = NULL; - // MBSTR's content could be modified from cleanupCmd() - // so we need to refresh - o_rc = mbstr->ForceRead(); - if ( SUCCESS != o_rc ) + if ( i_clearStats ) { - PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); - break; - } + reg_str = (0 == iv_mbaPos) ? "MBA0_MBSTR" : "MBA1_MBSTR"; + SCAN_COMM_REGISTER_CLASS * mbstr = + iv_membChip->getRegister( reg_str ); - mbstr->SetBit(53); // Setting this bit clears all counters. + // MBSTR's content could be modified from cleanupCmd() + // so we need to refresh + o_rc = mbstr->ForceRead(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"ForceRead() failed on %s", reg_str ); + break; + } - o_rc = mbstr->Write(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); - break; - } + mbstr->SetBit(53); // Setting this bit clears all counters. - // Hardware automatically clears bit 53, so flush this register out of - // the register cache to avoid clearing the counters again with a write - // from the out-of-date cached copy. - RegDataCache & cache = RegDataCache::getCachedRegisters(); - cache.flush( iv_membChip, mbstr ); + o_rc = mbstr->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); + break; + } + + // Hardware automatically clears bit 53, so flush this register out + // of the register cache to avoid clearing the counters again with + // a write from the out-of-date cached copy. + RegDataCache & cache = RegDataCache::getCachedRegisters(); + cache.flush( iv_membChip, mbstr ); + } //---------------------------------------------------------------------- // Clear ECC FIRs @@ -188,7 +198,7 @@ int32_t CenMbaTdCtlrCommon::prepareNextCmd() SCAN_COMM_REGISTER_CLASS * firand = iv_membChip->getRegister( reg_str ); firand->setAllBits(); - // Clear all MPE bits. + // Clear all scrub MPE bits. // This will need to be done when starting a TD procedure or background // scrubbing. iv_rank may not be set when starting background scrubbing // and technically there should only be one of these MPE bits on at a @@ -196,7 +206,7 @@ int32_t CenMbaTdCtlrCommon::prepareNextCmd() // clearing them all. firand->SetBitFieldJustified( 20, 8, 0 ); - // Clear NCE, SCE, MCE, RCE, SUE, UE bits (36-41) + // Clear scrub NCE, SCE, MCE, RCE, SUE, UE bits (36-41) firand->SetBitFieldJustified( 36, 6, 0 ); o_rc = firand->Write(); @@ -364,6 +374,7 @@ int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) int32_t o_rc = SUCCESS; iv_isEccSteer = false; + do { if ( VCM_PHASE_2 != iv_tdState ) @@ -388,8 +399,8 @@ int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) if ( iv_mark.getCM().getDram() == iv_mark.getSM().getDram() ) { iv_mark.clearSM(); - bool junk; - o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, junk ); + bool blocked; // Won't be blocked because chip mark is in place. + o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, blocked ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"mssSetMarkStore() failed" ); @@ -428,16 +439,14 @@ int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) PRDF_ERR( PRDF_FUNC"getDimmSpareConfig() failed" ); break; } - bool isX4Dimm = isDramWidthX4( iv_mbaTrgt ); - // Chaeck if DRAM spare is present. - // Also eccspare is available on all X4 DIMMS. - if ( ( ENUM_ATTR_EFF_DIMM_SPARE_NO_SPARE != spareConfig ) || isX4Dimm ) + // Check if DRAM spare is present. Also, ECC spares are available on all + // x4 DIMMS. + if ( ( ENUM_ATTR_EFF_DIMM_SPARE_NO_SPARE != spareConfig ) || iv_x4Dimm ) { - // It is possible that a Centaur DIMM does not have spare DRAMs. - // Check the VPD for available spares. Note that a x4 DIMM have - // DRAM spare and eccspare, so check for availability on both. + // Check the VPD for available spares. Note that a x4 DIMM has + // DRAM spares and ECC spares, so check for availability on both. bool dramSparePossible = false; o_rc = bitmap.isDramSpareAvailable( ps, dramSparePossible ); if ( SUCCESS != o_rc ) @@ -457,9 +466,9 @@ int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) break; } - // If spare DRAM is bad, HW can not steer another DRAM even - // if it is available ( e.g. ecc Spare ). So if chip mark is on - // spare DRAM, update VPD and make predictive callout. + // If spare DRAM is bad, HW can not steer another DRAM even + // if it is available ( e.g. ECC spare ). So if chip mark is on + // spare DRAM, update VPD and make predictive callout. if ( ( iv_mark.getCM().getDram() == (0 == ps ? sp0.getDram() : sp1.getDram()) ) || ( iv_mark.getCM().getDram() == ecc.getDram() )) @@ -501,7 +510,7 @@ int32_t CenMbaTdCtlrCommon::handleMCE_VCM2( STEP_CODE_DATA_STRUCT & io_sc ) // A spare DRAM is available. startDsdProcedure = true; } - else if( isDramWidthX4 ( iv_mbaTrgt ) && !ecc.isValid() ) + else if ( isDramWidthX4(iv_mbaTrgt) && !ecc.isValid() ) { startDsdProcedure = true; iv_isEccSteer = true; @@ -643,30 +652,35 @@ int32_t CenMbaTdCtlrCommon::setRtEteThresholds() o_rc = mbstr->ForceRead(); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); + PRDF_ERR( PRDF_FUNC"ForceRead() failed on %s", reg_str ); break; } - // TODO: RTC 88720 The soft and intermittent CE thresholds will be - // calculated based on the per DRAM threshold similar to the IPL - // CE analysis. - uint32_t softIntCe = 1; + uint16_t softIntCe = 0; + o_rc = getScrubCeThreshold( iv_mbaChip, iv_rank, softIntCe ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"getScrubCeThreshold() failed." ); + break; + } // Only care about retry CEs if there are a lot of them. So the // threshold will be high in the field. However, in MNFG the retry CEs // will be handled differently by putting every occurrence in the RCE // table and doing targeted diagnostics when needed. - uint32_t retryCe = mfgMode() ? 1 : 2047; + uint16_t retryCe = mfgMode() ? 1 : 2047; - uint32_t hardCe = 1; // Always stop on first occurrence. + uint16_t hardCe = 1; // Always stop on first occurrence. mbstr->SetBitFieldJustified( 4, 12, softIntCe ); mbstr->SetBitFieldJustified( 16, 12, softIntCe ); mbstr->SetBitFieldJustified( 28, 12, hardCe ); mbstr->SetBitFieldJustified( 40, 12, retryCe ); - // Set the per symbol counters to count soft, intermittent, and hard CEs - mbstr->SetBitFieldJustified( 55, 3, 0x7 ); + // Set the per symbol counters to count hard CEs only. This is so that + // when the scrub stops on the first hard CE, we can use the per symbol + // counters to tell us which symbol reported the hard CE. + mbstr->SetBitFieldJustified( 55, 3, 0x1 ); o_rc = mbstr->Write(); if ( SUCCESS != o_rc ) @@ -727,7 +741,7 @@ void CenMbaTdCtlrCommon::setTdSignature( STEP_CODE_DATA_STRUCT & io_sc, HUID mbaId = iv_mbaChip->GetId(); (io_sc.service_data->GetErrorSignature())->setChipId(mbaId); io_sc.service_data->SetErrorSig( i_sig ); - } + } // end namespace PRDF diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H index 5dcf9d8d2..aa8b9df43 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaTdCtlr_common.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -156,16 +156,19 @@ class CenMbaTdCtlrCommon * log for the trigger will be committed and a trace statement will be made * indicating which rank and TD procedure was requested. * - * @param io_sc The step code data struct. - * @param i_rank The rank in which the event occurred. - * @param i_event The event type (see enum TdType). + * @param io_sc The step code data struct. + * @param i_rank The rank in which the event occurred. + * @param i_event The event type (see enum TdType). + * @param i_banTps TRUE to ban any future TPS requests for this rank, + * default FALSE. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. * @note If no TD procedures are in progress, it will stop background * scrub and start the next TD procedure. */ virtual int32_t handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc, const CenRank & i_rank, - const TdType i_event ) = 0; + const TdType i_event, + bool i_banTps = false ) = 0; protected: // functions /** @@ -300,10 +303,14 @@ class CenMbaTdCtlrCommon /** * @brief Preforms cleanup tasks that need to be done before starting the * next maintenance command (i.e. clear scrub counter). + * @param i_clearStats TRUE to clear all scrub statistics (default), FALSE + * otherwise. This is useful when we need to resume + * background scrubbing on the next address and we + * don't want to clear all of the scrub statistics. * @note Will call cleanupPrevCmd() as part of the preparations. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ - virtual int32_t prepareNextCmd(); + virtual int32_t prepareNextCmd( bool i_clearStats = true ); /** * @brief Clears FIR bits that may have been a side-effect of a chip mark @@ -387,6 +394,9 @@ class CenMbaTdCtlrCommon /** The position number (0-1) relative to the connected MEMBUF. */ uint32_t iv_mbaPos; + /** TRUE if DIMM has x4 DRAMs, FALSE if DIMM has x8 DRAMs. */ + bool iv_x4Dimm; + /** Indicates if TD controller is initialized. */ bool iv_initialized; diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.C index b17347150..63ed2720b 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.C @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -42,26 +42,32 @@ namespace PRDF using namespace PlatServices; -// MNFG RCE threshold +// Non MNFG RCE threshold static uint32_t MBA_RCE_NON_MNFG_TH = 8; -//----------------------------------------------------------------------------- +// Non MNFG Scrub soft/intermittent CE threshold +static uint32_t MBA_SCRUB_CE_NON_MNFG_TH = 80; + +//------------------------------------------------------------------------------ ThresholdResolution::ThresholdPolicy getRceThreshold() { uint32_t th = MBA_RCE_NON_MNFG_TH; + if ( mfgMode() ) { - th = MfgThresholdMgr::getInstance()-> - getThreshold( PRDF_CEN_MBA_RT_RCE_PER_RANK ); + th = MfgThresholdMgr::getInstance()-> + getThreshold( PRDF_CEN_MBA_RT_RCE_PER_RANK ); if( th > MBA_RCE_NON_MNFG_TH ) th = MBA_RCE_NON_MNFG_TH; } + return ThresholdResolution::ThresholdPolicy( th, - ThresholdResolution::ONE_DAY); + ThresholdResolution::ONE_DAY ); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ + int32_t getMnfgMemCeTh( ExtensibleChip * i_mbaChip, const CenRank & i_rank, uint16_t & o_cePerDram, uint16_t & o_cePerHalfRank, uint16_t & o_cePerDimm ) @@ -127,5 +133,31 @@ int32_t getMnfgMemCeTh( ExtensibleChip * i_mbaChip, const CenRank & i_rank, #undef PRDF_FUNC } +//------------------------------------------------------------------------------ + +int32_t getScrubCeThreshold( ExtensibleChip * i_mbaChip, const CenRank & i_rank, + uint16_t & o_thr ) +{ + #define PRDF_FUNC "[getScrubCeThreshold] " + + int32_t o_rc = SUCCESS; + + o_thr = MBA_SCRUB_CE_NON_MNFG_TH; + + if ( mfgMode() ) + { + uint16_t junk1 = 0; + uint16_t junk2 = 0; + + o_rc = getMnfgMemCeTh( i_mbaChip, i_rank, o_thr, junk1, junk2 ); + if ( SUCCESS != o_rc ) + PRDF_ERR( PRDF_FUNC"getMnfgMemCeTh() failed" ); + } + + return o_rc; + + #undef PRDF_FUNC +} + } // end namespace PRDF diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.H index 32f6423d8..67a6f8e5e 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaThresholds_common.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -56,6 +56,16 @@ int32_t getMnfgMemCeTh( ExtensibleChip * i_mbaChip, const CenRank & i_rank, */ ThresholdResolution::ThresholdPolicy getRceThreshold(); +/** + * @brief Returns scrub soft/intermittent CEs during runtime. + * @param i_mbaChip MBA chip. + * @param i_rank The rank for which the threshold is needed. + * @param o_thr Scrub soft/intermittent CE threshold. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ +int32_t getScrubCeThreshold( ExtensibleChip * i_mbaChip, const CenRank & i_rank, + uint16_t & o_thr ); + } // end namespace PRDF #endif /* __PRDF_CEN_MBA_COMMON_THRESHOLDS_H */ diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.C index 8dc55931b..2a36179fc 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.C @@ -53,77 +53,182 @@ static const char *mbsCeStatReg[][ CE_REGS_PER_MBA ] = { "MBA1_MBSSYMEC6", "MBA1_MBSSYMEC7", "MBA1_MBSSYMEC8" } }; -int32_t collectCeStats( ExtensibleChip *i_mbaChip, MaintSymbols &o_maintStats, - const CenRank & i_rank ) +//------------------------------------------------------------------------------ + +int32_t collectCeStats( ExtensibleChip * i_mbaChip, const CenRank & i_rank, + MaintSymbols & o_maintStats, CenSymbol & o_highestDram, + uint8_t i_thr ) { #define PRDF_FUNC "[MemUtils::collectCeStats] " + int32_t o_rc = SUCCESS; + do { + if ( 0 == i_thr ) // Must be non-zero + { + PRDF_ERR( PRDF_FUNC"i_thr %d is invalid", i_thr ); + o_rc = FAIL; break; + } + TargetHandle_t mbaTrgt = i_mbaChip->GetChipHandle(); CenMbaDataBundle * mbadb = getMbaDataBundle( i_mbaChip ); ExtensibleChip * membufChip = mbadb->getMembChip(); if ( NULL == membufChip ) { - PRDF_ERR( PRDF_FUNC"getMembChip() failed: MBA=0x%08x", - getHuid(mbaTrgt) ); + PRDF_ERR( PRDF_FUNC"getMembChip() failed" ); o_rc = FAIL; break; } + uint8_t mbaPos = getTargetPosition( mbaTrgt ); + if ( MAX_MBA_PER_MEMBUF <= mbaPos ) + { + PRDF_ERR( PRDF_FUNC"mbaPos %d is invalid", mbaPos ); + o_rc = FAIL; break; + } + + bool isX4 = isDramWidthX4(mbaTrgt); - for( uint8_t regIdx = 0 ; regIdx < CE_REGS_PER_MBA; regIdx++) + // Use this map to keep track of the total counts per DRAM. + typedef std::map DramCount; + DramCount dramCounts; + + const char * reg_str = NULL; + SCAN_COMM_REGISTER_CLASS * reg = NULL; + + for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_MBA; regIdx++ ) { - SCAN_COMM_REGISTER_CLASS * ceReg = membufChip->getRegister( - mbsCeStatReg[mbaPos][regIdx] ); + reg_str = mbsCeStatReg[mbaPos][regIdx]; + reg = membufChip->getRegister( reg_str ); - if( NULL == ceReg ) - { - PRDF_ERR( PRDF_FUNC"getRegister() Failed for register:%s", - mbsCeStatReg[mbaPos][regIdx]); - break; - } - o_rc = ceReg->Read(); + o_rc = reg->Read(); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC"%s Read() failed. Target=0x%08x", - mbsCeStatReg[mbaPos][regIdx], getHuid(mbaTrgt) ); + PRDF_ERR( PRDF_FUNC"Read() failed on %s", reg_str ); break; } - uint8_t baseSymbol = SYMBOLS_PER_CE_REG*regIdx; - for(uint8_t i = 0 ; i < SYMBOLS_PER_CE_REG; i++) + + uint8_t baseSymbol = SYMBOLS_PER_CE_REG * regIdx; + + for ( uint8_t i = 0; i < SYMBOLS_PER_CE_REG; i++ ) { - uint8_t synCount = ceReg->GetBitFieldJustified( (i*8), 8 ); + uint8_t count = reg->GetBitFieldJustified( (i*8), 8 ); - if ( 0 == synCount) - { - continue; - } - else + if ( 0 == count ) continue; // nothing to do + + uint8_t sym = baseSymbol + i; + uint8_t dram = CenSymbol::symbol2Dram( sym, isX4 ); + + // Keep track of the total DRAM counts. + dramCounts[dram] += count; + + // Add any symbols that have exceeded threshold to the list. + if ( i_thr <= count ) { SymbolData symData; symData.symbol = CenSymbol::fromSymbol( mbaTrgt, i_rank, - baseSymbol+i, CenSymbol::BOTH_SYMBOL_DQS ); + sym, CenSymbol::BOTH_SYMBOL_DQS ); if ( !symData.symbol.isValid() ) { - PRDF_ERR( PRDF_FUNC"CenSymbol() failed" ); + PRDF_ERR( PRDF_FUNC"CenSymbol() failed: symbol=%d", + sym ); o_rc = FAIL; break; } else { - symData.count = synCount; + symData.count = count; o_maintStats.push_back( symData ); } } } - if( FAIL == o_rc) break; + if ( SUCCESS != o_rc ) break; } - if( FAIL == o_rc) break; - }while(0); + if ( SUCCESS != o_rc ) break; + + if ( o_maintStats.empty() ) break; // no need to continue + + // Sort the list of symbols. + std::sort( o_maintStats.begin(), o_maintStats.end(), sortSymDataCount ); + + // Get the DRAM with the highest count. + DramCount::iterator highestEntry = dramCounts.begin(); + DramCount::iterator it = highestEntry; ++it; // sets it to next entry + for ( ; it != dramCounts.end(); ++it ) + { + if ( highestEntry->second < it->second ) + highestEntry = it; + } + + uint8_t sym = CenSymbol::dram2Symbol( highestEntry->first, isX4 ); + o_highestDram = CenSymbol::fromSymbol( mbaTrgt, i_rank, sym ); + + } while(0); + + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Failed: i_mbaChip=0x%08x i_rank=m%ds%d i_thr=%d", + i_mbaChip->GetId(), i_rank.getMaster(), i_rank.getSlave(), + i_thr ); + } + return o_rc; + #undef PRDF_FUNC } +//------------------------------------------------------------------------------ + +int32_t clearPerSymbolCounters( ExtensibleChip * i_membChip, uint32_t i_mbaPos ) +{ + #define PRDF_FUNC "[MemUtils::clearPerSymbolCounters] " + + int32_t o_rc = SUCCESS; + + do + { + if ( MAX_MBA_PER_MEMBUF <= i_mbaPos ) + { + PRDF_ERR( PRDF_FUNC"i_mbaPos %d is invalid", i_mbaPos ); + o_rc = FAIL; + break; + } + + const char * reg_str = NULL; + SCAN_COMM_REGISTER_CLASS * reg = NULL; + + for ( uint8_t regIdx = 0; regIdx < CE_REGS_PER_MBA; regIdx++ ) + { + reg_str = mbsCeStatReg[i_mbaPos][regIdx]; + reg = i_membChip->getRegister( reg_str ); + + reg->clearAllBits(); + + o_rc = reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on %s", reg_str ); + break; + } + } + + if ( SUCCESS != o_rc ) break; + + } while(0); + + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC"Failed. i_membChip=0x%08x i_mbaPos=%d", + i_membChip->GetId(), i_mbaPos ); + } + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + int32_t getDramSize( ExtensibleChip *i_mbaChip, uint8_t & o_size ) { #define PRDF_FUNC "[MemUtils::getDramSize] " @@ -233,7 +338,7 @@ int32_t chnlCsCleanup( ExtensibleChip *i_mbChip, SCAN_COMM_REGISTER_CLASS * iomcMask = procChip->getRegister( iomcFirMask); - if ( pos >=4 ) pos -= 4; + if ( pos >= 4 ) pos -= 4; // 8 bits are reserved for each Centaur in IOMCFIR. // There are total 4 ( for P system ) centaur supported diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.H index 799219a5c..b2a8292f2 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMemUtils.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -77,15 +77,69 @@ struct SymbolData typedef std::vector MaintSymbols; +/** Functor for MaintSymbols to sort by count. */ +inline bool sortSymDataCount( const SymbolData & i, const SymbolData & j ) +{ + return i.count < j.count; +} + +/** Functor for MaintSymbols to match a DRAM. */ +class MatchSymDataDram +{ + public: + explicit MatchSymDataDram( const CenSymbol & i_d ) : iv_d(i_d.getDram()) {} + + bool operator() ( const SymbolData & i_data ) const + { return ( iv_d == i_data.symbol.getDram() ); } + + private: + uint8_t iv_d; +}; + +/** Functor for MaintSymbols to match a symbol. */ +class MatchSymDataSymbol +{ + public: + explicit MatchSymDataSymbol( const CenSymbol & i_s ) : iv_s(i_s) {} + + bool operator() ( const SymbolData & i_data ) const + { return ( iv_s == i_data.symbol ); } + + private: + CenSymbol iv_s; +}; + /** - * @brief Checks CE stats on a MBA. - * @param i_mbaChip MBA chip. - * @param o_maintStats Output vector. Contains symbols with CE count. - * @param i_rank Rank for which stat collection will be done. + * @brief Queries the per symbol counters and returns a sorted list of symbols + * with a count greater than or equal to the given threshold. + * @note The returned list will be sorted from lowest count to highest count. + * @param i_mbaChip Target MBA chip. + * @param i_rank Target rank. + * @param o_maintStats Returns the list of symbols and counts. + * @param o_highestDram Returns a symbol representing the DRAM with the highest + * total count. This includes counts from DRAM that are + * under the given threshold. This value is undefined if + * o_maintStats is empty. + * @param i_thr The count threshold. Each symbol count must be greater + * than or equal to this value to be added to the list. + * The default is 1, which means all non-zero counts will + * be added to the list. A value of 0 will result in a bad + * return code. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ +int32_t collectCeStats( ExtensibleChip * i_mbaChip, const CenRank & i_rank, + MaintSymbols & o_maintStats, CenSymbol & o_highestDram, + uint8_t i_thr = 1 ); + +/** + * @brief Clears the per symbol CE counters on a target MBA (via MEMBUF and MBA + * position number). + * @param i_membChip MEMBUF chip. + * @param i_mbaPos MBA positions number (0-1). * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. */ -int32_t collectCeStats( ExtensibleChip * i_mbaChip, MaintSymbols & o_maintStats, - const CenRank & i_rank ); +int32_t clearPerSymbolCounters( ExtensibleChip * i_membChip, + uint32_t i_mbaPos ); /** * @brief Gets DRAM size for an MBA. diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMembuf.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMembuf.C index e0cf9e8b1..9924ab215 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMembuf.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMembuf.C @@ -391,17 +391,18 @@ PRDF_PLUGIN_DEFINE( Membuf, checkSpareBit ); //############################################################################## /** - * @brief MBSECCFIR[0-7,20:27] - Fetch/Maintenance Mark Placed Event (MPE). + * @brief MBSECCFIR[0:7] - Fetch Mark Placed Event (MPE). * @param i_membChip A Centaur chip. * @param i_sc The step code data struct. * @param i_mbaPos The MBA position. * @param i_rank The target rank. * @return SUCCESS */ -int32_t AnalyzeMpe( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, - uint32_t i_mbaPos, uint8_t i_rank ) +int32_t AnalyzeFetchMpe( ExtensibleChip * i_membChip, + STEP_CODE_DATA_STRUCT & i_sc, + uint32_t i_mbaPos, uint8_t i_rank ) { - #define PRDF_FUNC "[AnalyzeMpe] " + #define PRDF_FUNC "[AnalyzeFetchMpe] " int32_t l_rc = SUCCESS; @@ -563,7 +564,9 @@ int32_t AnalyzeFetchNce( ExtensibleChip * i_membChip, CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); bool doTps = mbadb->iv_ceTable.addEntry( addr, symbol ); - if ( mfgMode() ) + // Check MNFG thresholds, if needed. No need to check if a TPS + // request is already needed. + if ( !doTps && mfgMode() ) { // Get the MNFG CE thresholds. uint16_t dramTh, hrTh, dimmTh; @@ -583,32 +586,38 @@ int32_t AnalyzeFetchNce( ExtensibleChip * i_membChip, if ( dramTh < dramCount ) { - i_sc.service_data->SetErrorSig( PRDFSIG_MnfgDramCte ); i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgDramCte ); - i_sc.service_data->SetServiceCall(); + doTps = true; } if ( hrTh < hrCount ) { - i_sc.service_data->SetErrorSig( PRDFSIG_MnfgHrCte ); i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgHrCte ); - i_sc.service_data->SetServiceCall(); + doTps = true; } if ( dimmTh < dimmCount ) { - i_sc.service_data->SetErrorSig( PRDFSIG_MnfgDimmCte ); i_sc.service_data->AddSignatureList( mbaTrgt, PRDFSIG_MnfgDimmCte ); - i_sc.service_data->SetServiceCall(); + doTps = true; } } // Initiate a TPS procedure, if needed. if ( doTps ) { + #ifdef __HOSTBOOT_MODULE + // Will not be able to do TPS during hostboot so make the error + // log predictive in MNFG mode. Note that we will still call + // handleTdEvent() so we can get the trace statement indicating + // TPS was requested during Hostboot. + if ( mfgMode() ) + i_sc.service_data->SetServiceCall(); + #endif + l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::TPS_EVENT ); if ( SUCCESS != l_rc ) @@ -643,14 +652,14 @@ int32_t AnalyzeFetchNce( ExtensibleChip * i_membChip, /** * @brief Fetch Retry CE / Prefetch UE Errors. * @param i_membChip A Centaur chip. - * @param i_sc The step code data struct. - * @param i_mbaPos The MBA position. - * @param isRceError True for RCE error false otherwise. + * @param i_sc The step code data struct. + * @param i_mbaPos The MBA position. + * @param i_isRceError True for RCE error false otherwise. * @return SUCCESS */ int32_t AnalyzeFetchRcePue( ExtensibleChip * i_membChip, STEP_CODE_DATA_STRUCT & i_sc, uint32_t i_mbaPos, - bool isRceError ) + bool i_isRceError ) { #define PRDF_FUNC "[AnalyzeFetchRcePue] " @@ -671,7 +680,7 @@ int32_t AnalyzeFetchRcePue( ExtensibleChip * i_membChip, CenMbaDataBundle * mbadb = getMbaDataBundle( mbaChip ); CenAddr addr; - if( isRceError ) + if ( i_isRceError ) l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_RCE_ADDR, addr ); else l_rc = getCenReadAddr( i_membChip, i_mbaPos, READ_UE_ADDR, addr ); @@ -681,17 +690,17 @@ int32_t AnalyzeFetchRcePue( ExtensibleChip * i_membChip, PRDF_ERR( PRDF_FUNC"getCenReadAddr() failed" ); break; } - CenRank rank = addr.getRank(); - // Callout the rank and the attached MBA. + + // Callout the rank. MemoryMru memmru ( mbaChip->GetChipHandle(), rank, MemoryMruData::CALLOUT_RANK ); i_sc.service_data->SetCallout( memmru ); - // Add the entry to rce table and take action as per rce table rules. - if ( mbadb->iv_rceTable.addEntry( rank , i_sc )) + // Add an entry to the RCE table. + if ( mbadb->iv_rceTable.addEntry(rank, i_sc) ) { - // Tell TD controller to handle TPS event. + // Add a TPS request to the queue TD queue. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, CenMbaTdCtlrCommon::TPS_EVENT ); if ( SUCCESS != l_rc ) @@ -700,6 +709,7 @@ int32_t AnalyzeFetchRcePue( ExtensibleChip * i_membChip, break; } } + } while (0); // Add ECC capture data for FFDC. @@ -708,8 +718,9 @@ int32_t AnalyzeFetchRcePue( ExtensibleChip * i_membChip, if ( SUCCESS != l_rc ) { - PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d", - i_membChip->GetId(), i_mbaPos ); + PRDF_ERR( PRDF_FUNC"Failed: i_membChip=0x%08x i_mbaPos=%d " + "i_isRceError=%c", i_membChip->GetId(), i_mbaPos, + i_isRceError ? 'T' : 'F' ); CalloutUtil::defaultError( i_sc ); } @@ -769,9 +780,11 @@ int32_t AnalyzeFetchUe( ExtensibleChip * i_membChip, MemoryMruData::CALLOUT_RANK ); i_sc.service_data->SetCallout( memmru ); - // Add a TPS request to the TD queue. + // Add a TPS request to the TD queue and ban any further TPS requests + // for this rank. l_rc = mbadb->iv_tdCtlr.handleTdEvent( i_sc, rank, - CenMbaTdCtlrCommon::TPS_EVENT ); + CenMbaTdCtlrCommon::TPS_EVENT, + true ); if ( SUCCESS != l_rc ) { PRDF_ERR( PRDF_FUNC"handleTdEvent() failed: rank=m%ds%d", @@ -806,7 +819,6 @@ int32_t AnalyzeFetchUe( ExtensibleChip * i_membChip, * @param i_chip The Centaur chip. * @param i_sc ServiceDataColector. * @return SUCCESS. - */ int32_t ClearMbsSecondaryBits( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & i_sc ) @@ -1090,7 +1102,7 @@ PLUGIN_FETCH_RCE_PREUE_ERROR( PreUe, 1, false ) int32_t AnalyzeFetchMpe##MBA##_##RANK( ExtensibleChip * i_membChip, \ STEP_CODE_DATA_STRUCT & i_sc ) \ { \ - return AnalyzeMpe( i_membChip, i_sc, MBA, RANK ); \ + return AnalyzeFetchMpe( i_membChip, i_sc, MBA, RANK ); \ } \ PRDF_PLUGIN_DEFINE( Membuf, AnalyzeFetchMpe##MBA##_##RANK ); diff --git a/src/usr/diag/prdf/common/plugins/prdfCenLogParse.C b/src/usr/diag/prdf/common/plugins/prdfCenLogParse.C index 1d5a73fe6..51c35b589 100644 --- a/src/usr/diag/prdf/common/plugins/prdfCenLogParse.C +++ b/src/usr/diag/prdf/common/plugins/prdfCenLogParse.C @@ -231,10 +231,10 @@ bool parseMemCeTable( uint8_t * i_buffer, uint32_t i_buflen, i_parser.PrintNumber( " MEM_CE_TABLE", "%d", entries ); - const char * hh = " A Count Type"; + const char * hh = " A H Count Type"; const char * hd = "Rank Bank Row Column DRAM Pins"; i_parser.PrintString( hh, hd ); - hh = " - ----- -------------"; + hh = " - - ----- -------------"; hd = "---- ---- ------- ------ ---- ----"; i_parser.PrintString( hh, hd ); @@ -245,6 +245,7 @@ bool parseMemCeTable( uint8_t * i_buffer, uint32_t i_buflen, uint32_t count = i_buffer[idx ]; // 8-bit uint32_t type = i_buffer[idx+1] >> 4; // 4-bit + uint8_t isHard = (i_buffer[idx+2] >> 7) & 0x1; // 1-bit uint8_t active = (i_buffer[idx+2] >> 6) & 0x1; // 1-bit uint8_t dram = i_buffer[idx+2] & 0x3f; // 6-bit @@ -266,6 +267,7 @@ bool parseMemCeTable( uint8_t * i_buffer, uint32_t i_buflen, uint32_t col = (col0_3 << 8) | col4_11; // 12-bit char active_char = ( 1 == active ) ? 'Y':'N'; + char isHard_char = ( 1 == isHard ) ? 'Y':'N'; const char * type_str = "UNKNOWN "; // 13 characters switch ( type ) @@ -287,8 +289,8 @@ bool parseMemCeTable( uint8_t * i_buffer, uint32_t i_buflen, } char header[HEADER_SIZE] = { '\0' }; - snprintf( header, HEADER_SIZE, " %c 0x%02x %s", active_char, - count, type_str ); + snprintf( header, HEADER_SIZE, " %c %c 0x%02x %s", active_char, + isHard_char, count, type_str ); char data[DATA_SIZE] = { '\0' }; snprintf( data, DATA_SIZE, "%s 0x%01x 0x%05x 0x%03x %2d 0x%02x", diff --git a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaIplCeStats.C b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaIplCeStats.C index 98b21f0e0..235e34db6 100755 --- a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaIplCeStats.C +++ b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaIplCeStats.C @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -95,8 +95,8 @@ int32_t CenMbaIplCeStats::collectStats( const CenRank & i_stopRank ) int32_t o_rc = SUCCESS; do { - MemUtils::MaintSymbols symData; - o_rc = MemUtils::collectCeStats( iv_mbaChip, symData, i_stopRank); + MemUtils::MaintSymbols symData; CenSymbol junk; + o_rc = MemUtils::collectCeStats( iv_mbaChip, i_stopRank, symData, junk); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"MemUtils::collectCeStats() failed. MBA:0X%08X", @@ -201,8 +201,8 @@ int32_t CenMbaIplCeStats::calloutHardCes( const CenRank & i_stopRank ) int32_t o_rc = SUCCESS; do { - MemUtils::MaintSymbols symData; - o_rc = MemUtils::collectCeStats( iv_mbaChip, symData, i_stopRank); + MemUtils::MaintSymbols symData; CenSymbol junk; + o_rc = MemUtils::collectCeStats( iv_mbaChip, i_stopRank, symData, junk); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"MemUtils::collectCeStats() failed. MBA:0X%08X", diff --git a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C index af6734855..966d437f5 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C +++ b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.C @@ -173,7 +173,8 @@ int32_t CenMbaTdCtlr::handleCmdCompleteEvent( STEP_CODE_DATA_STRUCT & io_sc ) int32_t CenMbaTdCtlr::handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc, const CenRank & i_rank, - const CenMbaTdCtlrCommon::TdType i_event ) + const CenMbaTdCtlrCommon::TdType i_event, + bool i_banTps ) { #define PRDF_FUNC "[CenMbaTdCtlr::handleTdEvent] " @@ -182,8 +183,9 @@ int32_t CenMbaTdCtlr::handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc, // request. Note that any VCM request will eventually be found during the // initialization of the runtime TD controller. PRDF_INF( PRDF_FUNC"TD request found during Hostboot: iv_mbaChip=0x%08x " - "i_rank=M%dS%d i_event=%d", iv_mbaChip->GetId(), - i_rank.getMaster(), i_rank.getSlave(), i_event ); + "i_rank=M%dS%d i_event=%d i_banTps=%c", iv_mbaChip->GetId(), + i_rank.getMaster(), i_rank.getSlave(), i_event, + i_banTps ? 'T' : 'F' ); return SUCCESS; @@ -517,8 +519,8 @@ int32_t CenMbaTdCtlr::analyzeVcmPhase2( STEP_CODE_DATA_STRUCT & io_sc ) // Remove chip mark from hardware. iv_mark.clearCM(); - bool junk; - o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, junk ); + bool blocked; // not possible during MDIA + o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, blocked ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"mssSetMarkStore() failed" ); @@ -647,8 +649,8 @@ int32_t CenMbaTdCtlr::analyzeDsdPhase2( STEP_CODE_DATA_STRUCT & io_sc ) // Remove chip mark from hardware. iv_mark.clearCM(); - bool junk; - o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, junk ); + bool blocked; // not possible during MDIA + o_rc = mssSetMarkStore( iv_mbaTrgt, iv_rank, iv_mark, blocked ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC"mssSetMarkStore() failed" ); @@ -720,6 +722,10 @@ int32_t CenMbaTdCtlr::analyzeTpsPhase1( STEP_CODE_DATA_STRUCT & io_sc ) } else { + // No error found so add rank to callout list, just in case. + MemoryMru memmru (iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK); + io_sc.service_data->SetCallout( memmru ); + // Start TPS Phase 2 o_rc = startTpsPhase2( io_sc ); if ( SUCCESS != o_rc ) @@ -791,6 +797,10 @@ int32_t CenMbaTdCtlr::analyzeTpsPhase2( STEP_CODE_DATA_STRUCT & io_sc ) } else { + // No error found so add rank to callout list, just in case. + MemoryMru memmru (iv_mbaTrgt, iv_rank, MemoryMruData::CALLOUT_RANK); + io_sc.service_data->SetCallout( memmru ); + io_sc.service_data->AddSignatureList( iv_mbaTrgt, PRDFSIG_EndTpsPhase2 ); iv_tdState = NO_OP; diff --git a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H index f314f0fe9..67849adc6 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H +++ b/src/usr/diag/prdf/plat/pegasus/prdfCenMbaTdCtlr.H @@ -5,7 +5,7 @@ /* */ /* IBM CONFIDENTIAL */ /* */ -/* COPYRIGHT International Business Machines Corp. 2013 */ +/* COPYRIGHT International Business Machines Corp. 2013,2014 */ /* */ /* p1 */ /* */ @@ -63,7 +63,8 @@ class CenMbaTdCtlr : public CenMbaTdCtlrCommon int32_t handleCmdCompleteEvent( STEP_CODE_DATA_STRUCT & io_sc ); int32_t handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc, - const CenRank & i_rank, const TdType i_event ); + const CenRank & i_rank, const TdType i_event, + bool i_banTps = false ); private: // Overloaded functions -- cgit v1.2.1