From 21ea16bf30b5f7d5d757a1a7454c0c51db0e79e4 Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Fri, 18 Aug 2017 10:04:05 -0500 Subject: PRD: Runtime TPS Analyze CEs Change-Id: I175c14c65df216ccfc81bf7d8b1fa3e5a05902b2 RTC: 171914 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44809 Tested-by: Jenkins Server Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Brian J. Stegmiller Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45389 Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins --- src/usr/diag/prdf/plat/mem/prdfMemTps.H | 18 ++ src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C | 483 ++++++++++++++++++++++++++++- 2 files changed, 496 insertions(+), 5 deletions(-) (limited to 'src/usr/diag/prdf/plat/mem') diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps.H b/src/usr/diag/prdf/plat/mem/prdfMemTps.H index e01ca9087..577de3d6a 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps.H @@ -97,6 +97,23 @@ class TpsEvent : public TdEntry STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ); + /** + * @brief Analyzes the counts that summarize the symbol CE counts. + * @param i_badDqCount Number of nibbles with a bad DQ + * @param i_badChipCount Number of nibbles with a bad Chip + * @param i_nonZeroSumCount Number of nibbles under threshold with a + * non-zero sum + * @param i_singleSymCount Number of nibbles under threshold with a + * single symbol count greater than one + * @param i_symList Vector of all symbols with counts > 0. + * @param io_sc The step code data struct. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + uint32_t analyzeCeSymbolCounts( CeCount i_badDqCount, + CeCount i_badChipCount, CeCount i_nonZeroSumCount, + CeCount i_singleSymCount, MemUtils::MaintSymbols i_symList, + STEP_CODE_DATA_STRUCT & io_sc ); + /** * @brief Gets the counts that summarize the symbol CE counts. * @param io_badDqCount Number of nibbles with a bad DQ @@ -121,6 +138,7 @@ class TpsEvent : public TdEntry */ uint32_t analyzeCe( STEP_CODE_DATA_STRUCT & io_sc ); + #endif // __HOSTBOOT_RUNTIME }; diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C index 17ec60c04..d1b24ce08 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C @@ -27,6 +27,7 @@ // Platform includes #include +#include #include #include #include @@ -240,6 +241,41 @@ void __analyzeNibbleSyms( MemUtils::MaintSymbols i_nibbleStats, //------------------------------------------------------------------------------ +template +uint32_t __updateVpdCountAboveOne( MemUtils::MaintSymbols i_symList, + MemDqBitmap & io_dqBitmap ) +{ + + #define PRDF_FUNC "[__updateVpdCountAboveOne] " + + uint32_t o_rc = SUCCESS; + + do + { + // Update VPD with all symbols that have a count greater than 1. This is + // so if we do TPS again, we'll callout again even if the symbol + // counters change. + for ( auto sym : i_symList ) + { + if ( sym.count > 1 ) + { + o_rc = io_dqBitmap.setSymbol( sym.symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "io_dqBitmap.setSymbol failed." ); + break; + } + } + } + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + template uint32_t TpsEvent::startTpsPhase1_rt( STEP_CODE_DATA_STRUCT & io_sc ) { @@ -411,6 +447,441 @@ uint32_t TpsEvent::analyzeEcc( const uint32_t & i_eccAttns, //------------------------------------------------------------------------------ +template<> +uint32_t TpsEvent::analyzeCeSymbolCounts( CeCount i_badDqCount, + CeCount i_badChipCount, CeCount i_nonZeroSumCount, + CeCount i_singleSymCount, MemUtils::MaintSymbols i_symList, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + + #define PRDF_FUNC "[TpsEvent::analyzeCeSymbolCounts] " + + uint32_t o_rc = SUCCESS; + + do + { + bool tpsFalseAlarm = false; + + // Get the Bad DQ Bitmap. + TargetHandle_t mcaTrgt = iv_chip->getTrgt(); + MemDqBitmap dqBitmap; + + o_rc = getBadDqBitmap(mcaTrgt, iv_rank, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getBadDqBitmap" + "(0x%08x, 0x%02x) failed", getHuid(mcaTrgt), + iv_rank.getKey() ); + break; + } + + // Get the symbol mark. + MemMark symMark; + o_rc = MarkStore::readSymbolMark( iv_chip, iv_rank, symMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readSymbolMark(0x%08x, 0x%02x) " + "failed", iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // Get the chip mark. + MemMark chipMark; + o_rc = MarkStore::readChipMark( iv_chip, iv_rank, chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x, 0x%02x) " + "failed", iv_chip->getHuid(), iv_rank.getKey() ); + break; + } + + // If the bad DQ nibble count is 0 and the bad chip nibble count is 0. + if ( 0 == i_badDqCount.count && 0 == i_badChipCount.count ) + { + // There is nothing to repair. Any other non-zero counts are + // considered acceptable noise. + // Set false alarm flag to true. + tpsFalseAlarm = true; + } + // If the bad DQ nibble count is 1 and the bad chip nibble count is 0. + else if ( 1 == i_badDqCount.count && 0 == i_badChipCount.count ) + { + // If the symbol mark is available. + if ( !symMark.isValid() ) + { + // If the non-zero sum nibble count is <= 1 or the single + // symbol nibble count is <= 2. + if (i_nonZeroSumCount.count <= 1 || i_singleSymCount.count <= 2) + { + // This means we have a potential future chip kill or + // TCE. Both are still correctable after a symbol mark + // is placed. + // Place a symbol mark on this bad DQ. + MemMark newSymMark( mcaTrgt, iv_rank, + i_badDqCount.symList[0].symbol ); + o_rc = MarkStore::writeSymbolMark( iv_chip, + iv_rank, newSymMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + // Update VPD with the symbol mark. + o_rc = dqBitmap.setSymbol( i_badDqCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + } + else + { + // Placing a symbol mark risks a UE. + // Update VPD with all symbols that have a count > 1. + o_rc = __updateVpdCountAboveOne( + i_symList, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + else + { + // Otherwise assume the symbol mark is fixing this bad DQ. + // Set the false alarm flag to true. + tpsFalseAlarm = true; + } + } + // Else if bad DQ nibble count is 2 and bad chip nibble count is 0. + else if ( 2 == i_badDqCount.count && 0 == i_badChipCount.count ) + { + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + + // If the symbol mark is available. + if ( !symMark.isValid() ) + { + // If the non-zero sum nibble count is <= 1 and the single + // symbol nibble count is <= 1. + if (i_nonZeroSumCount.count <= 1 && i_singleSymCount.count <= 1) + { + // This means we have only one more potential bad DQ, which + // is correctable after a symbol mark is placed. + // Place a symbol mark on this bad DQ with the highest count + MemUtils::SymbolData highSym; + for ( auto sym : i_badDqCount.symList ) + { + if ( sym.count > highSym.count ) + highSym = sym; + } + + MemMark newSymMark( mcaTrgt, iv_rank, + highSym.symbol ); + o_rc = MarkStore::writeSymbolMark( iv_chip, + iv_rank, newSymMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + // Update VPD with both symbols. + for ( auto sym : i_badDqCount.symList ) + { + o_rc = dqBitmap.setSymbol( sym.symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + } + if ( SUCCESS != o_rc ) break; + } + else + { + // Placing a symbol mark risks a UE. + // Update VPD with all symbols that have a count > 1. + o_rc = __updateVpdCountAboveOne( + i_symList, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + + } + else + { + // Otherwise assume the symbol mark is fixing a bad DQ. + // Update VPD with the unrepaired symbol. + for ( auto sym : i_badDqCount.symList ) + { + if ( sym.symbol == symMark.getSymbol() ) continue; + + o_rc = dqBitmap.setSymbol( sym.symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + } + if ( SUCCESS != o_rc ) break; + + // Set the false alarm flag to true. + tpsFalseAlarm = true; + } + + } + // Else if bad DQ nibble count is 0 and bad chip nibble count is 1 + else if ( 0 == i_badDqCount.count && 1 == i_badChipCount.count ) + { + // If the chip mark is available. + if ( !chipMark.isValid() ) + { + // If the non-zero sum nibble count is <= 1 and the single + // symbol nibble count is <= 1. + if (i_nonZeroSumCount.count <= 1 && i_singleSymCount.count <= 1) + { + // This means we have only one more potential bad DQ, which + // is still correctable after a chip mark is placed. + // Place a chip mark on this bad chip. + MemMark newChipMark( mcaTrgt, iv_rank, + i_badChipCount.symList[0].symbol ); + o_rc = MarkStore::writeChipMark( iv_chip, iv_rank, + newChipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + // Update VPD with the chip mark. + o_rc = dqBitmap.setDram( i_badChipCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setDram failed." ); + break; + } + } + else + { + // Placing a mark risks a UE. + // Update VPD with all symbols that have a count > 1. + o_rc = __updateVpdCountAboveOne( + i_symList, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + else + { + // Assume the chip mark is being used to fix the bad chip. + // Set the false alarm flag to true. + tpsFalseAlarm = true; + } + } + // Else if bad DQ nibble count is 1 and bad chip nibble count is 1 + else if ( 1 == i_badDqCount.count && 1 == i_badChipCount.count ) + { + // If neither chip nor symbol mark is available. + if ( chipMark.isValid() && symMark.isValid() ) + { + // Assume the chip and symbol marks are already being used to + // fix the bad chip and DQ and some other nibble under + // threshold triggered TPS. + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + // If the chip mark is available. + if ( !chipMark.isValid() ) + { + // If the non-zero sum nibble count is 0 + if ( 0 == i_nonZeroSumCount.count ) + { + // This means we have no more potential bad DQ or bad chips + // since we can't correct those after chip mark is placed. + // Place a chip mark on the bad chip. + MemMark newChipMark( mcaTrgt, iv_rank, + i_badChipCount.symList[0].symbol ); + o_rc = MarkStore::writeChipMark( iv_chip, iv_rank, + newChipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + // Update VPD with the chip mark. + o_rc = dqBitmap.setDram( i_badChipCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setDram failed." ); + break; + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + else + { + // Placing a chip mark risks a UE. + // Update VPD with all symbols that have a count > 1. + o_rc = __updateVpdCountAboveOne( + i_symList, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); + } + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + // If the symbol mark is available. + if ( !symMark.isValid() ) + { + // If the non-zero sum nibble count is 0 + if ( 0 == i_nonZeroSumCount.count ) + { + // This means we have no more potential bad DQ or bad chips + // since we can't correct those after symbol mark is placed. + // Place a symbol mark on this bad DQ. + MemMark newSymMark( mcaTrgt, iv_rank, + i_badDqCount.symList[0].symbol ); + o_rc = MarkStore::writeSymbolMark( iv_chip, + iv_rank, newSymMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeSymbolMark(0x%08x,0x%02x) " + "failed", iv_chip->getHuid(), getKey() ); + break; + } + + // Update VPD with the symbol mark. + o_rc = dqBitmap.setSymbol( i_badDqCount.symList[0].symbol ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "dqBitmap.setSymbol failed." ); + break; + } + + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + } + else + { + // Placing the symbol mark risks a UE. + // Update VPD with all symbols that have a count > 1. + o_rc = __updateVpdCountAboveOne( + i_symList, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); + } + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + } + + } + else + { + // There are enough errors that this could be a potential UE. + // Update VPD with all symbols that have a count > 1. + o_rc = __updateVpdCountAboveOne( + i_symList, dqBitmap ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__updateVpdCountAboveOne() failed." ); + } + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // Permanently mask mainline NCEs and TCEs. + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + } + + // If analysis resulted in a false alarm. + if ( tpsFalseAlarm ) + { + // Increase false alarm counter. + // If false alarm counter threshold of 3 per day is reached. + if ( __getTpsFalseAlarmCounter(iv_chip)->inc( iv_rank, + io_sc) ) + { + // Permanently mask mainline NCEs and TCEs + getMcaDataBundle(iv_chip)->iv_maskMainlineNceTce = true; + + // Make the error log predictive + io_sc.service_data->setServiceCall(); + } + } + + // We may have placed a chip mark, so if a symbol mark is being used on + // the same chip, undo the symbol mark after the chip mark is in place. + o_rc = MarkStore::balance( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // Write any updates to VPD. + o_rc = setBadDqBitmap(mcaTrgt, iv_rank, dqBitmap); + if ( SUCCESS != o_rc ) + { + PRDF_ERR(PRDF_FUNC "setBadDqBitmap" + "(0x%08x, 0x%02x) failed", getHuid(mcaTrgt), + iv_rank.getKey()); + break; + } + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + template<> uint32_t TpsEvent::getSymbolCeCounts( CeCount & io_badDqCount, CeCount & io_badChipCount, CeCount & io_nonZeroSumCount, @@ -524,9 +995,6 @@ uint32_t TpsEvent::analyzeCe( STEP_CODE_DATA_STRUCT & io_sc ) do { - - - // The symbol CE counts will be summarized in the following buckets: // Number of nibbles with a bad DQ // Number of nibbles with a bad chip @@ -553,7 +1021,13 @@ uint32_t TpsEvent::analyzeCe( STEP_CODE_DATA_STRUCT & io_sc ) } // Analyze the symbol CE counts. - //TODO RTC 171914 + o_rc = analyzeCeSymbolCounts( badDqCount, badChipCount, nonZeroSumCount, + singleSymCount, symList, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "analyzeCeSymbolCounts failed." ); + break; + } }while(0); @@ -565,7 +1039,6 @@ uint32_t TpsEvent::analyzeCe( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ -// TODO: RTC 171914 Actual implementation of this procedure will be done later. template<> uint32_t TpsEvent::nextStep( STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ) -- cgit v1.2.1