diff options
author | Caleb Palmer <cnpalmer@us.ibm.com> | 2017-03-29 15:44:39 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2017-04-10 12:05:03 -0400 |
commit | 4a0692811d82840b2c6eed9c892c9a866586b989 (patch) | |
tree | 36821da007432dd8b7b22c04c2899165b3dced2c /src/usr/diag/prdf | |
parent | 6e5d170315054ed04c7a3ba7144990fe3860e882 (diff) | |
download | talos-hostboot-4a0692811d82840b2c6eed9c892c9a866586b989.tar.gz talos-hostboot-4a0692811d82840b2c6eed9c892c9a866586b989.zip |
PRD: Add support for memory IMPE analysis
Change-Id: Icb2e6b8f4c87c38a47f46ecb5d97d36533296ba9
RTC: 165384
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/38902
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/39002
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf')
9 files changed, 173 insertions, 5 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
index d2746f138..491b1f63b 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
@@ -832,6 +832,113 @@ uint32_t analyzeMaintIue<TYPE_MCA, McaDataBundle*>(ExtensibleChip * i_chip,
 
 //------------------------------------------------------------------------------
 
+template<TARGETING::TYPE T, typename D>
+uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
+{
+
+    #define PRDF_FUNC "[MemEcc::analyzeImpe] "
+
+    PRDF_ASSERT( T == i_chip->getType() );
+
+    uint32_t o_rc = SUCCESS;
+
+    #ifdef __HOSTBOOT_MODULE
+
+    do
+    {
+        // get data bundle from chip
+        D db = static_cast<D>( i_chip->getDataBundle() );
+
+        // get the mark shadow register
+        SCAN_COMM_REGISTER_CLASS * msr = i_chip->getRegister("MSR");
+
+        o_rc = msr->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "Read() failed on MSR: i_chip=0x%08x",
+                      i_chip->getHuid() );
+            break;
+        }
+
+        TargetHandle_t trgt = i_chip->getTrgt();
+
+        // get galois field code - bits 8:15 of MSR
+        uint8_t galois = msr->GetBitFieldJustified( 8, 8 );
+
+        // get rank - bits 16:18 of MSR
+        uint8_t mrnk = msr->GetBitFieldJustified( 16, 3 );
+        MemRank rank( mrnk );
+
+        // get symbol and DRAM
+        MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois );
+        uint8_t dram = symbol.getDram();
+
+        // Add the DIMM to the callout list
+        MemoryMru memmru( trgt, rank, MemoryMruData::CALLOUT_RANK );
+        io_sc.service_data->SetCallout( memmru );
+
+        // Initialize the threshold first. Any iv_impeThMap[rank] access below
+        // would otherwise default-insert an entry and skip getImpeTh().
+        if ( 0 == db->iv_impeThMap.count(rank) )
+        {
+            db->iv_impeThMap[rank] = TimeBasedThreshold( getImpeTh() );
+        }
+
+        // if more than one dram reports an IMPE on a rank within the timebase
+        // of the threshold we make the error log predictive
+
+        // clear our vector of drams if the threshold time has elapsed
+        if ( db->iv_impeThMap[rank].timeElapsed(io_sc) )
+        {
+            db->iv_impeDramMap[rank].clear();
+        }
+
+        // if this DRAM hasn't already reported an IMPE on this rank
+        if ( std::find( db->iv_impeDramMap[rank].begin(),
+                        db->iv_impeDramMap[rank].end(), dram ) ==
+                        db->iv_impeDramMap[rank].end() )
+        {
+            // if there is another DRAM reporting an IMPE on this rank as well
+            if ( 0 != db->iv_impeDramMap[rank].size() )
+            {
+                // Make the error log predictive
+                io_sc.service_data->setServiceCall();
+            }
+
+            // add the DRAM to the map
+            db->iv_impeDramMap[rank].push_back( dram );
+        }
+
+        // increment count for the given rank - check if at threshold
+        if ( db->iv_impeThMap[rank].inc(io_sc) )
+        {
+            // place a chip mark on the failing DRAM
+            MemMark chipMark( trgt, rank, galois );
+            o_rc = MarkStore::writeChipMark<T>( i_chip, rank, chipMark );
+            if ( SUCCESS != o_rc )
+            {
+                PRDF_ERR( PRDF_FUNC "MarkStore::writeChipMark(0x%08x,m%ds%d) "
+                          "failed", i_chip->getHuid(), rank.getMaster(),
+                          rank.getSlave() );
+                break;
+            }
+        }
+
+    }while(0);
+
+    #endif
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+
+template
+uint32_t analyzeImpe<TYPE_MCA, McaDataBundle*>( ExtensibleChip * i_chip,
+                                                STEP_CODE_DATA_STRUCT & io_sc );
+
+//------------------------------------------------------------------------------
+
 } // end namespace MemEcc
 
 } // end namespace PRDF
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H
index 4855abeb6..1ddc31d55 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H
@@ -120,6 +120,15 @@ template<TARGETING::TYPE T, typename D>
 uint32_t analyzeMaintIue( ExtensibleChip * i_chip,
                           STEP_CODE_DATA_STRUCT & io_sc );
 
+/**
+ * @brief  Analyzes a maint or mainline IMPE attention.
+ * @param  i_chip MCA or MBA.
+ * @param  io_sc  The step code data struct.
+ * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
+ */
+template<TARGETING::TYPE T, typename D>
+uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc );
+
 #ifdef __HOSTBOOT_RUNTIME
 
 /**
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.C b/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.C
index 136635d94..e0cc1790f 100755
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.C
@@ -46,6 +46,7 @@ using namespace PlatServices;
 enum DefaultThresholds
 {
     MCA_RCD_PARITY_NON_MNFG_TH = 32, ///< Non-MNFG RCD parity error TH
+    MCA_IMPE_NON_MNFG_TH = 32, ///< Non-MNFG IMPE TH
     MCA_IUE_NON_MNFG_TH = 8, ///< Non-MNFG IUE TH
     MBA_RCE_NON_MNFG_TH = 8, ///< Non-MNFG RCE TH
     MBA_SCRUB_CE_NON_MNFG_TH = 80, ///< Non-MNFG Scrub soft/inter CE TH
@@ -114,6 +115,26 @@ ThresholdResolution::ThresholdPolicy getIueTh()
 
 //------------------------------------------------------------------------------
 
+#ifdef __HOSTBOOT_MODULE
+
+ThresholdResolution::ThresholdPolicy getImpeTh()
+{
+    uint32_t th = MCA_IMPE_NON_MNFG_TH;
+
+    if ( mfgMode() )
+    {
+        th = MfgThresholdMgr::getInstance()->
+                            getThreshold( ATTR_MNFG_TH_MEMORY_IMPES );
+    }
+
+    return ThresholdResolution::ThresholdPolicy( th,
+                                            ThresholdResolution::ONE_DAY );
+}
+
+#endif
+
+//------------------------------------------------------------------------------
+
 ThresholdResolution::ThresholdPolicy getRceThreshold()
 {
     uint32_t th = MBA_RCE_NON_MNFG_TH;
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.H b/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.H
index 35acf95cf..1c3769ea1 100755
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemThresholds.H
@@ -65,6 +65,11 @@ ThresholdResolution::ThresholdPolicy getRcdParityTh();
  */
 ThresholdResolution::ThresholdPolicy getIueTh();
 
+/**
+ * @brief Returns IMPE threshold policy.
+ */
+ThresholdResolution::ThresholdPolicy getImpeTh();
+
 #endif
 
 /**
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfP9McaDataBundle.H b/src/usr/diag/prdf/common/plat/mem/prdfP9McaDataBundle.H
index 9dd9f632a..ed505c1d7 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfP9McaDataBundle.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfP9McaDataBundle.H
@@ -123,6 +123,12 @@ class McaDataBundle : public DataBundle
     /** Threshold table for IUEs. Threshold per DIMM */
     std::map<uint8_t, TimeBasedThreshold> iv_iueTh;
 
+    /** Threshold table for IMPEs. Threshold per rank */
+    std::map<MemRank, TimeBasedThreshold> iv_impeThMap;
+
+    /** Map to keep track of which DRAMs on which ranks have reported IMPEs */
+    std::map<MemRank, std::vector<uint8_t>> iv_impeDramMap;
+
     #endif
 
 };
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C b/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C
index 6a064bf87..cac225d23 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfP9Mca_common.C
@@ -257,6 +257,21 @@ PRDF_PLUGIN_DEFINE( p9_mca, AnalyzeMaintIue );
 
 //------------------------------------------------------------------------------
 
+/**
+ * @brief  MCAECCFIR[19,39] - Mainline and Maint IMPE
+ * @param  i_chip MCA chip.
+ * @param  io_sc  The step code data struct.
+ * @return SUCCESS
+ */
+int32_t AnalyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
+{
+    MemEcc::analyzeImpe<TYPE_MCA, McaDataBundle *>( i_chip, io_sc );
+    return SUCCESS; // nothing to return to rule code
+}
+PRDF_PLUGIN_DEFINE( p9_mca, AnalyzeImpe );
+
+//------------------------------------------------------------------------------
+
 } // end namespace p9_mca
 
 } // end namespace PRDF
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca.rule
index 4f54882b0..cb7b1b83b 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_mca.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_mca.rule
@@ -424,7 +424,7 @@ group gMCAECCFIR filter singlebit, cs_root_cause( 14, 17, 37 )
     /** MCAECCFIR[19]
      *  Mainline read IMPE
      */
-    (rMCAECCFIR, bit(19)) ? mainline_impe_handling;
+    (rMCAECCFIR, bit(19)) ? impe_handling;
 
     /** MCAECCFIR[20:27]
      *  Maintenance MPE
@@ -489,7 +489,7 @@ group gMCAECCFIR filter singlebit, cs_root_cause( 14, 17, 37 )
     /** MCAECCFIR[39]
      *  Maintenance IMPE
      */
-    (rMCAECCFIR, bit(39)) ? maintenance_impe_handling;
+    (rMCAECCFIR, bit(39)) ? impe_handling;
 
     /** MCAECCFIR[40]
      *  spare
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule
index d3ebc436e..95d591c49 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule
@@ -85,8 +85,6 @@ actionclass mainline_iue_handling
     funccall("AnalyzeMainlineIue");
 };
 
-actionclass mainline_impe_handling { TBDDefaultCallout; }; # TODO RTC 165384
-
 /** Handle Maintenance IUEs */
 actionclass maintenance_iue_handling
 {
@@ -95,7 +93,7 @@ actionclass maintenance_iue_handling
     funccall("AnalyzeMaintIue");
 };
 
-actionclass maintenance_impe_handling { TBDDefaultCallout; }; # TODO RTC 165384
+actionclass impe_handling { funccall("AnalyzeImpe"); };
 
 /** MCA/UE algroithm, threshold 5 per day */
 actionclass mca_ue_algorithm_th_5perDay
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule
index bf2fd3fd1..d146517d1 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule
@@ -220,3 +220,10 @@
         capture group default;
     };
 
+    register MSR
+    {
+        name "P9 Mark Shadow Register";
+        scomaddr 0x07010A0C;
+        capture group default;
+    };
+
|