diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2017-07-25 16:26:23 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2017-07-31 11:16:54 -0400 |
commit | fadc1f7542d63ef55f383cf922db86d4f5e48ffe (patch) | |
tree | ec739300a13675a4d25117c1d38e4519b3e645a3 /src/usr/diag/prdf/plat | |
parent | e7955db9ace86b83313545014a805c9678034839 (diff) | |
download | talos-hostboot-fadc1f7542d63ef55f383cf922db86d4f5e48ffe.tar.gz talos-hostboot-fadc1f7542d63ef55f383cf922db86d4f5e48ffe.zip |
PRD: consistent handling for IMPE and VCM
Change-Id: Ic30829c48e54448c9a4f828dc24afe0e4d4d6bf0
CQ: SW394364
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/43611
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/43798
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf/plat')
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H | 219 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm.H | 47 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C | 34 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C | 85 |
4 files changed, 308 insertions, 77 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H b/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H index fd38ff832..9b497f8be 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H @@ -26,8 +26,6 @@ #ifndef __prdfMemTdFalseAlarm_H #define __prdfMemTdFalseAlarm_H -#ifdef __HOSTBOOT_RUNTIME // All of this code is runtime only - // Framework includes #include <iipServiceDataCollector.h> #include <prdfThresholdUtils.H> @@ -38,58 +36,213 @@ namespace PRDF { -/** - * @brief At runtime, we have to keep a false alarm threshold for Targeted - * Diagnostics to avoid flooding of intermittent errors. - * - * This class is intented to be a static class variable for each TD event class - * that requires this type of thresholding. It will contain a map for each chip - * and unique key within each chip to the threshold container. Note that the key - * could be different per TD event class. For example, VCM events will use only - * the master rank, where TPS events will use both the master and slave rank. - */ +/** @brief This is used at runtime to keep track of false alarm for Targeted + * Diagnostics to avoid flooding of intermittent errors. */ class TdFalseAlarm { - public: + public: // functions + + /** @brief Default destructor */ + virtual ~TdFalseAlarm() = default; + + /** + * @brief Queries if the threshold timer has expired. + * @param i_rank The rank with the false alarm. + * @param io_sc The step code data struct. + * @return True if the threshold timer has expired, false otherwise. + */ + bool elapsed( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) + { + uint32_t key = init( i_rank, io_sc ); + return iv_thMap[key].timeElapsed( io_sc ); + } + + /** + * @brief Queries the current false alarm count. + * @param i_rank The rank with the false alarm. + * @param io_sc The step code data struct. + * @return The current false alarm count. + */ + uint8_t count( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) + { + uint32_t key = init( i_rank, io_sc ); + return iv_thMap[key].getCount(); + } + + protected: // functions /** * @brief Constructor. - * @param i_th Threshold value for all entries in the map. - * @param i_int Threshold interval for all entries in the map. + * @param i_th Threshold for each entry in the threshold map. */ - TdFalseAlarm( uint8_t i_th, uint32_t i_int ) : - iv_thVal(i_th), iv_thInt(i_int) - {} + explicit TdFalseAlarm( TimeBasedThreshold i_th ) : iv_th(i_th) {} + + /** + * @brief The key could be different per TD procedure. For example, VCM + * events will use only the master rank, where TPS events will use + * both the master and slave rank. + * @param i_rank The rank with the false alarm. + * @return The key value for this rank. + */ + virtual uint32_t getKey( MemRank i_rank ) const = 0; + + /** + * @brief Initializes data specific to each child class. + * @param i_rank The rank with the false alarm. + * @param io_sc The step code data struct. + * @return The key for this rank (see getKey()). + * @note This function should be called before accessing the instance + * variables to ensure this key is initialized with the correct + * threshold. + * @note The default is to call initThMap(). Each child class that + * overloads this function should also call initThMap() to ensure + * the threshold map is initialized properly. + */ + virtual uint32_t init( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) + { + return initThMap( i_rank ); + } + + /** + * @brief Initializes this key in the threshold map. + * @param i_rank The rank with the false alarm. + * @return The key for this rank (see getKey()). + * @note This function should only be called from init(). + */ + uint32_t initThMap( MemRank i_rank ) + { + uint32_t key = getKey( i_rank ); + + // Create a new entry if an entry does not exist. + if ( iv_thMap.end() == iv_thMap.find(key) ) iv_thMap[key] = iv_th; + + return key; + } /** * @brief Increments the false alarm count. - * @param i_chip Target chip. - * @param i_key Key relative to the chip. + * @param i_rank The rank with the false alarm. * @param io_sc The step code data struct. * @return True if false alarm count has reached threshold, false otherwise. */ - bool inc( ExtensibleChip * i_chip, uint32_t i_key, - STEP_CODE_DATA_STRUCT & io_sc ) + bool incThMap( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) { - // Create a new entry if an entry does not exist. - if ( iv_map[i_chip].end() == iv_map[i_chip].find(i_key) ) - iv_map[i_chip][i_key] = TimeBasedThreshold( iv_thVal, iv_thInt ); + uint32_t key = init( i_rank, io_sc ); + return iv_thMap[key].inc( io_sc ); + } + + private: // instance variables + + /** Threshold for each entries in the map. */ + const TimeBasedThreshold iv_th; - return iv_map[i_chip][i_key].inc( io_sc ); + /** A map containing the thresholds for each key. */ + std::map<uint32_t, TimeBasedThreshold> iv_thMap; +}; + +/** @brief A false alarm class specific to VCM procedures. */ +class VcmFalseAlarm : public TdFalseAlarm +{ + public: + + /** + * @brief Constructor. + * @param i_th Threshold for each entry in the map. + */ + VcmFalseAlarm( TimeBasedThreshold i_th ) : TdFalseAlarm(i_th) {} + + /** + * @brief Increments the false alarm count and stores the DRAM. + * @param i_rank The rank with the false alarm. + * @param i_dram The DRAM with the false alarm. + * @param io_sc The step code data struct. + * @return True if false alarm count has reached threshold, false otherwise. + */ + bool inc( MemRank i_rank, uint8_t i_dram, STEP_CODE_DATA_STRUCT & io_sc ) + { + // Increment the count and determine whether threshold is reached or + // not. Note that incThMap() calls init() and initializes all maps. + bool isTh = incThMap( i_rank, io_sc ); + + // Add the DRAM to the list. Note that the value for each DRAM is not + // important. The only reason to use a map verses a vector is to ensure + // unique entries in the list. + iv_dramMap[getKey(i_rank)][i_dram] = 1; + + return isTh; + } + + /** + * @param i_rank The rank with the false alarm. + * @param io_sc The step code data struct. + * @return True there is more than one DRAM on this rank and the timer has + * not elapsed, false otherwise. + */ + bool queryDrams( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) + { + uint32_t key = init( i_rank, io_sc ); // will clear list if time elapsed + + // Return true if there is more than one DRAM on this rank. + return ( 1 < iv_dramMap[key].size() ); + } + + private: // functions + + // Overloaded from parent class. + uint32_t getKey( MemRank i_rank ) const + { + return MemRank(i_rank.getMaster()).getKey(); // master only + } + + // Overloaded from parent class. + uint32_t init( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) + { + uint32_t key = initThMap( i_rank ); + + // Clear out the list of DRAMs if the threshold time has elapsed. + if ( elapsed(i_rank, io_sc) ) iv_dramMap[key].clear(); + + return key; } - private: + private: // instance variables + + /** A map to keep track of which DRAMs per rank have reported chip marks. */ + std::map< uint32_t, std::map<uint8_t, uint8_t> > iv_dramMap; +}; + +/** @brief A false alarm class specific to TPS procedures. */ +class TpsFalseAlarm : public TdFalseAlarm +{ + public: + + /** + * @brief Constructor. + * @param i_th Threshold for each entry in the map. + */ + TpsFalseAlarm( TimeBasedThreshold i_th ) : TdFalseAlarm(i_th) {} + + /** + * @brief Increments the false alarm count. + * @param i_rank The rank with the false alarm. + * @param io_sc The step code data struct. + * @return True if false alarm count has reached threshold, false otherwise. + */ + bool inc( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) + { + return incThMap( i_rank, io_sc ); + } - const uint8_t iv_thVal; ///< Threshold value for all entries in the map. - const uint32_t iv_thInt; ///< Threshold interval for all entries in the map. + private: // functions - /** A nested map containing the thresholds for each chip and key. */ - std::map< ExtensibleChip *, std::map<uint32_t,TimeBasedThreshold> > iv_map; + // Overloaded from parent class. + uint32_t getKey( MemRank i_rank ) const + { + return i_rank.getKey(); // both master and slave + } }; } // end namespace PRDF -#endif // __HOSTBOOT_RUNTIME - #endif // __prdfMemTdFalseAlarm_H diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H index 9c39040cd..c42598935 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H @@ -260,32 +260,20 @@ class VcmEvent : public TdEntry io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_VcmVerified ); - do + if ( PlatServices::areDramRepairsDisabled() ) { - // If DRAM repairs are disabled, make the error log predictive. - if ( PlatServices::areDramRepairsDisabled() ) - { - io_sc.service_data->setServiceCall(); - break; // Nothing more to do. - } - - // If there is a symbol mark on the same DRAM as the newly verified - // chip mark, remove the symbol mark. - o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc ); + // Make the error log predictive, nothing else to do. + io_sc.service_data->setServiceCall(); + } + else + { + // Leave the chip mark in place and do any necessary cleanup. + o_rc = cleanup( io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", - iv_chip->getHuid(), getKey() ); - break; + PRDF_ERR( PRDF_FUNC "cleanup() failed" ); } - - // Set the entire chip in DRAM Repairs VPD. - // TODO: RTC 169939 - - // Add a DRAM sparing procedure to the queue, if supported. - // TODO: RTC 157888 - - } while (0); + } return o_rc; @@ -293,6 +281,16 @@ class VcmEvent : public TdEntry } /** + * @brief Cleanup required when a chip mark is left in place (i.e. chip + * mark verified or false alarm threshold). Will balance the chip + * and symbol marks, set VPD, and initiate DRAM sparing if + * supported. + * @param io_sc The step code data struct. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ + uint32_t cleanup( STEP_CODE_DATA_STRUCT & io_sc ); + + /** * @brief Verification failed. Do additional processing such as removing * the chip mark and false alarm threshold handling. * @param io_sc The step code data struct. @@ -303,11 +301,6 @@ class VcmEvent : public TdEntry private: // instance variables const MemMark iv_mark; ///< The chip mark from hardware. - - #ifdef __HOSTBOOT_RUNTIME - /** False alarm counter for all instances of this class. */ - static TdFalseAlarm cv_falseAlarm; - #endif }; } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C index e331d736c..50cfec50f 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C @@ -81,6 +81,40 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ +template<TARGETING::TYPE T> +uint32_t VcmEvent<T>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::cleanup] " + + uint32_t o_rc = SUCCESS; + + do + { + // If there is a symbol mark on the same DRAM as the newly verified chip + // mark, remove the symbol mark. + o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // Set the entire chip in DRAM Repairs VPD. + // TODO: RTC 169939 + + // Add a DRAM sparing procedure to the queue, if supported. + // TODO: RTC 157888 + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + // Avoid linker errors with the template. template class VcmEvent<TYPE_MCA>; template class VcmEvent<TYPE_MBA>; diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C index 835a54cfe..bce497fcd 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C @@ -38,6 +38,29 @@ using namespace PlatServices; //############################################################################## // +// Helper functions +// +//############################################################################## + +template<TARGETING::TYPE T> +VcmFalseAlarm * __getFalseAlarmCounter( ExtensibleChip * i_chip ); + +template<> +VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip ) +{ + return getMcaDataBundle(i_chip)->getVcmFalseAlarmCounter(); +} + +template<> +VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip ) +{ + // TODO: RTC 157888 + //return getMbaDataBundle(i_chip)->getVcmFalseAlarmCounter(); + return nullptr; +} + +//############################################################################## +// // Generic template functions // //############################################################################## @@ -65,10 +88,10 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) } // Increment the false alarm counter and check threshold. - if ( cv_falseAlarm.inc(iv_chip, getKey(), io_sc) ) + uint8_t dram = iv_mark.getSymbol().getDram(); + if ( __getFalseAlarmCounter<T>(iv_chip)->inc(iv_rank, dram, io_sc) ) { - // False alarm threshold has been reached. Leave the mark in place - // and treat the chip mark as verified. + // False alarm threshold has been reached. io_sc.service_data->setSignature( iv_chip->getHuid(), PRDFSIG_VcmFalseAlarmTH ); @@ -76,10 +99,11 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) PRDF_TRAC( PRDF_FUNC "False alarm threshold: 0x%08x,0x%02x", iv_chip->getHuid(), getKey() ); - o_rc = verified( io_sc ); + // Leave the chip mark in place and do any necessary cleanup. + o_rc = cleanup( io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "verified() failed" ); + PRDF_ERR( PRDF_FUNC "cleanup() failed" ); break; } } @@ -104,6 +128,45 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc ) //------------------------------------------------------------------------------ +template<TARGETING::TYPE T> +uint32_t VcmEvent<T>::cleanup( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::cleanup] " + + uint32_t o_rc = SUCCESS; + + do + { + // If there is a symbol mark on the same DRAM as the newly verified chip + // mark, remove the symbol mark. + o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed", + iv_chip->getHuid(), getKey() ); + break; + } + + // Set the entire chip in DRAM Repairs VPD. + // TODO: RTC 169939 + + // Add a DRAM sparing procedure to the queue, if supported. + // TODO: RTC 157888 + + // If there was more than one DRAM on this rank with a false alarm, make + // the error log predictive. + if ( __getFalseAlarmCounter<T>(iv_chip)->queryDrams(iv_rank, io_sc) ) + io_sc.service_data->setServiceCall(); + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + // Avoid linker errors with the template. template class VcmEvent<TYPE_MCA>; template class VcmEvent<TYPE_MBA>; @@ -115,12 +178,6 @@ template class VcmEvent<TYPE_MBA>; //############################################################################## template<> -TdFalseAlarm VcmEvent<TYPE_MCA>::cv_falseAlarm - = TdFalseAlarm { 4, ThresholdResolution::ONE_DAY }; - -//------------------------------------------------------------------------------ - -template<> uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ) @@ -203,12 +260,6 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns, //############################################################################## template<> -TdFalseAlarm VcmEvent<TYPE_MBA>::cv_falseAlarm - = TdFalseAlarm { 4, 7 * ThresholdResolution::ONE_DAY }; - -//------------------------------------------------------------------------------ - -template<> uint32_t VcmEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns, STEP_CODE_DATA_STRUCT & io_sc, bool & o_done ) |