summary | refs | log | tree | commit | diff | stats
path: root/src/usr/diag/prdf/plat
diff options
context:
space:
mode:
author: Zane Shelley <zshelle@us.ibm.com> 2017-07-25 16:26:23 -0500
committer: Zane C. Shelley <zshelle@us.ibm.com> 2017-07-31 11:16:54 -0400
commit: fadc1f7542d63ef55f383cf922db86d4f5e48ffe (patch)
tree: ec739300a13675a4d25117c1d38e4519b3e645a3 /src/usr/diag/prdf/plat
parent: e7955db9ace86b83313545014a805c9678034839 (diff)
download: talos-hostboot-fadc1f7542d63ef55f383cf922db86d4f5e48ffe.tar.gz
download: talos-hostboot-fadc1f7542d63ef55f383cf922db86d4f5e48ffe.zip
PRD: consistent handling for IMPE and VCM
Change-Id: Ic30829c48e54448c9a4f828dc24afe0e4d4d6bf0
CQ: SW394364
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/43611
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/43798
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf/plat')
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H 219
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemVcm.H 47
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C 34
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C 85
4 files changed, 308 insertions, 77 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H b/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H
index fd38ff832..9b497f8be 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTdFalseAlarm.H
@@ -26,8 +26,6 @@
#ifndef __prdfMemTdFalseAlarm_H
#define __prdfMemTdFalseAlarm_H
-#ifdef __HOSTBOOT_RUNTIME // All of this code is runtime only
-
// Framework includes
#include <iipServiceDataCollector.h>
#include <prdfThresholdUtils.H>
@@ -38,58 +36,213 @@
namespace PRDF
{
-/**
- * @brief At runtime, we have to keep a false alarm threshold for Targeted
- * Diagnostics to avoid flooding of intermittent errors.
- *
- * This class is intented to be a static class variable for each TD event class
- * that requires this type of thresholding. It will contain a map for each chip
- * and unique key within each chip to the threshold container. Note that the key
- * could be different per TD event class. For example, VCM events will use only
- * the master rank, where TPS events will use both the master and slave rank.
- */
+/** @brief This is used at runtime to keep track of false alarms for Targeted
+ * Diagnostics to avoid flooding of intermittent errors. */
class TdFalseAlarm
{
- public:
+ public: // functions
+
+ /** @brief Default destructor */
+ virtual ~TdFalseAlarm() = default;
+
+ /**
+ * @brief Queries if the threshold timer has expired.
+ * @param i_rank The rank with the false alarm.
+ * @param io_sc The step code data struct.
+ * @return True if the threshold timer has expired, false otherwise.
+ */
+ bool elapsed( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc )
+ {
+ // Not init(): a child's init() may call elapsed(), which would recurse.
+ return iv_thMap[ initThMap(i_rank) ].timeElapsed( io_sc );
+ }
+
+ /**
+ * @brief Queries the current false alarm count.
+ * @param i_rank The rank with the false alarm.
+ * @param io_sc The step code data struct.
+ * @return The current false alarm count.
+ */
+ uint8_t count( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc )
+ {
+ uint32_t key = init( i_rank, io_sc );
+ return iv_thMap[key].getCount();
+ }
+
+ protected: // functions
/**
* @brief Constructor.
- * @param i_th Threshold value for all entries in the map.
- * @param i_int Threshold interval for all entries in the map.
+ * @param i_th Threshold for each entry in the threshold map.
*/
- TdFalseAlarm( uint8_t i_th, uint32_t i_int ) :
- iv_thVal(i_th), iv_thInt(i_int)
- {}
+ explicit TdFalseAlarm( TimeBasedThreshold i_th ) : iv_th(i_th) {}
+
+ /**
+ * @brief The key could be different per TD procedure. For example, VCM
+ * events will use only the master rank, where TPS events will use
+ * both the master and slave rank.
+ * @param i_rank The rank with the false alarm.
+ * @return The key value for this rank.
+ */
+ virtual uint32_t getKey( MemRank i_rank ) const = 0;
+
+ /**
+ * @brief Initializes data specific to each child class.
+ * @param i_rank The rank with the false alarm.
+ * @param io_sc The step code data struct.
+ * @return The key for this rank (see getKey()).
+ * @note This function should be called before accessing the instance
+ * variables to ensure this key is initialized with the correct
+ * threshold.
+ * @note The default is to call initThMap(). Each child class that
+ * overrides this function should also call initThMap() to ensure
+ * the threshold map is initialized properly.
+ */
+ virtual uint32_t init( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc )
+ {
+ return initThMap( i_rank );
+ }
+
+ /**
+ * @brief Initializes this key in the threshold map.
+ * @param i_rank The rank with the false alarm.
+ * @return The key for this rank (see getKey()).
+ * @note Never calls init(), so it is safe to use from init() and elapsed().
+ */
+ uint32_t initThMap( MemRank i_rank )
+ {
+ uint32_t key = getKey( i_rank );
+
+ // Create a new entry if an entry does not exist.
+ if ( iv_thMap.end() == iv_thMap.find(key) ) iv_thMap[key] = iv_th;
+
+ return key;
+ }
/**
* @brief Increments the false alarm count.
- * @param i_chip Target chip.
- * @param i_key Key relative to the chip.
+ * @param i_rank The rank with the false alarm.
* @param io_sc The step code data struct.
* @return True if false alarm count has reached threshold, false otherwise.
*/
- bool inc( ExtensibleChip * i_chip, uint32_t i_key,
- STEP_CODE_DATA_STRUCT & io_sc )
+ bool incThMap( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc )
{
- // Create a new entry if an entry does not exist.
- if ( iv_map[i_chip].end() == iv_map[i_chip].find(i_key) )
- iv_map[i_chip][i_key] = TimeBasedThreshold( iv_thVal, iv_thInt );
+ uint32_t key = init( i_rank, io_sc );
+ return iv_thMap[key].inc( io_sc );
+ }
+
+ private: // instance variables
+
+ /** Initial threshold that is copied into each new entry in the map. */
+ const TimeBasedThreshold iv_th;
- return iv_map[i_chip][i_key].inc( io_sc );
+ /** A map containing the thresholds for each key. */
+ std::map<uint32_t, TimeBasedThreshold> iv_thMap;
+};
+
+/** @brief A false alarm class specific to VCM procedures. */
+class VcmFalseAlarm : public TdFalseAlarm
+{
+ public:
+
+ /**
+ * @brief Constructor.
+ * @param i_th Threshold for each entry in the map.
+ */
+ VcmFalseAlarm( TimeBasedThreshold i_th ) : TdFalseAlarm(i_th) {}
+
+ /**
+ * @brief Increments the false alarm count and stores the DRAM.
+ * @param i_rank The rank with the false alarm.
+ * @param i_dram The DRAM with the false alarm.
+ * @param io_sc The step code data struct.
+ * @return True if false alarm count has reached threshold, false otherwise.
+ */
+ bool inc( MemRank i_rank, uint8_t i_dram, STEP_CODE_DATA_STRUCT & io_sc )
+ {
+ // Increment the count and determine whether threshold is reached or
+ // not. Note that incThMap() calls init() and initializes all maps.
+ bool isTh = incThMap( i_rank, io_sc );
+
+ // Add the DRAM to the list. Note that the value for each DRAM is not
+ // important. The only reason to use a map versus a vector is to ensure
+ // unique entries in the list.
+ iv_dramMap[getKey(i_rank)][i_dram] = 1;
+
+ return isTh;
+ }
+
+ /** @brief Queries whether several DRAMs on a rank reported false alarms.
+ * @param i_rank The rank with the false alarm.
+ * @param io_sc The step code data struct.
+ * @return True if there is more than one DRAM on this rank and the timer
+ * has not elapsed, false otherwise.
+ */
+ bool queryDrams( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc )
+ {
+ uint32_t key = init( i_rank, io_sc ); // will clear list if time elapsed
+
+ // Return true if there is more than one DRAM on this rank.
+ return ( 1 < iv_dramMap[key].size() );
+ }
+
+ private: // functions
+
+ // Overrides TdFalseAlarm::getKey(): the key is built from the master rank.
+ uint32_t getKey( MemRank i_rank ) const override
+ {
+ return MemRank(i_rank.getMaster()).getKey(); // master only
+ }
+
+ // Overrides TdFalseAlarm::init(): also maintains the per-rank DRAM list.
+ uint32_t init( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc ) override
+ {
+ uint32_t key = initThMap( i_rank );
+
+ // Clear out the list of DRAMs if the threshold time has elapsed.
+ if ( elapsed(i_rank, io_sc) ) iv_dramMap[key].clear();
+
+ return key;
}
- private:
+ private: // instance variables
+
+ /** A map to keep track of which DRAMs per rank have reported chip marks. */
+ std::map< uint32_t, std::map<uint8_t, uint8_t> > iv_dramMap;
+};
+
+/** @brief A false alarm class specific to TPS procedures. */
+class TpsFalseAlarm : public TdFalseAlarm
+{
+ public:
+
+ /**
+ * @brief Constructor.
+ * @param i_th Threshold for each entry in the map.
+ */
+ TpsFalseAlarm( TimeBasedThreshold i_th ) : TdFalseAlarm(i_th) {}
+
+ /**
+ * @brief Increments the false alarm count.
+ * @param i_rank The rank with the false alarm.
+ * @param io_sc The step code data struct.
+ * @return True if false alarm count has reached threshold, false otherwise.
+ */
+ bool inc( MemRank i_rank, STEP_CODE_DATA_STRUCT & io_sc )
+ {
+ return incThMap( i_rank, io_sc );
+ }
- const uint8_t iv_thVal; ///< Threshold value for all entries in the map.
- const uint32_t iv_thInt; ///< Threshold interval for all entries in the map.
+ private: // functions
- /** A nested map containing the thresholds for each chip and key. */
- std::map< ExtensibleChip *, std::map<uint32_t,TimeBasedThreshold> > iv_map;
+ // Overrides TdFalseAlarm::getKey(): the key is the rank's own key.
+ uint32_t getKey( MemRank i_rank ) const override
+ {
+ return i_rank.getKey(); // both master and slave
+ }
};
} // end namespace PRDF
-#endif // __HOSTBOOT_RUNTIME
-
#endif // __prdfMemTdFalseAlarm_H
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H
index 9c39040cd..c42598935 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.H
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.H
@@ -260,32 +260,20 @@ class VcmEvent : public TdEntry
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_VcmVerified );
- do
+ if ( PlatServices::areDramRepairsDisabled() )
{
- // If DRAM repairs are disabled, make the error log predictive.
- if ( PlatServices::areDramRepairsDisabled() )
- {
- io_sc.service_data->setServiceCall();
- break; // Nothing more to do.
- }
-
- // If there is a symbol mark on the same DRAM as the newly verified
- // chip mark, remove the symbol mark.
- o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc );
+ // Make the error log predictive, nothing else to do.
+ io_sc.service_data->setServiceCall();
+ }
+ else
+ {
+ // Leave the chip mark in place and do any necessary cleanup.
+ o_rc = cleanup( io_sc );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed",
- iv_chip->getHuid(), getKey() );
- break;
+ PRDF_ERR( PRDF_FUNC "cleanup() failed" );
}
-
- // Set the entire chip in DRAM Repairs VPD.
- // TODO: RTC 169939
-
- // Add a DRAM sparing procedure to the queue, if supported.
- // TODO: RTC 157888
-
- } while (0);
+ }
return o_rc;
@@ -293,6 +281,16 @@ class VcmEvent : public TdEntry
}
/**
+ * @brief Cleanup required when a chip mark is left in place (i.e. chip
+ * mark verified or false alarm threshold). Will balance the chip
+ * and symbol marks, set VPD, and initiate DRAM sparing if
+ * supported.
+ * @param io_sc The step code data struct.
+ * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise.
+ */
+ uint32_t cleanup( STEP_CODE_DATA_STRUCT & io_sc );
+
+ /**
* @brief Verification failed. Do additional processing such as removing
* the chip mark and false alarm threshold handling.
* @param io_sc The step code data struct.
@@ -303,11 +301,6 @@ class VcmEvent : public TdEntry
private: // instance variables
const MemMark iv_mark; ///< The chip mark from hardware.
-
- #ifdef __HOSTBOOT_RUNTIME
- /** False alarm counter for all instances of this class. */
- static TdFalseAlarm cv_falseAlarm;
- #endif
};
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C
index e331d736c..50cfec50f 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_ipl.C
@@ -81,6 +81,40 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
+template<TARGETING::TYPE T>
+uint32_t VcmEvent<T>::cleanup( STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[VcmEvent::cleanup] "
+
+ // Cleanup for a chip mark that is being left in place. Returns SUCCESS, or
+ // the non-SUCCESS code reported by MarkStore::balance().
+
+ // Step 1: if there is a symbol mark on the same DRAM as the chip mark,
+ // remove the symbol mark.
+ const uint32_t l_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc );
+
+ if ( SUCCESS == l_rc )
+ {
+ // Step 2: set the entire chip in DRAM Repairs VPD.
+ // TODO: RTC 169939
+
+ // Step 3: add a DRAM sparing procedure to the queue, if supported.
+ // TODO: RTC 157888
+ }
+ else
+ {
+ // Balancing failed, so none of the remaining steps are attempted.
+ PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed",
+ iv_chip->getHuid(), getKey() );
+ }
+
+ return l_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
// Avoid linker errors with the template.
template class VcmEvent<TYPE_MCA>;
template class VcmEvent<TYPE_MBA>;
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C
index 835a54cfe..bce497fcd 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm_rt.C
@@ -38,6 +38,29 @@ using namespace PlatServices;
//##############################################################################
//
+// Helper functions
+//
+//##############################################################################
+// Returns the VCM false alarm counter for a chip; the result may be null.
+template<TARGETING::TYPE T>
+VcmFalseAlarm * __getFalseAlarmCounter( ExtensibleChip * i_chip );
+// MCA: the counter is obtained through getMcaDataBundle() for this chip.
+template<>
+VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MCA>( ExtensibleChip * i_chip )
+{
+ return getMcaDataBundle(i_chip)->getVcmFalseAlarmCounter();
+}
+// MBA: not implemented yet (see TODO), so this always returns nullptr.
+template<>
+VcmFalseAlarm * __getFalseAlarmCounter<TYPE_MBA>( ExtensibleChip * i_chip )
+{
+ // TODO: RTC 157888
+ //return getMbaDataBundle(i_chip)->getVcmFalseAlarmCounter();
+ return nullptr;
+}
+
+//##############################################################################
+//
// Generic template functions
//
//##############################################################################
@@ -65,10 +88,10 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
}
// Increment the false alarm counter and check threshold.
- if ( cv_falseAlarm.inc(iv_chip, getKey(), io_sc) )
+ uint8_t dram = iv_mark.getSymbol().getDram();
+ if ( __getFalseAlarmCounter<T>(iv_chip)->inc(iv_rank, dram, io_sc) )
{
- // False alarm threshold has been reached. Leave the mark in place
- // and treat the chip mark as verified.
+ // False alarm threshold has been reached.
io_sc.service_data->setSignature( iv_chip->getHuid(),
PRDFSIG_VcmFalseAlarmTH );
@@ -76,10 +99,11 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
PRDF_TRAC( PRDF_FUNC "False alarm threshold: 0x%08x,0x%02x",
iv_chip->getHuid(), getKey() );
- o_rc = verified( io_sc );
+ // Leave the chip mark in place and do any necessary cleanup.
+ o_rc = cleanup( io_sc );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "verified() failed" );
+ PRDF_ERR( PRDF_FUNC "cleanup() failed" );
break;
}
}
@@ -104,6 +128,45 @@ uint32_t VcmEvent<T>::falseAlarm( STEP_CODE_DATA_STRUCT & io_sc )
//------------------------------------------------------------------------------
+template<TARGETING::TYPE T>
+uint32_t VcmEvent<T>::cleanup( STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[VcmEvent::cleanup] "
+ // Runtime cleanup for a chip mark left in place; o_rc is balance()'s result.
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ // If there is a symbol mark on the same DRAM as the chip mark that is
+ // being left in place, remove the symbol mark.
+ o_rc = MarkStore::balance<T>( iv_chip, iv_rank, io_sc );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MarkStore::balance(0x%08x,0x%02x) failed",
+ iv_chip->getHuid(), getKey() );
+ break;
+ }
+
+ // Set the entire chip in DRAM Repairs VPD.
+ // TODO: RTC 169939
+
+ // Add a DRAM sparing procedure to the queue, if supported.
+ // TODO: RTC 157888
+
+ // If there was more than one DRAM on this rank with a false alarm, make
+ // the error log predictive. The counter is null for MBA: check it first.
+ VcmFalseAlarm * fa = __getFalseAlarmCounter<T>(iv_chip);
+ if ( nullptr != fa && fa->queryDrams(iv_rank, io_sc) )
+ io_sc.service_data->setServiceCall();
+ } while (0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
// Avoid linker errors with the template.
template class VcmEvent<TYPE_MCA>;
template class VcmEvent<TYPE_MBA>;
@@ -115,12 +178,6 @@ template class VcmEvent<TYPE_MBA>;
//##############################################################################
template<>
-TdFalseAlarm VcmEvent<TYPE_MCA>::cv_falseAlarm
- = TdFalseAlarm { 4, ThresholdResolution::ONE_DAY };
-
-//------------------------------------------------------------------------------
-
-template<>
uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
STEP_CODE_DATA_STRUCT & io_sc,
bool & o_done )
@@ -203,12 +260,6 @@ uint32_t VcmEvent<TYPE_MCA>::checkEcc( const uint32_t & i_eccAttns,
//##############################################################################
template<>
-TdFalseAlarm VcmEvent<TYPE_MBA>::cv_falseAlarm
- = TdFalseAlarm { 4, 7 * ThresholdResolution::ONE_DAY };
-
-//------------------------------------------------------------------------------
-
-template<>
uint32_t VcmEvent<TYPE_MBA>::checkEcc( const uint32_t & i_eccAttns,
STEP_CODE_DATA_STRUCT & io_sc,
bool & o_done )
OpenPOWER on IntegriCloud