summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorZane Shelley <zshelle@us.ibm.com>2018-06-02 17:28:45 -0500
committerZane C. Shelley <zshelle@us.ibm.com>2018-06-08 22:45:47 -0400
commit38666ab58f157b82c3dca2d782667cf071a75cb2 (patch)
treeb26c051744ee9abb117e7c58094ecc6d8ec4ea40
parente38d6b0d199b045cce44db69f8594eaaa0990a9c (diff)
downloadtalos-hostboot-38666ab58f157b82c3dca2d782667cf071a75cb2.tar.gz
talos-hostboot-38666ab58f157b82c3dca2d782667cf071a75cb2.zip
PRD: create MarkStore::applyRasPolicies()
Change-Id: Ifd08172b960b5c526a014076e79d5c45df54ee45 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59819 Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com> Reviewed-by: Matt Derksen <mderkse1@us.ibm.com> Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com> Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com> Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/60136 Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
-rw-r--r--src/usr/diag/prdf/common/plat/mem/prdfMemMark.C193
-rw-r--r--src/usr/diag/prdf/common/plat/mem/prdfMemMark.H49
-rw-r--r--src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C26
3 files changed, 180 insertions, 88 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
index 9a94e3fbb..64193ea66 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
@@ -391,7 +391,7 @@ template<>
uint32_t __clearFetchAttn<TYPE_MBA>( ExtensibleChip * i_chip,
const MemRank & i_rank )
{
- #define PRDF_FUNC "[__readMarks<TYPE_MBA>] "
+ #define PRDF_FUNC "[__clearFetchAttn<TYPE_MBA>] "
uint32_t o_rc = SUCCESS;
@@ -880,31 +880,43 @@ uint32_t writeSymbolMark<TYPE_MBA>( ExtensibleChip * i_chip,
#ifdef __HOSTBOOT_MODULE // Not supported on FSP.
+//------------------------------------------------------------------------------
+
+void __addCallout( ExtensibleChip * i_chip, const MemRank & i_rank,
+ const MemSymbol & i_symbol, STEP_CODE_DATA_STRUCT & io_sc )
+{
+ if ( i_symbol.isValid() )
+ {
+ MemoryMru mm { i_chip->getTrgt(), i_rank, i_symbol };
+ io_sc.service_data->SetCallout( mm );
+ }
+}
+
+//------------------------------------------------------------------------------
+
template<TARGETING::TYPE T>
uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc,
const MemMark & i_chipMark,
- const MemMark & i_symMark );
+ const MemMark & i_symMark,
+ TdEntry * & o_dsdEvent, bool & o_allRepairsUsed );
template<>
uint32_t __applyRasPolicies<TYPE_MCA>( ExtensibleChip * i_chip,
const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc,
const MemMark & i_chipMark,
- const MemMark & i_symMark )
+ const MemMark & i_symMark,
+ TdEntry * & o_dsdEvent,
+ bool & o_allRepairsUsed )
{
// There is no DRAM sparing on Nimbus so simply check if both the chip and
// symbol mark have been used.
if ( i_chipMark.isValid() && i_symMark.isValid() )
{
- io_sc.service_data->setServiceCall();
+ o_allRepairsUsed = true;
io_sc.service_data->setSignature( i_chip->getHuid(),
PRDFSIG_AllDramRepairs );
-
- #ifdef __HOSTBOOT_RUNTIME
- // No more repairs left so no point doing any more TPS procedures.
- MemDbUtils::banTps<TYPE_MCA>( i_chip, i_rank );
- #endif
}
return SUCCESS;
@@ -915,14 +927,14 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc,
const MemMark & i_chipMark,
- const MemMark & i_symMark )
+ const MemMark & i_symMark,
+ TdEntry * & o_dsdEvent,
+ bool & o_allRepairsUsed )
{
#define PRDF_FUNC "[__applyRasPolicies<TYPE_MBA>] "
uint32_t o_rc = SUCCESS;
- bool allRepairsUsed = false;
-
do
{
const uint8_t ps = i_chipMark.getSymbol().getPortSlct();
@@ -964,21 +976,9 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
*/
// Add the spares to the callout list if they exist.
- if ( sp0.isValid() )
- {
- MemoryMru mm { i_chip->getTrgt(), i_rank, sp0 };
- io_sc.service_data->SetCallout( mm );
- }
- if ( sp1.isValid() )
- {
- MemoryMru mm { i_chip->getTrgt(), i_rank, sp1 };
- io_sc.service_data->SetCallout( mm );
- }
- if ( ecc.isValid() )
- {
- MemoryMru mm { i_chip->getTrgt(), i_rank, ecc };
- io_sc.service_data->SetCallout( mm );
- }
+ __addCallout( i_chip, i_rank, sp0, io_sc );
+ __addCallout( i_chip, i_rank, sp1, io_sc );
+ __addCallout( i_chip, i_rank, ecc, io_sc );
// If the chip mark is on a spare then the spare is bad and hardware
// can not steer it to another DRAM even if one is available (e.g.
@@ -987,7 +987,7 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
( (1 == ps) && sp1.isValid() && (dram == sp1.getDram()) ) ||
( isX4 && ecc.isValid() && (dram == ecc.getDram()) ) )
{
- allRepairsUsed = true;
+ o_allRepairsUsed = true;
io_sc.service_data->setSignature( i_chip->getHuid(),
PRDFSIG_VcmBadSpare );
break; // Nothing more to do.
@@ -1011,21 +1011,19 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
(0 == ps ? !sp0.isValid() : !sp1.isValid()) )
{
// A spare DRAM is available.
- TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank,
- i_chipMark };
- MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e );
+ o_dsdEvent = new DsdEvent<TYPE_MBA>{ i_chip, i_rank,
+ i_chipMark };
}
else if ( eccSparePossible && !ecc.isValid() )
{
// The ECC spare is available.
- TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank,
- i_chipMark, true };
- MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e );
+ o_dsdEvent = new DsdEvent<TYPE_MBA>{ i_chip, i_rank,
+ i_chipMark, true };
}
else
{
// Chip mark is in place and sparing is not possible.
- allRepairsUsed = true;
+ o_allRepairsUsed = true;
io_sc.service_data->setSignature( i_chip->getHuid(),
PRDFSIG_AllDramRepairs );
}
@@ -1034,36 +1032,36 @@ uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
// mark have been used.
else if ( i_chipMark.isValid() && i_symMark.isValid() )
{
- allRepairsUsed = true;
+ o_allRepairsUsed = true;
io_sc.service_data->setSignature( i_chip->getHuid(),
PRDFSIG_AllDramRepairs );
}
} while (0);
- if ( allRepairsUsed )
- {
- io_sc.service_data->setServiceCall();
-
- #ifdef __HOSTBOOT_RUNTIME
- // No more repairs left so no point doing any more TPS procedures.
- MemDbUtils::banTps<TYPE_MBA>( i_chip, i_rank );
- #endif
- }
-
return o_rc;
#undef PRDF_FUNC
}
+//------------------------------------------------------------------------------
+
template<TARGETING::TYPE T>
-uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
- STEP_CODE_DATA_STRUCT & io_sc )
+uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ TdEntry * & o_dsdEvent )
{
- #define PRDF_FUNC "[chipMarkCleanup] "
+ #define PRDF_FUNC "[MarkStore::applyRasPolicies] "
+
+ PRDF_ASSERT( nullptr != i_chip );
+ PRDF_ASSERT( T == i_chip->getType() );
uint32_t o_rc = SUCCESS;
+ delete o_dsdEvent; o_dsdEvent = nullptr; // just in case
+
+ bool allRepairsUsed = false;
+
do
{
// Get the chip mark.
@@ -1080,8 +1078,7 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
if ( !chipMark.isValid() ) break;
// Add the chip mark to the callout list.
- MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() };
- io_sc.service_data->SetCallout( cm_mm );
+ __addCallout( i_chip, i_rank, chipMark.getSymbol(), io_sc );
// Get the symbol mark.
MemMark symMark;
@@ -1095,7 +1092,8 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
// If both the chip and symbol mark are on the same DRAM, clear the
// symbol mark.
- if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() )
+ if ( symMark.isValid() &&
+ chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() )
{
o_rc = clearSymbolMark<T>( i_chip, i_rank );
if ( SUCCESS != o_rc )
@@ -1110,11 +1108,7 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
}
// Add the symbol mark to the callout list if it exists.
- if ( symMark.isValid() )
- {
- MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() };
- io_sc.service_data->SetCallout( sm_mm );
- }
+ __addCallout( i_chip, i_rank, symMark.getSymbol(), io_sc );
// Make the error log predictive and exit if DRAM repairs are disabled.
if ( areDramRepairsDisabled() )
@@ -1123,25 +1117,96 @@ uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
break; // nothing else to do
}
- // Set the chip mark in the DRAM Repairs VPD.
- o_rc = setDramInVpd<T>( i_chip, i_rank, chipMark.getSymbol() );
+ // Apply type specific RAS policies.
+ o_rc = __applyRasPolicies<T>( i_chip, i_rank, io_sc, chipMark, symMark,
+ o_dsdEvent, allRepairsUsed );
+ if ( SUCCESS != o_rc ) break;
+
+ } while (0);
+
+ if ( allRepairsUsed )
+ {
+ io_sc.service_data->setServiceCall();
+
+ #ifdef __HOSTBOOT_RUNTIME
+ // No more repairs left so no point doing any more TPS procedures.
+ MemDbUtils::banTps<T>( i_chip, i_rank );
+ #endif
+ }
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+template
+uint32_t applyRasPolicies<TYPE_MCA>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ TdEntry * & o_dsdEvent );
+template
+uint32_t applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ TdEntry * & o_dsdEvent );
+
+//------------------------------------------------------------------------------
+
+template<TARGETING::TYPE T>
+uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[chipMarkCleanup] "
+
+ PRDF_ASSERT( nullptr != i_chip );
+ PRDF_ASSERT( T == i_chip->getType() );
+
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ // It is possible this function was called and there is no chip mark. So
+ // first check if one exists.
+ MemMark chipMark;
+ o_rc = readChipMark<T>( i_chip, i_rank, chipMark );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed",
+ PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,0x%02x) failed",
i_chip->getHuid(), i_rank.getKey() );
break;
}
- // Apply RAS policies.
- o_rc = __applyRasPolicies<T>( i_chip, i_rank, io_sc, chipMark,
- symMark );
+ // There is nothing else to do if there is no chip mark.
+ if ( !chipMark.isValid() ) break;
+
+ // Apply all RAS policies.
+ TdEntry * dsdEvent = nullptr;
+ o_rc = applyRasPolicies<T>( i_chip, i_rank, io_sc, dsdEvent );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "__applyRasPolicies(0x%08x,0x%02x) failed",
+ PRDF_ERR( PRDF_FUNC "applyRasPolicies(0x%08x,0x%02x) failed",
i_chip->getHuid(), i_rank.getKey() );
break;
}
+ // Add the DRAM spare event to the queue if needed.
+ if ( nullptr != dsdEvent )
+ {
+ MemDbUtils::pushToQueue<T>( i_chip, dsdEvent );
+ }
+
+ // Set the chip mark in the DRAM Repairs VPD.
+ if ( !areDramRepairsDisabled() )
+ {
+ o_rc = setDramInVpd<T>( i_chip, i_rank, chipMark.getSymbol() );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+ }
+
} while (0);
return o_rc;
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H
index fca039258..d0f8c57ef 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H
@@ -36,6 +36,10 @@
#include <prdfP9McaExtraSig.H>
#include <prdfPlatServices.H>
+#ifdef __HOSTBOOT_MODULE
+ #include <prdfMemTdQueue.H>
+#endif
+
//##############################################################################
// class MemMark
//##############################################################################
@@ -173,21 +177,44 @@ uint32_t writeSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank,
template<TARGETING::TYPE T>
uint32_t clearSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank );
+#ifdef __HOSTBOOT_MODULE // Not supported on FSP.
+
+/**
+ * @brief Applies RAS policies on a rank of memory based on the number
+ * of repairs available on that rank.
+ *
+ * Function details:
+ * - Removes the symbol mark if it is on the same DRAM as the chip mark.
+ * - Adds the following to the callout list if they exist: chip mark, symbol
+ * mark, DRAM spares, and ECC spare.
+ * - Makes the error log predictive and exits if DRAM repairs are disabled.
+ * - Makes the error log predictive and bans TPS on this rank if all available
+ * repairs have been used.
+ * - Returns a new DsdEvent if DRAM sparing is available.
+ *
+ * @param i_chip MBA or MCA chip.
+ * @param i_rank Target rank.
+ * @param io_sc The step code data struct.
+ * @param o_dsdEvent A new DsdEvent if DRAM sparing is available. Otherwise,
+ * nullptr. Note that this is not used in all cases so the
+ * event will need to be manually deleted if not added to the
+ * TD queue.
+ * @return Non-SUCCESS if an internal function fails. SUCCESS otherwise.
+ */
+template<TARGETING::TYPE T>
+uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ TdEntry * & o_dsdEvent );
+
/**
* @brief If a chip mark has been verified or explicitly set due to other RAS
* policies, this function does all the necessary cleanup.
*
* Function details:
- * - Adds the chip mark to the callout list.
- * - Removes the symbol mark if it is on the same DRAM as the chip mark.
- * - Adds the symbol mark to the callout list if it exists on another DRAM.
- * - If DRAM repairs are disabled:
- * - Makes the error log predictive.
- * - Otherwise:
- * - Sets the DRAM in the DRAM Repair VPD.
- * - Makes the error log predictive if RAS policies apply.
- * - Adds a DSD procedure to the TD queue is a DRAM spare is available.
- * - Bans TPS on the rank if all repairs are used.
+ * - Calls applyRasPolicies() to make any necessary callouts.
+ * - If DRAM repairs are not disabled:
+ * - Sets the DRAM in the DRAM Repair VPD.
+ * - Adds a DSD procedure to the TD queue if a DRAM spare is available.
*
* @param i_chip MBA or MCA chip.
* @param i_rank Target rank.
@@ -198,6 +225,8 @@ template<TARGETING::TYPE T>
uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
STEP_CODE_DATA_STRUCT & io_sc );
+#endif // Not supported on FSP.
+
} // end namespace MarkStore
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
index 29270ae34..dfd36e9be 100644
--- a/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
+++ b/src/usr/diag/prdf/plat/mem/prdfMemTps_rt.C
@@ -1520,25 +1520,23 @@ uint32_t TpsEvent<TYPE_MBA>::analyzeCeStats( STEP_CODE_DATA_STRUCT & io_sc )
// after the VCM procedure.
if ( chipMark.isValid() )
{
- /* TODO RTC 189221 DRAM sparing support
- bool available;
- o_rc = checkForAvailableSpares( iv_mark.getCM().getPortSlct(),
- available );
- if ( SUCCESS != o_rc )
+ TdEntry * dsdEvent = nullptr;
+ o_rc = MarkStore::applyRasPolicies<TYPE_MBA>( iv_chip, iv_rank,
+ io_sc, dsdEvent );
+ if ( nullptr != dsdEvent )
{
- PRDF_ERR( PRDF_FUNC "checkForAvailableSpares() failed" );
- break;
+ // We don't want to do the DRAM spare procedure at this time,
+ // because we haven't even run the VCM procedure yet. So just
+ // delete the procedure instead of adding it to the queue.
+ delete dsdEvent; dsdEvent = nullptr;
}
- if ( !available )
+ if ( SUCCESS != o_rc )
{
- // Spares have been used. Callout the mark. Make the error log
- // predictive.
- CalloutUtil::calloutMark( iv_mbaTrgt, iv_rank, iv_mark, io_sc );
- setTdSignature( io_sc, PRDFSIG_TpsCmAndSpare );
- io_sc.service_data->setServiceCall();
+ PRDF_ERR( PRDF_FUNC "applyRasPolicies(0x%08x, 0x%02x) failed.",
+ iv_chip->getHuid(), iv_rank.getKey() );
+ break;
}
- */
}
} while (0);
OpenPOWER on IntegriCloud