summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf/common/plat/mem
diff options
context:
space:
mode:
Diffstat (limited to 'src/usr/diag/prdf/common/plat/mem')
-rw-r--r--src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C17
-rw-r--r--src/usr/diag/prdf/common/plat/mem/prdfMemMark.C287
-rw-r--r--src/usr/diag/prdf/common/plat/mem/prdfMemMark.H67
3 files changed, 306 insertions, 65 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
index 467441c5a..bf0508d70 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C
@@ -1111,25 +1111,14 @@ uint32_t analyzeImpe<TYPE_MCA>( ExtensibleChip * i_chip,
break;
}
- o_rc = MarkStore::balance<TYPE_MCA>( i_chip, rank, io_sc );
+ o_rc = MarkStore::chipMarkCleanup<TYPE_MCA>( i_chip, rank,
+ io_sc );
if ( SUCCESS != o_rc )
{
- PRDF_ERR( PRDF_FUNC "balance(0x%08x,0x%02x) failed",
+ PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed",
i_chip->getHuid(), rank.getKey() );
break;
}
-
- // Set the dram in DRAM Repairs VPD.
- o_rc = setDramInVpd<TYPE_MCA>( i_chip, rank, symbol );
- if ( SUCCESS != o_rc )
- {
- PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed",
- i_chip->getHuid(), rank.getKey() );
- break;
- }
-
- // Add a DRAM sparing procedure to the queue, if supported.
- // TODO: RTC 157888
}
}
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
index 84e7c09b5..35f7803e7 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C
@@ -30,6 +30,8 @@
#include <prdfMemDbUtils.H>
#ifdef __HOSTBOOT_MODULE
+#include <prdfCenMbaExtraSig.H>
+#include <prdfMemDsd.H>
#include <prdfMemVcm.H>
#endif
@@ -872,7 +874,290 @@ uint32_t writeSymbolMark<TYPE_MBA>( ExtensibleChip * i_chip,
#undef PRDF_FUNC
}
-//------------------------------------------------------------------------------
+//##############################################################################
+// Utilities to cleanup markstore after a chip mark is verified
+//##############################################################################
+
+#ifdef __HOSTBOOT_MODULE // Not supported on FSP.
+
+template<TARGETING::TYPE T>
+uint32_t __applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ const MemMark & i_chipMark,
+ const MemMark & i_symMark );
+
+template<>
+uint32_t __applyRasPolicies<TYPE_MCA>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ const MemMark & i_chipMark,
+ const MemMark & i_symMark )
+{
+ // There is no DRAM sparing on Nimbus so simply check if both the chip and
+ // symbol mark have been used.
+ if ( i_chipMark.isValid() && i_symMark.isValid() )
+ {
+ io_sc.service_data->setServiceCall();
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_AllDramRepairs );
+
+ #ifdef __HOSTBOOT_RUNTIME
+ // No more repairs left so no point doing any more TPS procedures.
+ MemDbUtils::banTps<TYPE_MCA>( i_chip, i_rank );
+ #endif
+ }
+
+ return SUCCESS;
+}
+
+template<>
+uint32_t __applyRasPolicies<TYPE_MBA>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc,
+ const MemMark & i_chipMark,
+ const MemMark & i_symMark )
+{
+ #define PRDF_FUNC "[__applyRasPolicies<TYPE_MBA>] "
+
+ uint32_t o_rc = SUCCESS;
+
+ bool allRepairsUsed = false;
+
+ do
+ {
+ const uint8_t ps = i_chipMark.getSymbol().getPortSlct();
+ const uint8_t dram = i_chipMark.getSymbol().getDram();
+
+ const bool isX4 = isDramWidthX4( i_chip->getTrgt() );
+
+ // Determine if DRAM sparing is enabled.
+ bool isEnabled = isX4; // Always an ECC spare in x4 mode.
+
+ if ( !isEnabled )
+ {
+ /* TODO RTC 189221
+ // Check for any DRAM spares.
+ uint8_t cnfg = ENUM_ATTR_VPD_DIMM_SPARE_NO_SPARE;
+ o_rc = getDimmSpareConfig<TYPE_MBA>( i_chip, i_rank, ps, cnfg );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "getDimmSpareConfig(0x%08x,0x%02x,%d) "
+ "failed", i_chip->getHuid(), i_rank.getKey(), ps );
+ break;
+ }
+ isEnabled = (ENUM_ATTR_VPD_DIMM_SPARE_NO_SPARE != cnfg);
+ */
+ }
+
+ if ( isEnabled )
+ {
+ // Sparing is enabled. Get the current spares in hardware.
+ MemSymbol sp0, sp1, ecc;
+ /* TODO RTC 189221
+ o_rc = mssGetSteerMux<TYPE_MBA>( i_chip, i_rank, sp0, sp1, ecc );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "mssGetSteerMux(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+ */
+
+ // Add the spares to the callout list if they exist.
+ if ( sp0.isValid() )
+ {
+ MemoryMru mm { i_chip->getTrgt(), i_rank, sp0 };
+ io_sc.service_data->SetCallout( mm );
+ }
+ if ( sp1.isValid() )
+ {
+ MemoryMru mm { i_chip->getTrgt(), i_rank, sp1 };
+ io_sc.service_data->SetCallout( mm );
+ }
+ if ( ecc.isValid() )
+ {
+ MemoryMru mm { i_chip->getTrgt(), i_rank, ecc };
+ io_sc.service_data->SetCallout( mm );
+ }
+
+ // If the chip mark is on a spare then the spare is bad and hardware
+ // can not steer it to another DRAM even if one is available (e.g.
+ // the ECC spare). In this this case, make error log predictive.
+ if ( ( dram == (0 == ps ? sp0.getDram() : sp1.getDram()) ) ||
+ ( dram == ecc.getDram() ) )
+ {
+ allRepairsUsed = true;
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_VcmBadSpare );
+ break; // Nothing more to do.
+ }
+
+ // Certain DIMMs may have had spares intentially made unavailable by
+ // the manufacturer. Check the VPD for available spares.
+ bool dramSparePossible = false;
+ bool eccSparePossible = false;
+ /* TODO RTC 189221
+ o_rc = bitmap.isSpareAvailable( ps, dramSparePossible,
+ eccSparePossible );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "isDramSpareAvailable() failed" );
+ break;
+ }
+ */
+
+ if ( dramSparePossible &&
+ (0 == ps ? !sp0.isValid() : !sp1.isValid()) )
+ {
+ // A spare DRAM is available.
+ TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank,
+ i_chipMark };
+ MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e );
+ }
+ else if ( eccSparePossible && !ecc.isValid() )
+ {
+ // The ECC spare is available.
+ TdEntry * e = new DsdEvent<TYPE_MBA>{ i_chip, i_rank,
+ i_chipMark, true };
+ MemDbUtils::pushToQueue<TYPE_MBA>( i_chip, e );
+ }
+ else
+ {
+ // Chip mark is in place and sparing is not possible.
+ allRepairsUsed = true;
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_AllDramRepairs );
+ }
+ }
+ // There is no DRAM sparing so simply check if both the chip and symbol
+ // mark have been used.
+ else if ( i_chipMark.isValid() && i_symMark.isValid() )
+ {
+ allRepairsUsed = true;
+ io_sc.service_data->setSignature( i_chip->getHuid(),
+ PRDFSIG_AllDramRepairs );
+ }
+
+ } while (0);
+
+ if ( allRepairsUsed )
+ {
+ io_sc.service_data->setServiceCall();
+
+ #ifdef __HOSTBOOT_RUNTIME
+ // No more repairs left so no point doing any more TPS procedures.
+ MemDbUtils::banTps<TYPE_MCA>( i_chip, i_rank );
+ #endif
+ }
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+template<TARGETING::TYPE T>
+uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[chipMarkCleanup] "
+
+ uint32_t o_rc = SUCCESS;
+
+ do
+ {
+ // Get the chip mark.
+ MemMark chipMark;
+ o_rc = readChipMark<T>( i_chip, i_rank, chipMark );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+
+ // There is nothing else to do if there is no chip mark.
+ if ( !chipMark.isValid() ) break;
+
+ // Add the chip mark to the callout list.
+ MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() };
+ io_sc.service_data->SetCallout( cm_mm );
+
+ // Get the symbol mark.
+ MemMark symMark;
+ o_rc = readSymbolMark<T>( i_chip, i_rank, symMark );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "readSymbolMark(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+
+ // If both the chip and symbol mark are on the same DRAM, clear the
+ // symbol mark.
+ if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() )
+ {
+ o_rc = clearSymbolMark<T>( i_chip, i_rank );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "clearSymbolMark(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+
+ // Reset the symbol mark variable to invalid.
+ symMark = MemMark();
+ }
+
+ // Add the symbol mark to the callout list if it exists.
+ if ( symMark.isValid() )
+ {
+ MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() };
+ io_sc.service_data->SetCallout( sm_mm );
+ }
+
+ // Make the error log predictive and exit if DRAM repairs are disabled.
+ if ( areDramRepairsDisabled() )
+ {
+ io_sc.service_data->setServiceCall();
+ break; // nothing else to do
+ }
+
+ // Set the chip mark in the DRAM Repairs VPD.
+ o_rc = setDramInVpd<TYPE_MCA>( i_chip, i_rank, chipMark.getSymbol() );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "setDramInVpd(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+
+ // Apply RAS policies.
+ o_rc = __applyRasPolicies<T>( i_chip, i_rank, io_sc, chipMark,
+ symMark );
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "__applyRasPolicies(0x%08x,0x%02x) failed",
+ i_chip->getHuid(), i_rank.getKey() );
+ break;
+ }
+
+ } while (0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+template
+uint32_t chipMarkCleanup<TYPE_MCA>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc );
+template
+uint32_t chipMarkCleanup<TYPE_MBA>( ExtensibleChip * i_chip,
+ const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc );
+
+#endif // not supported on FSP
} // end namespace MarkStore
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H
index 8ea692ea9..fca039258 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.H
@@ -174,62 +174,29 @@ template<TARGETING::TYPE T>
uint32_t clearSymbolMark( ExtensibleChip * i_chip, const MemRank & i_rank );
/**
- * @brief If a rank contains a symbol mark that is on the same DRAM as the chip
- * mark, the symbol mark is removed. This is done to free up available
- * repairs. Will also apply RAS policies where necessary.
+ * @brief If a chip mark has been verified or explicitly set due other RAS
+ * policies, this function does all the necessary cleanup.
+ *
+ * Function details:
+ * - Adds the chip mark to the callout list.
+ * - Removes the symbol mark if it is on the same DRAM as the chip mark.
+ * - Adds the symbol mark to the callout list if it exists on another DRAM.
+ * - If DRAM repairs are disabled:
+ * - Makes the error log predictive.
+ * - Otherwise:
+ * - Sets the DRAM in the DRAM Repair VPD.
+ * - Makes the error log predictive if RAS policies apply.
+ * - Adds a DSD procedure to the TD queue is a DRAM spare is available.
+ * - Bans TPS on the rank if all repairs are used.
+ *
* @param i_chip MBA or MCA chip.
* @param i_rank Target rank.
* @param io_sc The step code data struct.
* @return Non-SUCCESS if an internal function fails. SUCCESS otherwise.
*/
template<TARGETING::TYPE T>
-uint32_t balance( ExtensibleChip * i_chip, const MemRank & i_rank,
- STEP_CODE_DATA_STRUCT & io_sc )
-{
- uint32_t o_rc = SUCCESS;
-
- do
- {
- // Get the chip mark.
- MemMark chipMark;
- o_rc = readChipMark<T>( i_chip, i_rank, chipMark );
- if ( SUCCESS != o_rc ) break;
- if ( !chipMark.isValid() ) break; // nothing to do.
-
- // Get the symbol mark.
- MemMark symMark;
- o_rc = readSymbolMark<T>( i_chip, i_rank, symMark );
- if ( SUCCESS != o_rc ) break;
- if ( !symMark.isValid() ) break; // nothing to do.
-
- // If both the chip and symbol mark are on the same DRAM, clear the
- // symbol mark.
- if ( chipMark.getSymbol().getDram() == symMark.getSymbol().getDram() )
- {
- o_rc = clearSymbolMark<T>( i_chip, i_rank );
- if ( SUCCESS != o_rc ) break;
- }
- else
- {
- // Both a chip and symbol mark exist, but they are on separate
- // DRAMs. So, make the error log predictive.
- io_sc.service_data->setServiceCall();
- io_sc.service_data->setSignature( i_chip->getHuid(),
- PRDFSIG_AllDramRepairs );
-
- // The chip and symbol mark may be on different DIMMs (Centaur ranks
- // span two DIMMs). Therefore, we must add both to the callout list
- // to ensure all DIMMs are in the callout list.
- MemoryMru cm_mm { i_chip->getTrgt(), i_rank, chipMark.getSymbol() };
- MemoryMru sm_mm { i_chip->getTrgt(), i_rank, symMark.getSymbol() };
- io_sc.service_data->SetCallout( cm_mm );
- io_sc.service_data->SetCallout( sm_mm );
- }
-
- } while (0);
-
- return o_rc;
-}
+uint32_t chipMarkCleanup( ExtensibleChip * i_chip, const MemRank & i_rank,
+ STEP_CODE_DATA_STRUCT & io_sc );
} // end namespace MarkStore
OpenPOWER on IntegriCloud