summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorCaleb Palmer <cnpalmer@us.ibm.com>2018-09-27 09:46:55 -0500
committerZane C. Shelley <zshelle@us.ibm.com>2018-10-08 20:19:14 -0500
commit31b6cf0ac237a0365bcf225fc0e8ae20c7012b87 (patch)
tree1b8aa50be424c893efe83366268d72fc10a2ecf3 /src
parentd02cb05f827a5204bd1d5bde793aac6d8952496d (diff)
downloadtalos-hostboot-31b6cf0ac237a0365bcf225fc0e8ae20c7012b87.tar.gz
talos-hostboot-31b6cf0ac237a0365bcf225fc0e8ae20c7012b87.zip
PRD: Fixes for MBS timeout cases
Change-Id: I4a5970ccaee60df83dc48503ede3655a34dd8b1c CQ: SW444990 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/66726 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/67098 Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r--src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule10
-rw-r--r--src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C102
-rwxr-xr-xsrc/usr/diag/prdf/common/plat/mem/prdfMemUtils.C64
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_dmi.rule2
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule6
-rw-r--r--src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C57
6 files changed, 235 insertions, 6 deletions
diff --git a/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule b/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
index f47d463ed..f013614e7 100644
--- a/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
+++ b/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
@@ -132,11 +132,19 @@ actionclass replay_timeout_UERE
funccall("ClearMbsSecondaryBits");
};
+/** Applies a threshold of 32 per day, then calls the mbsInternalTimeoutPrecheck
+ * plugin. That plugin ignores the attention when both MBSFIR[3:4] are on,
+ * else calls out self. */
+actionclass mbs_internal_timeout_precheck
+{
+ threshold32pday;
+ funccall("mbsInternalTimeoutPrecheck");
+};
+
/** Handles RCD parity errors, if present. Otherwise, calls out self (TH 1). */
actionclass mbs_internal_timeout
{
try ( funccall("analyzeMbaRcdParityError0"),
- try ( funccall("analyzeMbaRcdParityError1"), self_th_1 ) );
+ try ( funccall("analyzeMbaRcdParityError1"),
+ mbs_internal_timeout_precheck ) );
};
/** Handles RCD parity errors, if present. Otherwise, calls out lvl 2 (TH 1). */
diff --git a/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C b/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
index 72cf24527..41a7fe017 100644
--- a/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
+++ b/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
@@ -240,6 +240,108 @@ PLUGIN_RCD_PARITY_UE_SIDEEFFECTS( 1 )
#undef PLUGIN_RCD_PARITY_UE_SIDEEFFECTS
+//------------------------------------------------------------------------------
+
+/**
+ * @brief Plugin for the MBSFIR[3:4] internal timeout attentions. If both bits
+ * are on at the same time, the attention is ignored: outside of a
+ * system checkstop the bits are cleared and, at threshold (32 per day),
+ * masked. Otherwise, this is treated as a legitimate internal timeout:
+ * the Centaur is called out and the service call flag is set.
+ * @param i_mbChip MEMBUF chip.
+ * @param io_sc Step code data struct
+ * @return PRD_SCAN_COMM_REGISTER_ZERO if both MBSFIR[3] and MBSFIR[4] are on
+ * during a system checkstop. The non-SUCCESS return code of the
+ * failing register access if the MBSFIR read or the MBSFIR_AND /
+ * MBSFIR_MASK_OR write fails. SUCCESS in all other cases, including
+ * the legitimate internal timeout.
+ */
+int32_t mbsInternalTimeoutPrecheck( ExtensibleChip * i_mbChip,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[mbsInternalTimeoutPrecheck] "
+
+ int32_t o_rc = SUCCESS;
+
+ do
+ {
+ // Read MBSFIR to see which of the internal timeout bits are on.
+ SCAN_COMM_REGISTER_CLASS * mbsFir = i_mbChip->getRegister("MBSFIR");
+
+ o_rc = mbsFir->Read();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x",
+ i_mbChip->getHuid() );
+ break;
+ }
+
+ if ( mbsFir->IsBitSet(3) && mbsFir->IsBitSet(4) )
+ {
+ // We are going to ignore this attention. If there is a system
+ // checkstop, we will have to return a DD02 so that the rule code
+ // can try to find something else as the root cause. Otherwise,
+ // apply the "threshold and mask" policy.
+
+ if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() )
+ {
+ o_rc = PRD_SCAN_COMM_REGISTER_ZERO;
+ }
+ else
+ {
+ // Add Centaur callout just in case.
+ io_sc.service_data->SetCallout( i_mbChip->getTrgt() );
+
+ // Clear MBSFIR[3:4] (AND value: all ones except the bits to clear).
+ SCAN_COMM_REGISTER_CLASS * mbsFirAnd =
+ i_mbChip->getRegister("MBSFIR_AND");
+
+ mbsFirAnd->setAllBits();
+ mbsFirAnd->ClearBit(3);
+ mbsFirAnd->ClearBit(4);
+
+ o_rc = mbsFirAnd->Write();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MBSFIR_AND write failed for 0x%08x",
+ i_mbChip->getHuid() );
+ break;
+ }
+
+ if ( io_sc.service_data->IsAtThreshold() )
+ {
+ // Prevent a predictive error, if needed.
+ if ( !mfgMode() && !io_sc.service_data->isMemChnlFail() )
+ {
+ io_sc.service_data->clearServiceCall();
+ }
+
+ // Mask MBSFIR[3:4]. NOTE(review): unlike MBSFIR_AND above, no clearAllBits() precedes SetBit() - confirm the cached value starts at zero.
+ SCAN_COMM_REGISTER_CLASS * mbsFirMaskOr =
+ i_mbChip->getRegister("MBSFIR_MASK_OR");
+
+ mbsFirMaskOr->SetBit(3);
+ mbsFirMaskOr->SetBit(4);
+
+ o_rc = mbsFirMaskOr->Write();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MBSFIR_MASK_OR write failed for "
+ "0x%08x", i_mbChip->getHuid() );
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ // Legitimate internal timeout, make the error log predictive.
+ io_sc.service_data->SetCallout( i_mbChip->getTrgt() );
+ io_sc.service_data->setServiceCall();
+ }
+
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+PRDF_PLUGIN_DEFINE( cen_centaur, mbsInternalTimeoutPrecheck );
+
//##############################################################################
//
// MBSECCFIRs
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
index d2d748e30..cfd5db927 100755
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
@@ -619,6 +619,8 @@ bool __queryUcsCentaur<TYPE_MEMBUF>( ExtensibleChip * i_chip )
//------------------------------------------------------------------------------
+// This excludes CHIFIR[16,19:21] to avoid a loop in isolation. Also excludes
+// CHIFIR[61] due to a hardware workaround, see __queryUcsChifir_61().
template<TARGETING::TYPE T>
bool __queryUcsChifir( ExtensibleChip * i_chip );
@@ -639,11 +641,13 @@ bool __queryUcsChifir<TYPE_DMI>( ExtensibleChip * i_chip )
{
// Make sure to ignore CHIFIR[16,19:21], which simply say there is an
// attention on the Centaur. Otherwise, we will get stuck in a loop.
+ // CHIFIR[61] is also ignored as it needs to be looked for later
+ // for a special MBS timeout case.
if ( 0 != ( fir->GetBitFieldJustified( 0,64) &
~mask->GetBitFieldJustified(0,64) &
act0->GetBitFieldJustified(0,64) &
act1->GetBitFieldJustified(0,64) &
- 0xffff63ffffffffffull ) )
+ 0xffff63fffffffffbull ) )
{
o_activeAttn = true;
}
@@ -654,6 +658,46 @@ bool __queryUcsChifir<TYPE_DMI>( ExtensibleChip * i_chip )
//------------------------------------------------------------------------------
+// WORKAROUND:
+// This function only queries for CHIFIR[61]. There is a hardware workaround
+// that changes some behavior. CHIFIR[16] will no longer report channel failure
+// attentions from the Centaur. Also, any time there is a channel failure
+// attention from the Centaur, CHIFIR[61] will get set. In addition, CHIFIR[61]
+// can report an attention on its own, no need for Centaur attention. Therefore,
+// we must workaround the workaround and isolate to CHIFIR[61] only after
+// analyzing the Centaur.
+//
+// Returns true only if all four CHIFIR registers were read successfully and
+// CHIFIR[61] is set, not masked, and enabled in both CHIFIR_ACT0 and
+// CHIFIR_ACT1. Returns false otherwise, including when any read fails.
+template<TARGETING::TYPE T>
+bool __queryUcsChifir_61( ExtensibleChip * i_chip );
+
+template<>
+bool __queryUcsChifir_61<TYPE_DMI>( ExtensibleChip * i_chip )
+{
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_DMI == i_chip->getType() );
+
+    // Matches the function's bool return type.
+    bool o_activeAttn = false;
+
+    // Check if there is an active UCS attention on CHIFIR[61]
+    SCAN_COMM_REGISTER_CLASS * fir  = i_chip->getRegister( "CHIFIR" );
+    SCAN_COMM_REGISTER_CLASS * mask = i_chip->getRegister( "CHIFIR_MASK" );
+    SCAN_COMM_REGISTER_CLASS * act0 = i_chip->getRegister( "CHIFIR_ACT0" );
+    SCAN_COMM_REGISTER_CLASS * act1 = i_chip->getRegister( "CHIFIR_ACT1" );
+
+    // The bitwise OR is intentional: all four reads are always issued (no
+    // short-circuit). Assumes SUCCESS is zero, so any failing read makes the
+    // combined value non-SUCCESS -- TODO confirm.
+    if ( SUCCESS == (fir->Read() | mask->Read() | act0->Read() | act1->Read()) )
+    {
+        // These are boolean predicates, so combine them with logical AND.
+        if ( fir->IsBitSet(61)  && !mask->IsBitSet(61) &&
+             act0->IsBitSet(61) && act1->IsBitSet(61) )
+        {
+            o_activeAttn = true;
+        }
+    }
+
+    return o_activeAttn;
+}
+
+//------------------------------------------------------------------------------
+
template<TARGETING::TYPE T>
bool __queryUcsIomcfir( ExtensibleChip * i_chip );
@@ -756,8 +800,9 @@ bool __queryChnlFail<TYPE_DMI,TYPE_MEMBUF>( ExtensibleChip * i_dmiChip,
if ( !tmpChnlFail ) break; // nothing more to do.
// Check for an active attention on the CHIFIR or IOMCFIR.
- if ( __queryUcsChifir<TYPE_DMI>( i_dmiChip) ||
- __queryUcsIomcfir<TYPE_DMI>(i_dmiChip) )
+ if ( __queryUcsChifir <TYPE_DMI>(i_dmiChip) ||
+ __queryUcsChifir_61<TYPE_DMI>(i_dmiChip) ||
+ __queryUcsIomcfir <TYPE_DMI>(i_dmiChip) )
{
o_chnlFail = true;
break; // nothing more to do.
@@ -926,7 +971,8 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
o_analyzed = true; break; // analysis complete
}
- // Now, look for unit checkstops in the CHIFIR.
+ // Now, look for unit checkstops in the CHIFIR, excluding
+ // CHIFIR[16,19:21,61].
if ( __queryUcsChifir<TYPE_DMI>(dmiChip) )
{
// Analyze UNIT_CS on the DMI chip.
@@ -946,6 +992,16 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
}
}
+ // Now, look for unit checkstop from CHIFIR[61].
+ if ( __queryUcsChifir_61<TYPE_DMI>(dmiChip) )
+ {
+ // Analyze UNIT_CS on the DMI chip.
+ if ( SUCCESS == dmiChip->Analyze(io_sc, UNIT_CS) )
+ {
+ o_analyzed = true; break; // analysis complete
+ }
+ }
+
// Now, look for unit checkstops in the IOMCFIR.
if ( __queryUcsIomcfir<TYPE_DMI>(dmiChip) )
{
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule b/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
index 74ae29831..7961f2110 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
@@ -396,7 +396,7 @@ group gCHIFIR filter priority( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
/** CHIFIR[61]
* DSFF channel timeout
*/
- (rCHIFIR, bit(61)) ? self_th_1_UERE;
+ (rCHIFIR, bit(61)) ? dsffChannelTimeout_UERE;
/** CHIFIR[62]
* SCOM error
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule
index 7de1662ab..ee4735b38 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule
@@ -46,3 +46,9 @@ actionclass dmi_bus_th_1_UERE
dmi_bus_th_1;
};
+/** Applies a threshold of 1, then calls the dsffChannelTimeoutCheck plugin to
+ * add the callout. NOTE(review): SueSource presumably flags this attention
+ * as a possible SUE source - confirm. */
+actionclass dsffChannelTimeout_UERE
+{
+ SueSource;
+ threshold1;
+ funccall("dsffChannelTimeoutCheck");
+};
diff --git a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
index 03a0a913e..300ba6266 100644
--- a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
+++ b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
@@ -88,6 +88,63 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
}
PRDF_PLUGIN_DEFINE( p9_dmi, PostAnalysis );
+//##############################################################################
+//
+// CHIFIR
+//
+//##############################################################################
+/**
+ * @brief Checks if we have a legitimate CHIFIR[61] channel timeout or if it's
+ * a side effect of a MBSFIR[4] internal timeout, and adds the matching
+ * callout: the connected Centaur if MBSFIR[4] is set and MBSFIR[3] is
+ * not, the DMI otherwise.
+ * @param i_dmiChip DMI chip.
+ * @param io_sc Step code data struct
+ * @return SUCCESS once a callout has been added (in either case). The
+ * non-SUCCESS return code of the MBSFIR read if that read fails, in
+ * which case no callout is added.
+ * @note Asserts if no MEMBUF chip is connected to the DMI.
+ */
+int32_t dsffChannelTimeoutCheck( ExtensibleChip * i_dmiChip,
+ STEP_CODE_DATA_STRUCT & io_sc )
+{
+ #define PRDF_FUNC "[dsffChannelTimeoutCheck] "
+
+ int32_t o_rc = SUCCESS;
+
+ ExtensibleChip * membChip = getConnectedChild( i_dmiChip, TYPE_MEMBUF, 0 );
+ PRDF_ASSERT( nullptr != membChip );
+
+ do
+ {
+ // Read MBSFIR from the connected Centaur.
+ SCAN_COMM_REGISTER_CLASS * mbsFir = membChip->getRegister("MBSFIR");
+
+ o_rc = mbsFir->Read();
+ if ( SUCCESS != o_rc )
+ {
+ PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x",
+ membChip->getHuid() );
+ break;
+ }
+
+ // If MBSFIR[4] is set and MBSFIR[3] is not set
+ if( mbsFir->IsBitSet(4) && !mbsFir->IsBitSet(3) )
+ {
+ // MBSFIR[4] internal timeout, predictive centaur callout
+ io_sc.service_data->SetCallout( membChip->getTrgt() );
+ }
+ else
+ {
+ // CHIFIR[61] channel timeout, predictive DMI callout
+ io_sc.service_data->SetCallout( i_dmiChip->getTrgt() );
+ }
+
+ }while(0);
+
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+PRDF_PLUGIN_DEFINE( p9_dmi, dsffChannelTimeoutCheck );
+
//------------------------------------------------------------------------------
} // end namespace p9_dmi
OpenPOWER on IntegriCloud