diff options
author | Caleb Palmer <cnpalmer@us.ibm.com> | 2018-09-27 09:46:55 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-10-08 20:19:14 -0500 |
commit | 31b6cf0ac237a0365bcf225fc0e8ae20c7012b87 (patch) | |
tree | 1b8aa50be424c893efe83366268d72fc10a2ecf3 /src | |
parent | d02cb05f827a5204bd1d5bde793aac6d8952496d (diff) | |
download | talos-hostboot-31b6cf0ac237a0365bcf225fc0e8ae20c7012b87.tar.gz talos-hostboot-31b6cf0ac237a0365bcf225fc0e8ae20c7012b87.zip |
PRD: Fixes for MBS timeout cases
Change-Id: I4a5970ccaee60df83dc48503ede3655a34dd8b1c
CQ: SW444990
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/66726
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/67098
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src')
6 files changed, 235 insertions, 6 deletions
diff --git a/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule b/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
index f47d463ed..f013614e7 100644
--- a/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
+++ b/src/usr/diag/prdf/common/plat/cen/cen_centaur_actions.rule
@@ -132,11 +132,19 @@ actionclass replay_timeout_UERE
     funccall("ClearMbsSecondaryBits");
 };
 
+/** Handles cases where both MBSFIR[3:4] are on, else calls out self. */
+actionclass mbs_internal_timeout_precheck
+{
+    threshold32pday;
+    funccall("mbsInternalTimeoutPrecheck");
+};
+
 /** Handles RCD parity errors, if present. Otherwise, calls out self (TH 1). */
 actionclass mbs_internal_timeout
 {
     try ( funccall("analyzeMbaRcdParityError0"),
-          try ( funccall("analyzeMbaRcdParityError1"), self_th_1 ) );
+          try ( funccall("analyzeMbaRcdParityError1"),
+                mbs_internal_timeout_precheck ) );
 };
 
 /** Handles RCD parity errors, if present. Otherwise, calls out lvl 2 (TH 1). */
diff --git a/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C b/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
index 72cf24527..41a7fe017 100644
--- a/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
+++ b/src/usr/diag/prdf/common/plat/cen/prdfCenMembuf_common.C
@@ -240,6 +240,108 @@ PLUGIN_RCD_PARITY_UE_SIDEEFFECTS( 1 )
 
 #undef PLUGIN_RCD_PARITY_UE_SIDEEFFECTS
 
+//------------------------------------------------------------------------------
+
+/**
+ * @brief  Clears and ignores MBSFIR[3:4] if both are on at the same time. Masks
+ *         them at threshold of 32 per day.
+ * @param  i_mbChip MEMBUF chip.
+ * @param  io_sc    Step code data struct
+ * @return PRD_SCAN_COMM_REGISTER_ZERO if both bits are on during a system
+ *         checkstop. Non-SUCCESS if a register access fails. Else SUCCESS.
+ */
+int32_t mbsInternalTimeoutPrecheck( ExtensibleChip * i_mbChip,
+                                    STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[mbsInternalTimeoutPrecheck] "
+
+    int32_t o_rc = SUCCESS;
+
+    do
+    {
+        // Get MBSFIR
+        SCAN_COMM_REGISTER_CLASS * mbsFir = i_mbChip->getRegister("MBSFIR");
+
+        o_rc = mbsFir->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x",
+                      i_mbChip->getHuid() );
+            break;
+        }
+
+        if ( mbsFir->IsBitSet(3) && mbsFir->IsBitSet(4) )
+        {
+            // We are going to ignore this attention. If there is a system
+            // checkstop, we will have to return a DD02 so that the rule code
+            // can try to find something else as the root cause. Otherwise,
+            // apply the "threshold and mask" policy.
+
+            if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() )
+            {
+                o_rc = PRD_SCAN_COMM_REGISTER_ZERO;
+            }
+            else
+            {
+                // Add Centaur callout just in case.
+                io_sc.service_data->SetCallout( i_mbChip->getTrgt() );
+
+                // Clear MBSFIR[3:4].
+                SCAN_COMM_REGISTER_CLASS * mbsFirAnd =
+                    i_mbChip->getRegister("MBSFIR_AND");
+
+                mbsFirAnd->setAllBits();
+                mbsFirAnd->ClearBit(3);
+                mbsFirAnd->ClearBit(4);
+
+                o_rc = mbsFirAnd->Write();
+                if ( SUCCESS != o_rc )
+                {
+                    PRDF_ERR( PRDF_FUNC "MBSFIR_AND write failed for 0x%08x",
+                              i_mbChip->getHuid() );
+                    break;
+                }
+
+                if ( io_sc.service_data->IsAtThreshold() )
+                {
+                    // Prevent a predictive error, if needed.
+                    if ( !mfgMode() && !io_sc.service_data->isMemChnlFail() )
+                    {
+                        io_sc.service_data->clearServiceCall();
+                    }
+
+                    // Mask MBSFIR[3:4].
+                    SCAN_COMM_REGISTER_CLASS * mbsFirMaskOr =
+                        i_mbChip->getRegister("MBSFIR_MASK_OR");
+
+                    mbsFirMaskOr->SetBit(3);
+                    mbsFirMaskOr->SetBit(4);
+
+                    o_rc = mbsFirMaskOr->Write();
+                    if ( SUCCESS != o_rc )
+                    {
+                        PRDF_ERR( PRDF_FUNC "MBSFIR_MASK_OR write failed for "
+                                  "0x%08x", i_mbChip->getHuid() );
+                        break;
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Legitimate internal timeout, make the error log predictive.
+            io_sc.service_data->SetCallout( i_mbChip->getTrgt() );
+            io_sc.service_data->setServiceCall();
+        }
+
+    }while(0);
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+PRDF_PLUGIN_DEFINE( cen_centaur, mbsInternalTimeoutPrecheck );
+
 //##############################################################################
 //
 // MBSECCFIRs
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
index d2d748e30..cfd5db927 100755
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
@@ -619,6 +619,8 @@ bool __queryUcsCentaur<TYPE_MEMBUF>( ExtensibleChip * i_chip )
 
 //------------------------------------------------------------------------------
 
+// This excludes CHIFIR[16,19:21] to avoid a loop in isolation. Also excludes
+// CHIFIR[61] due to a hardware workaround, see __queryUcsChifir_61().
 template<TARGETING::TYPE T>
 bool __queryUcsChifir( ExtensibleChip * i_chip );
 
@@ -639,11 +641,13 @@ bool __queryUcsChifir<TYPE_DMI>( ExtensibleChip * i_chip )
     {
         // Make sure to ignore CHIFIR[16,19:21], which simply say there is an
         // attention on the Centaur. Otherwise, we will get stuck in a loop.
+        // CHIFIR[61] is also ignored as it needs to be looked for later
+        // for a special MBS timeout case.
         if ( 0 != ( fir->GetBitFieldJustified( 0,64) &
                     ~mask->GetBitFieldJustified(0,64) &
                     act0->GetBitFieldJustified(0,64) &
                     act1->GetBitFieldJustified(0,64) &
-                    0xffff63ffffffffffull ) )
+                    0xffff63fffffffffbull ) )
         {
             o_activeAttn = true;
         }
@@ -654,6 +658,46 @@ bool __queryUcsChifir<TYPE_DMI>( ExtensibleChip * i_chip )
 
 //------------------------------------------------------------------------------
 
+// WORKAROUND:
+// This function only queries for CHIFIR[61]. There is a hardware workaround
+// that changes some behavior. CHIFIR[16] will no longer report channel failure
+// attentions from the Centaur. Also, any time there is a channel failure
+// attention from the Centaur, CHIFIR[61] will get set. In addition, CHIFIR[61]
+// can report an attention on its own, no need for Centaur attention. Therefore,
+// we must workaround the workaround and isolate to CHIFIR[61] only after
+// analyzing the Centaur.
+template<TARGETING::TYPE T>
+bool __queryUcsChifir_61( ExtensibleChip * i_chip );
+
+template<>
+bool __queryUcsChifir_61<TYPE_DMI>( ExtensibleChip * i_chip )
+{
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_DMI == i_chip->getType() );
+
+    bool o_activeAttn = false;
+
+    // Check if there is an active UCS attention on CHIFIR[61]
+    SCAN_COMM_REGISTER_CLASS * fir  = i_chip->getRegister( "CHIFIR" );
+    SCAN_COMM_REGISTER_CLASS * mask = i_chip->getRegister( "CHIFIR_MASK" );
+    SCAN_COMM_REGISTER_CLASS * act0 = i_chip->getRegister( "CHIFIR_ACT0" );
+    SCAN_COMM_REGISTER_CLASS * act1 = i_chip->getRegister( "CHIFIR_ACT1" );
+
+    if ( SUCCESS == (fir->Read() | mask->Read() | act0->Read() | act1->Read()) )
+    {
+        if ( fir->IsBitSet(61) && !mask->IsBitSet(61) &&
+             act0->IsBitSet(61) && act1->IsBitSet(61) )
+        {
+            o_activeAttn = true;
+        }
+    }
+
+
+    return o_activeAttn;
+}
+
+//------------------------------------------------------------------------------
+
 template<TARGETING::TYPE T>
 bool __queryUcsIomcfir( ExtensibleChip * i_chip );
 
@@ -756,8 +800,9 @@ bool __queryChnlFail<TYPE_DMI,TYPE_MEMBUF>( ExtensibleChip * i_dmiChip,
         if ( !tmpChnlFail ) break; // nothing more to do.
 
         // Check for an active attention on the CHIFIR or IOMCFIR.
-        if ( __queryUcsChifir<TYPE_DMI>( i_dmiChip) ||
-             __queryUcsIomcfir<TYPE_DMI>(i_dmiChip) )
+        if ( __queryUcsChifir   <TYPE_DMI>(i_dmiChip) ||
+             __queryUcsChifir_61<TYPE_DMI>(i_dmiChip) ||
+             __queryUcsIomcfir  <TYPE_DMI>(i_dmiChip) )
         {
             o_chnlFail = true;
             break; // nothing more to do.
@@ -926,7 +971,8 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
                 o_analyzed = true; break; // analysis complete
             }
 
-            // Now, look for unit checkstops in the CHIFIR.
+            // Now, look for unit checkstops in the CHIFIR, excluding
+            // CHIFIR[16,19:21,61].
             if ( __queryUcsChifir<TYPE_DMI>(dmiChip) )
             {
                 // Analyze UNIT_CS on the DMI chip.
@@ -946,6 +992,16 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
                 }
             }
 
+            // Now, look for unit checkstop from CHIFIR[61].
+            if ( __queryUcsChifir_61<TYPE_DMI>(dmiChip) )
+            {
+                // Analyze UNIT_CS on the DMI chip.
+                if ( SUCCESS == dmiChip->Analyze(io_sc, UNIT_CS) )
+                {
+                    o_analyzed = true; break; // analysis complete
+                }
+            }
+
             // Now, look for unit checkstops in the IOMCFIR.
             if ( __queryUcsIomcfir<TYPE_DMI>(dmiChip) )
             {
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule b/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
index 74ae29831..7961f2110 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule
@@ -396,7 +396,7 @@ group gCHIFIR filter priority( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
     /** CHIFIR[61]
      *  DSFF channel timeout
      */
-    (rCHIFIR, bit(61)) ? self_th_1_UERE;
+    (rCHIFIR, bit(61)) ? dsffChannelTimeout_UERE;
 
     /** CHIFIR[62]
      *  SCOM error
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule
index 7de1662ab..ee4735b38 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_dmi_actions.rule
@@ -46,3 +46,9 @@ actionclass dmi_bus_th_1_UERE
     dmi_bus_th_1;
 };
 
+actionclass dsffChannelTimeout_UERE
+{
+    SueSource;
+    threshold1;
+    funccall("dsffChannelTimeoutCheck");
+};
diff --git a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
index 03a0a913e..300ba6266 100644
--- a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
+++ b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
@@ -88,6 +88,63 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
 }
 PRDF_PLUGIN_DEFINE( p9_dmi, PostAnalysis );
 
+//##############################################################################
+//
+//                                  CHIFIR
+//
+//##############################################################################
+/**
+ * @brief  Checks if we have a legitimate CHIFIR[61] channel timeout or if it's
+ *         a side effect of a MBSFIR[4] internal timeout.
+ * @param  i_dmiChip DMI chip.
+ * @param  io_sc     Step code data struct
+ * @return SUCCESS whether the Centaur (MBSFIR[4] on, MBSFIR[3] off) or the DMI
+ *         (all other cases) is called out. Non-SUCCESS only if the MBSFIR
+ *         read fails.
+ */
+int32_t dsffChannelTimeoutCheck( ExtensibleChip * i_dmiChip,
+                                 STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[dsffChannelTimeoutCheck] "
+
+    int32_t o_rc = SUCCESS;
+
+    ExtensibleChip * membChip = getConnectedChild( i_dmiChip, TYPE_MEMBUF, 0 );
+    PRDF_ASSERT( nullptr != membChip );
+
+    do
+    {
+        // Get MBSFIR from the connected MEMBUF chip.
+        SCAN_COMM_REGISTER_CLASS * mbsFir = membChip->getRegister("MBSFIR");
+
+        o_rc = mbsFir->Read();
+        if ( SUCCESS != o_rc )
+        {
+            PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x",
+                      membChip->getHuid() );
+            break;
+        }
+
+        // If MBSFIR[4] is set and MBSFIR[3] is not set
+        if( mbsFir->IsBitSet(4) && !mbsFir->IsBitSet(3) )
+        {
+            // MBSFIR[4] internal timeout, predictive centaur callout
+            io_sc.service_data->SetCallout( membChip->getTrgt() );
+        }
+        else
+        {
+            // CHIFIR[61] channel timeout, predictive DMI callout
+            io_sc.service_data->SetCallout( i_dmiChip->getTrgt() );
+        }
+
+    }while(0);
+
+    return o_rc;
+
+    #undef PRDF_FUNC
+}
+PRDF_PLUGIN_DEFINE( p9_dmi, dsffChannelTimeoutCheck );
+
 //------------------------------------------------------------------------------
 
 } // end namespace p9_dmi |