summaryrefslogtreecommitdiffstats
path: root/src/usr/diag/prdf
diff options
context:
space:
mode:
authorZane Shelley <zshelle@us.ibm.com>2019-01-09 16:57:36 -0600
committerZane C. Shelley <zshelle@us.ibm.com>2019-01-18 15:26:55 -0600
commit7ef75d2c684442c3bb668aec4d21cb938b330e47 (patch)
tree71ac9875c17a01541a5f430aa8323c15fd41ef03 /src/usr/diag/prdf
parent9fc690c83456910e45a85e9a72ac4dc729365761 (diff)
downloadtalos-hostboot-7ef75d2c684442c3bb668aec4d21cb938b330e47.tar.gz
talos-hostboot-7ef75d2c684442c3bb668aec4d21cb938b330e47.zip
PRD: Prioritize centaur internal timeout over channel failure
Change-Id: Idc9b57c28d48a9426bb8aee7ae4d17dac285e537
CQ: SW451358
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70277
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70569
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf')
-rw-r--r--src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule3
-rw-r--r--src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H38
-rw-r--r--src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule5
-rwxr-xr-xsrc/usr/diag/prdf/common/plat/mem/prdfMemUtils.C76
-rw-r--r--src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C57
5 files changed, 116 insertions, 63 deletions
diff --git a/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule b/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule
index 500e9974e..ead65abc6 100644
--- a/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule
+++ b/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule
@@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
-# Contributors Listed Below - COPYRIGHT 2016,2018
+# Contributors Listed Below - COPYRIGHT 2016,2019
# [+] International Business Machines Corp.
#
#
@@ -33,6 +33,7 @@ chip centaur_membuf
# Import signatures
.include "prdfP9ProcMbCommonExtraSig.H";
+.include "prdfCenMembufExtraSig.H";
.include "prdfLaneRepairExtraSig.H";
#############################################################################
diff --git a/src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H b/src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H
new file mode 100644
index 000000000..3ae36cee4
--- /dev/null
+++ b/src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H
@@ -0,0 +1,38 @@
+/* IBM_PROLOG_BEGIN_TAG */
+/* This is an automatically generated prolog. */
+/* */
+/* $Source: src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H $ */
+/* */
+/* OpenPOWER HostBoot Project */
+/* */
+/* Contributors Listed Below - COPYRIGHT 2013,2019 */
+/* [+] International Business Machines Corp. */
+/* */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
+/* implied. See the License for the specific language governing */
+/* permissions and limitations under the License. */
+/* */
+/* IBM_PROLOG_END_TAG */
+
+#ifndef __prdfCenMembufExtraSig_H
+#define __prdfCenMembufExtraSig_H
+
+#include <prdrSignatures.H>
+#include <prdfMemExtraSig.H>
+
+PRDR_ERROR_SIGNATURE( InternalTimeout, 0xbbbb0000, "(MBSFIR[4])",
+ "INTERNAL_TIMEOUT" );
+
+#endif // __prdfCenMembufExtraSig_H
+
+
+
diff --git a/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule b/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule
index 9f4c88d49..406276f3d 100644
--- a/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule
+++ b/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule
@@ -5,7 +5,7 @@
#
# OpenPOWER HostBoot Project
#
-# Contributors Listed Below - COPYRIGHT 2017,2018
+# Contributors Listed Below - COPYRIGHT 2017,2019
# [+] International Business Machines Corp.
#
#
@@ -49,8 +49,7 @@ actionclass dmi_bus_th_1_UERE
actionclass dsffChannelTimeout_UERE
{
SueSource;
- threshold1;
- funccall("dsffChannelTimeoutCheck");
+ self_th_1;
};
################################################################################
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
index cfd5db927..57d43d6ad 100755
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2013,2018 */
+/* Contributors Listed Below - COPYRIGHT 2013,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -37,6 +37,7 @@
// Platform includes
#include <prdfCenMbaDataBundle.H>
#include <prdfCenMembufDataBundle.H>
+#include <prdfCenMembufExtraSig.H>
#include <prdfMemSymbol.H>
#include <prdfParserUtils.H>
#include <prdfPlatServices.H>
@@ -917,6 +918,69 @@ bool __analyzeRcdParityError<TYPE_MEMBUF>( ExtensibleChip * i_chip,
//------------------------------------------------------------------------------
+// Channel failure analysis is designed to only look for UNIT_CS attentions and
+// not associate any recoverables as the root cause. Of course, now we have yet
+// another special case. An internal timeout is a recoverable attention that
+// could cause unit CS attentions as a side effect. Therefore, we must analyze
+// it first before looking for any UNIT_CS attentions.
+
+template<TARGETING::TYPE T>
+bool __analyzeInternalTimeout( ExtensibleChip * i_chip,
+ STEP_CODE_DATA_STRUCT & io_sc );
+
+/**
+ * @brief  Checks the given Centaur for an unmasked internal timeout
+ *         (MBSFIR[4]) that is not accompanied by an external timeout
+ *         (MBSFIR[3]). If one is found, the Centaur is called out and the
+ *         InternalTimeout signature is set.
+ * @param  i_chip A MEMBUF chip.
+ * @param  io_sc  The step code data struct.
+ * @return True if a legitimate internal timeout was found and the callout and
+ *         signature were added to io_sc. False otherwise, including when
+ *         either register could not be read.
+ */
+template<>
+bool __analyzeInternalTimeout<TYPE_MEMBUF>( ExtensibleChip * i_chip,
+                                            STEP_CODE_DATA_STRUCT & io_sc )
+{
+    #define PRDF_FUNC "[MemUtils::__analyzeInternalTimeout] "
+
+    PRDF_ASSERT( nullptr != i_chip );
+    PRDF_ASSERT( TYPE_MEMBUF == i_chip->getType() );
+
+    // This is the function's boolean result, so declare it as bool rather
+    // than as an integer that is implicitly converted on return.
+    bool o_analyzed = false;
+
+    SCAN_COMM_REGISTER_CLASS * fir  = i_chip->getRegister( "MBSFIR" );
+    SCAN_COMM_REGISTER_CLASS * mask = i_chip->getRegister( "MBSFIR_MASK" );
+
+    do
+    {
+        // Both reads are always attempted. The return codes are OR'ed
+        // together so that a failure of either read is detected.
+        if ( SUCCESS != (fir->Read() | mask->Read()) )
+        {
+            PRDF_ERR( PRDF_FUNC "Failed to read MBSFIRs on 0x%08x",
+                      i_chip->getHuid() );
+            break;
+        }
+
+        // If there is an internal timeout that is not masked and there is not
+        // an external timeout (note external timeout is always masked), then
+        // there is a legit internal timeout attention.
+        if ( fir->IsBitSet(4) && !mask->IsBitSet(4) && !fir->IsBitSet(3) )
+        {
+            // We are not going to analyze the MEMBUF chip like we do with some
+            // of the other helper functions in this file because the rule code
+            // priority will put the MBSFIR after the MBIFIR and DMIFIR.
+            // Therefore, there is no way to guarantee this attention will be
+            // analyzed. Since we do know there is a channel failure we can
+            // simply make a predictive callout because the channel failure code
+            // will eventually mask the entire Centaur.
+
+            io_sc.service_data->SetCallout( i_chip->getTrgt() );
+
+            io_sc.service_data->setSignature( i_chip->getHuid(),
+                                              PRDFSIG_InternalTimeout );
+
+            o_analyzed = true; break; // analysis complete
+        }
+
+    } while (0);
+
+    return o_analyzed;
+
+    #undef PRDF_FUNC
+}
+
+//------------------------------------------------------------------------------
+
// Handling channel failures from more than one channel at a time:
// Say we were called to handle a recoverable attention on a Centaur, but the
// channel containing that Centaur has a unit checkstop attention in the
@@ -965,12 +1029,20 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip,
}
// First, check for RCD parity errors. They are recoverable attentions
- // that could has a channel failure attention as a side effect.
+ // that could have a channel failure attention as a side effect.
if ( __analyzeRcdParityError<TYPE_MEMBUF>(membChip, io_sc) )
{
o_analyzed = true; break; // analysis complete
}
+ // Now, check for an internal timeout error. This is a recoverable
+ // attention that could have a channel failure attention as a side
+ // effect.
+ if ( __analyzeInternalTimeout<TYPE_MEMBUF>(membChip, io_sc) )
+ {
+ o_analyzed = true; break; // analysis complete
+ }
+
// Now, look for unit checkstops in the CHIFIR, excluding
// CHIFIR[16,19:21,61].
if ( __queryUcsChifir<TYPE_DMI>(dmiChip) )
diff --git a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
index 4ad3f4455..0290bee9b 100644
--- a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
+++ b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C
@@ -88,63 +88,6 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc )
}
PRDF_PLUGIN_DEFINE( cumulus_dmi, PostAnalysis );
-//##############################################################################
-//
-// CHIFIR
-//
-//##############################################################################
-/**
- * @brief Checks if we have a legitimate CHIFIR[61] channel timeout or if its
- * a side effect of a MBSFIR[4] internal timeout.
- * @param i_dmiChip DMI chip.
- * @param io_sc Step code data struct
- * @return SUCCESS if MBSFIR[4] is set but MBSFIR[3] is not.
- * PRD_SCAN_COMM_REGISTER_ZERO otherwise.
-
- */
-int32_t dsffChannelTimeoutCheck( ExtensibleChip * i_dmiChip,
- STEP_CODE_DATA_STRUCT & io_sc )
-{
- #define PRDF_FUNC "[dsffChannelTimeoutCheck] "
-
- int32_t o_rc = SUCCESS;
-
- ExtensibleChip * membChip = getConnectedChild( i_dmiChip, TYPE_MEMBUF, 0 );
- PRDF_ASSERT( nullptr != membChip );
-
- do
- {
- // Get MBSFIR
- SCAN_COMM_REGISTER_CLASS * mbsFir = membChip->getRegister("MBSFIR");
-
- o_rc = mbsFir->Read();
- if ( SUCCESS != o_rc )
- {
- PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x",
- membChip->getHuid() );
- break;
- }
-
- // If MBSFIR[4] is set and MBSFIR[3] is not set
- if( mbsFir->IsBitSet(4) && !mbsFir->IsBitSet(3) )
- {
- // MBSFIR[4] internal timeout, predictive centaur callout
- io_sc.service_data->SetCallout( membChip->getTrgt() );
- }
- else
- {
- // CHIFIR[61] channel timeout, predictive DMI callout
- io_sc.service_data->SetCallout( i_dmiChip->getTrgt() );
- }
-
- }while(0);
-
- return o_rc;
-
- #undef PRDF_FUNC
-}
-PRDF_PLUGIN_DEFINE( cumulus_dmi, dsffChannelTimeoutCheck );
-
//------------------------------------------------------------------------------
} // end namespace cumulus_dmi
OpenPOWER on IntegriCloud