diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2019-01-09 16:57:36 -0600 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2019-01-18 15:26:55 -0600 |
commit | 7ef75d2c684442c3bb668aec4d21cb938b330e47 (patch) | |
tree | 71ac9875c17a01541a5f430aa8323c15fd41ef03 /src/usr/diag/prdf | |
parent | 9fc690c83456910e45a85e9a72ac4dc729365761 (diff) | |
download | talos-hostboot-7ef75d2c684442c3bb668aec4d21cb938b330e47.tar.gz talos-hostboot-7ef75d2c684442c3bb668aec4d21cb938b330e47.zip |
PRD: Prioritize centaur internal timeout over channel failure
Change-Id: Idc9b57c28d48a9426bb8aee7ae4d17dac285e537
CQ: SW451358
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70277
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70569
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf')
5 files changed, 116 insertions, 63 deletions
diff --git a/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule b/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule index 500e9974e..ead65abc6 100644 --- a/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule +++ b/src/usr/diag/prdf/common/plat/centaur/centaur_membuf.rule @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2016,2018 +# Contributors Listed Below - COPYRIGHT 2016,2019 # [+] International Business Machines Corp. # # @@ -33,6 +33,7 @@ chip centaur_membuf # Import signatures .include "prdfP9ProcMbCommonExtraSig.H"; +.include "prdfCenMembufExtraSig.H"; .include "prdfLaneRepairExtraSig.H"; ############################################################################# diff --git a/src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H b/src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H new file mode 100644 index 000000000..3ae36cee4 --- /dev/null +++ b/src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H @@ -0,0 +1,38 @@ +/* IBM_PROLOG_BEGIN_TAG */ +/* This is an automatically generated prolog. */ +/* */ +/* $Source: src/usr/diag/prdf/common/plat/centaur/prdfCenMembufExtraSig.H $ */ +/* */ +/* OpenPOWER HostBoot Project */ +/* */ +/* Contributors Listed Below - COPYRIGHT 2013,2019 */ +/* [+] International Business Machines Corp. */ +/* */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. */ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ +/* implied. See the License for the specific language governing */ +/* permissions and limitations under the License. */ +/* */ +/* IBM_PROLOG_END_TAG */ + +#ifndef __prdfCenMembufExtraSig_H +#define __prdfCenMembufExtraSig_H + +#include <prdrSignatures.H> +#include <prdfMemExtraSig.H> + +PRDR_ERROR_SIGNATURE( InternalTimeout, 0xbbbb0000, "(MBSFIR[4])", + "INTERNAL_TIMEOUT" ); + +#endif // __prdfCenMembufExtraSig_H + + + diff --git a/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule b/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule index 9f4c88d49..406276f3d 100644 --- a/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule +++ b/src/usr/diag/prdf/common/plat/cumulus/cumulus_dmi_actions.rule @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2017,2018 +# Contributors Listed Below - COPYRIGHT 2017,2019 # [+] International Business Machines Corp. # # @@ -49,8 +49,7 @@ actionclass dmi_bus_th_1_UERE actionclass dsffChannelTimeout_UERE { SueSource; - threshold1; - funccall("dsffChannelTimeoutCheck"); + self_th_1; }; ################################################################################ diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C index cfd5db927..57d43d6ad 100755 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2013,2018 */ +/* Contributors Listed Below - COPYRIGHT 2013,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -37,6 +37,7 @@ // Platform includes #include <prdfCenMbaDataBundle.H> #include <prdfCenMembufDataBundle.H> +#include <prdfCenMembufExtraSig.H> #include <prdfMemSymbol.H> #include <prdfParserUtils.H> #include <prdfPlatServices.H> @@ -917,6 +918,69 @@ bool __analyzeRcdParityError<TYPE_MEMBUF>( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ +// Channel failure analysis is designed to only look for UNIT_CS attentions and +// not associate any recoverables as the root cause. Of course, now we have yet +// another special case. An internal timeout is a recoverable attention that +// could cause unit CS attentions as a side effect. Therefore, we must analyze +// it first before looking for any UNIT_CS attentions. + +template<TARGETING::TYPE T> +bool __analyzeInternalTimeout( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); + +template<> +bool __analyzeInternalTimeout<TYPE_MEMBUF>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemUtils::__analyzeInternalTimeout] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_MEMBUF == i_chip->getType() ); + + uint32_t o_analyzed = false; + + SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister( "MBSFIR" ); + SCAN_COMM_REGISTER_CLASS * mask = i_chip->getRegister( "MBSFIR_MASK" ); + + do + { + if ( SUCCESS != (fir->Read() | mask->Read()) ) + { + PRDF_ERR( PRDF_FUNC "Failed to read MBSFIRs on 0x%08x", + i_chip->getHuid() ); + break; + } + + // If there is an internal timeout that is not masked and there is not + // an external timeout (note external timeout is always masked), then + // there is a legit internal timeout attention. + if ( fir->IsBitSet(4) && !mask->IsBitSet(4) && !fir->IsBitSet(3) ) + { + // We are not going to analyze the MEMBUF chip like we do with some + // of the other helper functions in this file because the rule code + // priority will put the MBSFIR after the MBIFIR and DMIFIR. + // Therefore, there is no way to guarantee this attention will be + // analyzed. Since we do know there is a channel failure we can + // simply make a predictive callout because the channel failure code + // will eventually mask the entire Centaur. + + io_sc.service_data->SetCallout( i_chip->getTrgt() ); + + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_InternalTimeout ); + + o_analyzed = true; break; // analysis complete + } + + } while (0); + + return o_analyzed; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + // Handling channel failures from more than one channel at a time: // Say we were called to handle a recoverable attention on a Centaur, but the // channel containing that Centaur has a unit checkstop attention in the @@ -965,12 +1029,20 @@ bool __analyzeChnlFail<TYPE_MC>( ExtensibleChip * i_chip, } // First, check for RCD parity errors. They are recoverable attentions - // that could has a channel failure attention as a side effect. + // that could have a channel failure attention as a side effect. if ( __analyzeRcdParityError<TYPE_MEMBUF>(membChip, io_sc) ) { o_analyzed = true; break; // analysis complete } + // Now, check for an internal timeout error. This is a recoverable + // attention that could have a channel failure attention as a side + // effect. + if ( __analyzeInternalTimeout<TYPE_MEMBUF>(membChip, io_sc) ) + { + o_analyzed = true; break; // analysis complete + } + // Now, look for unit checkstops in the CHIFIR, excluding // CHIFIR[16,19:21,61]. if ( __queryUcsChifir<TYPE_DMI>(dmiChip) ) diff --git a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C index 4ad3f4455..0290bee9b 100644 --- a/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C +++ b/src/usr/diag/prdf/common/plat/p9/prdfP9Dmi_common.C @@ -88,63 +88,6 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) } PRDF_PLUGIN_DEFINE( cumulus_dmi, PostAnalysis ); -//############################################################################## -// -// CHIFIR -// -//############################################################################## -/** - * @brief Checks if we have a legitimate CHIFIR[61] channel timeout or if its - * a side effect of a MBSFIR[4] internal timeout. - * @param i_dmiChip DMI chip. - * @param io_sc Step code data struct - * @return SUCCESS if MBSFIR[4] is set but MBSFIR[3] is not. - * PRD_SCAN_COMM_REGISTER_ZERO otherwise. - - */ -int32_t dsffChannelTimeoutCheck( ExtensibleChip * i_dmiChip, - STEP_CODE_DATA_STRUCT & io_sc ) -{ - #define PRDF_FUNC "[dsffChannelTimeoutCheck] " - - int32_t o_rc = SUCCESS; - - ExtensibleChip * membChip = getConnectedChild( i_dmiChip, TYPE_MEMBUF, 0 ); - PRDF_ASSERT( nullptr != membChip ); - - do - { - // Get MBSFIR - SCAN_COMM_REGISTER_CLASS * mbsFir = membChip->getRegister("MBSFIR"); - - o_rc = mbsFir->Read(); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "MBSFIR read failed for 0x%08x", - membChip->getHuid() ); - break; - } - - // If MBSFIR[4] is set and MBSFIR[3] is not set - if( mbsFir->IsBitSet(4) && !mbsFir->IsBitSet(3) ) - { - // MBSFIR[4] internal timeout, predictive centaur callout - io_sc.service_data->SetCallout( membChip->getTrgt() ); - } - else - { - // CHIFIR[61] channel timeout, predictive DMI callout - io_sc.service_data->SetCallout( i_dmiChip->getTrgt() ); - } - - }while(0); - - return o_rc; - - #undef PRDF_FUNC -} -PRDF_PLUGIN_DEFINE( cumulus_dmi, dsffChannelTimeoutCheck ); - //------------------------------------------------------------------------------ } // end namespace cumulus_dmi |