diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2018-07-14 15:18:54 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-07-18 21:28:31 -0400 |
commit | 1ba5e879f3a4215ad91a43d650e8f7e820f4ea09 (patch) | |
tree | 61c94f81069aee6d0e51e921e0863d0f99506932 | |
parent | 1ff70a1f246d98e03e3b3862d9ec96e5e9a80a73 (diff) | |
download | talos-hostboot-1ba5e879f3a4215ad91a43d650e8f7e820f4ea09.tar.gz talos-hostboot-1ba5e879f3a4215ad91a43d650e8f7e820f4ea09.zip |
PRD: firmware assisted channel failure workaround
Change-Id: Id33c77bb1bc5aa081f04cf71acd47e436e5b5cde
CQ: SW436013
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/62495
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/62816
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
-rwxr-xr-x | src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C | 121 | ||||
-rw-r--r-- | src/usr/diag/prdf/common/plat/p9/p9_dmi.rule | 4 | ||||
-rw-r--r-- | src/usr/diag/prdf/common/plat/p9/p9_dmi_regs.rule | 10 |
3 files changed, 135 insertions, 0 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C index ae736b8b4..0aef091df 100755 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemUtils.C @@ -32,6 +32,7 @@ // Framework includes #include <iipServiceDataCollector.h> #include <prdfExtensibleChip.H> +#include <UtilHash.H> // Platform includes #include <prdfCenMbaDataBundle.H> @@ -482,6 +483,112 @@ void cleanupChnlAttns<TYPE_MEMBUF>( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ template<TARGETING::TYPE T> +uint32_t __fwAssistChnlFailWorkaround( ExtensibleChip * i_chip ); + +template<> +uint32_t __fwAssistChnlFailWorkaround<TYPE_DMI>( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[MemUtils::__fwAssistChnlFailWorkaround] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_DMI == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + #ifdef __HOSTBOOT_MODULE + + // On the Cumulus chip, channel failure attentions from the IOMCFIR are + // forwarded down to CHIFIR[4] for each of the channels. Unfortunately, + // there is a mapping bug in the hardware where the channel failures are + // forwarded to the wrong CHIFIR, which causes the wrong channel to fail. + + // We considered making all of the channel fail attentions in the IOMCFIR + // recoverable, but the "too many bus errors" attentions are a DI risk. We + // also considered setting them to checkstop, but customers with mirroring + // would be really mad at us, because mirroring should have protected their + // system. So the compromise is to have firmware force the channel failure. + + // The goal here is to make the workaround behave like the hardware as much + // as possible. CHIFIR[4] is no longer configured to trigger a channel + // failure because of the bug. So what we will do is query if this channel + // should have failed via the IOMCFIR. 
If so, configure CHIFIR[4] to trigger + // a channel failure and set CHIFIR[4]. This will not actually trigger the + // channel failure, but is necessary for the analysis to work. To actually + // trigger the channel failure, we must set MCICFG0[25]. + + // It is likely that the channel failure could cause a system checkstop or + // PHYP/Hostboot TI, especially if it occurred in memory that is not + // mirrored. This will end up killing the host and this analysis code, which + // is ok. We would experience the same behavior if the hardware actually + // worked the way it should. The analysis code on the FSP will pick up where + // we left off and callout/gard the bad channel. + + SCAN_COMM_REGISTER_CLASS * chifir = i_chip->getRegister( "CHIFIR" ); + SCAN_COMM_REGISTER_CLASS * chifir_or = i_chip->getRegister( "CHIFIR_OR" ); + SCAN_COMM_REGISTER_CLASS * mcicfg0 = i_chip->getRegister( "MCICFG0" ); + SCAN_COMM_REGISTER_CLASS * mcicfg1 = i_chip->getRegister( "MCICFG1" ); + + ExtensibleChip * mcChip = getConnectedParent( i_chip, TYPE_MC ); + uint32_t dmiPos = i_chip->getPos() % MAX_DMI_PER_MC; + uint32_t bitPos = 8 + dmiPos * 8; + + SCAN_COMM_REGISTER_CLASS * iomcfir = mcChip->getRegister( "IOMCFIR" ); + SCAN_COMM_REGISTER_CLASS * iomc_cfg = mcChip->getRegister( "SCOM_MODE_PB" ); + + do + { + // First, check if there are any bits for this channel that are set in + // the IOMCFIR and configured to channel fail. + o_rc = iomcfir->Read() | iomc_cfg->Read(); + if ( SUCCESS != o_rc ) break; + + // For reference, SCOM_MODE_PB[15:22]: 0=enabled, 1=disabled. + if ( 0 == ( iomcfir->GetBitFieldJustified( bitPos,8) & + ~iomc_cfg->GetBitFieldJustified(15, 8) ) ) + { + break; // nothing more to do. + } + + // The channel should fail. + o_rc = chifir->Read() | mcicfg0->Read() | mcicfg1->Read(); + if ( SUCCESS != o_rc ) break; + + // Configure CHIFIR[4] to channel fail via MCICFG1[47]. 
+ if ( mcicfg1->IsBitSet(47) ) // 0=enabled, 1=disabled + { + mcicfg1->ClearBit(47); + o_rc = mcicfg1->Write(); + if ( SUCCESS != o_rc ) break; + } + + // Set CHIFIR[4]. + if ( !chifir->IsBitSet(4) ) + { + chifir_or->SetBit(4); + o_rc = chifir_or->Write(); + if ( SUCCESS != o_rc ) break; + } + + // Force the channel failure via MCICFG0[25]. + if ( !mcicfg0->IsBitSet(25) ) + { + mcicfg0->SetBit(25); + o_rc = mcicfg0->Write(); + if ( SUCCESS != o_rc ) break; + } + + } while (0); + + #endif // __HOSTBOOT_MODULE + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<TARGETING::TYPE T> uint32_t __queryChnlFail( ExtensibleChip * i_chip, bool & o_chnlFail ); template<> @@ -537,6 +644,13 @@ uint32_t __queryChnlFail<TYPE_DMI>( ExtensibleChip * i_chip, bool & o_chnlFail ) do { + // There is a hardware bug where channel failures from the IOMCFIRs + // don't report correctly. This workaround fixes the reporting and + // forces a channel failure if needed. It must be called before calling + // the query HWP in order for the hardware to be in the correct state. + o_rc = __fwAssistChnlFailWorkaround<TYPE_DMI>( i_chip ); + if ( SUCCESS != o_rc ) break; + // There is a HWP on the processor side that will query if this channel // has failed. Unfortunately, it does not check for an active channel // fail attention (i.e. not masked). That will need to be done @@ -704,6 +818,13 @@ uint32_t handleChnlFail<TYPE_MC>( ExtensibleChip * i_chip, for ( auto & dmiChip : getConnected(i_chip, TYPE_DMI) ) { + // The MC target will get the IOMCFIR registers by default, but we + // will need to manually capture the channel failure registers on the + // DMI target just in case the rule code analysis never makes it to + // that target. 
+ dmiChip->CaptureErrorData( io_sc.service_data->GetCaptureData(), + Util::hashString( "chnlFail" ) ); + o_rc = handleChnlFail<TYPE_DMI>( dmiChip, io_sc ); if ( SUCCESS != o_rc ) break; } diff --git a/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule b/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule index 46602ffa4..7731930de 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_dmi.rule @@ -54,6 +54,7 @@ chip p9_dmi reset (&, 0x07010901); mask (|, 0x07010905); capture group default; + capture group chnlFail; }; register CHIFIR_MASK @@ -61,6 +62,7 @@ chip p9_dmi name "P9 DMI target CHIFIR MASK"; scomaddr 0x07010903; capture group default; + capture group chnlFail; }; register CHIFIR_ACT0 @@ -69,6 +71,7 @@ chip p9_dmi scomaddr 0x07010906; capture group default; capture req nonzero("CHIFIR"); + capture group chnlFail; }; register CHIFIR_ACT1 @@ -77,6 +80,7 @@ chip p9_dmi scomaddr 0x07010907; capture group default; capture req nonzero("CHIFIR"); + capture group chnlFail; }; # Include registers not defined by the xml diff --git a/src/usr/diag/prdf/common/plat/p9/p9_dmi_regs.rule b/src/usr/diag/prdf/common/plat/p9/p9_dmi_regs.rule index d681846d8..9349c744e 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_dmi_regs.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_dmi_regs.rule @@ -35,6 +35,14 @@ access write_only; }; + register CHIFIR_OR + { + name "P9 DMI target CHIFIR atomic OR"; + scomaddr 0x07010902; + capture group never; + access write_only; + }; + register CHIFIR_MASK_OR { name "P9 DMI target CHIFIR_MASK atomic OR"; @@ -48,6 +56,7 @@ name "MCI Configuration Register 0"; scomaddr 0x0701090A; capture group default; + capture group chnlFail; }; register MCISTAT @@ -76,6 +85,7 @@ name "MCI Configuration Register 1"; scomaddr 0x0701090E; capture group default; + capture group chnlFail; }; register RECR |