diff options
author | Caleb Palmer <cnpalmer@us.ibm.com> | 2019-06-13 09:17:51 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2019-06-28 14:07:59 -0500 |
commit | 8db1ba5aaa8198a0535cc472eda56cecdbb016f9 (patch) | |
tree | 54f5e433bff81a254e95a963265677a992474d35 /src/usr | |
parent | 1581c67d3151e1c17d81c739d69e2122afd08364 (diff) | |
download | blackbird-hostboot-8db1ba5aaa8198a0535cc472eda56cecdbb016f9.tar.gz blackbird-hostboot-8db1ba5aaa8198a0535cc472eda56cecdbb016f9.zip |
PRD: Avoid gard for NVDIMMs
Change-Id: Icaa517b196826c2b442da769ef45b3cdf56e6a9d
CQ: SW467502
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/79189
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamen G. Tyner <ben.tyner@ibm.com>
Reviewed-by: Paul Greenwood <paul.greenwood@ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/79670
Diffstat (limited to 'src/usr')
14 files changed, 202 insertions, 22 deletions
diff --git a/src/usr/diag/prdf/common/framework/service/iipServiceDataCollector.h b/src/usr/diag/prdf/common/framework/service/iipServiceDataCollector.h index 704dddf70..e8cdb79a5 100755 --- a/src/usr/diag/prdf/common/framework/service/iipServiceDataCollector.h +++ b/src/usr/diag/prdf/common/framework/service/iipServiceDataCollector.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2018 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -628,6 +628,11 @@ public: void clearMruListGard(); /** + * @brief Iterates the MRU list and clears gard for any NVDIMM targets. + */ + void clearNvdimmMruListGard(); + + /** * @brief Iterates the MRU list and returns true if at least on target in * the list is set to be garded. * @return True if there is at least one target set to be garded. diff --git a/src/usr/diag/prdf/common/framework/service/prdfServiceDataCollector.C b/src/usr/diag/prdf/common/framework/service/prdfServiceDataCollector.C index d9681d66b..8ba990077 100755 --- a/src/usr/diag/prdf/common/framework/service/prdfServiceDataCollector.C +++ b/src/usr/diag/prdf/common/framework/service/prdfServiceDataCollector.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2015 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -177,6 +177,50 @@ void ServiceDataCollector::clearMruListGard() //------------------------------------------------------------------------------ +void ServiceDataCollector::clearNvdimmMruListGard() +{ + #define PRDF_FUNC "[ServiceDataCollector::clearNvdimmMruListGard] " + + // Loop through the MRU list. + for ( auto & mru : xMruList ) + { + PRDcallout callout = mru.callout; + TargetHandle_t trgt = callout.getTarget(); + if ( TYPE_DIMM == PlatServices::getTargetType(trgt) ) + { + // If the callout target is an NVDIMM, do not gard it and send a + // message to PHYP/Hostboot that a save/restore may work. + if ( isNVDIMM(trgt) ) + { + mru.gardState = NO_GARD; + + #ifdef __HOSTBOOT_MODULE + + #ifdef __HOSTBOOT_RUNTIME + // Hostboot runtime, send the message to PHYP + uint32_t l_rc = PlatServices::nvdimmNotifyPhypProtChange( trgt, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != l_rc ) + { + PRDF_TRAC( PRDF_FUNC "nvdimmNotifyPhypProtChange(0x%08x) " + "failed.", PlatServices::getHuid(trgt) ); + continue; + } + #else + // IPL, set the appropriate internal attribute in Hostboot + trgt->setAttr<ATTR_NV_STATUS_FLAG>(0x40); + #endif + + #endif // __HOSTBOOT_MODULE + } + } + } + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + bool ServiceDataCollector::isGardRequested() { bool gardRecordExit = false; diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C index 61dd6e548..e0b54ab31 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemMark.C @@ -1390,6 +1390,9 @@ uint32_t applyRasPolicies( ExtensibleChip * i_chip, const MemRank & i_rank, { io_sc.service_data->setServiceCall(); + // We want to try to avoid garding NVDIMMs, so clear gard for them now. + io_sc.service_data->clearNvdimmMruListGard(); + #ifdef __HOSTBOOT_RUNTIME // No more repairs left so no point doing any more TPS procedures. MemDbUtils::banTps<T>( i_chip, i_rank ); diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca.rule index d61845b20..d1a6bc290 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca.rule @@ -241,7 +241,7 @@ group gMCACALFIR /** MCACALFIR[0] * A MBA recoverable error has occurred. */ - (rMCACALFIR, bit(0)) ? self_th_1; + (rMCACALFIR, bit(0)) ? nvdimm_self_th_1; /** MCACALFIR[1] * MBA Nonrecoverable Error @@ -251,7 +251,7 @@ group gMCACALFIR /** MCACALFIR[2] * Excessive refreshes to a single rank. */ - (rMCACALFIR, bit(2)) ? self_th_32perDay; + (rMCACALFIR, bit(2)) ? nvdimm_self_th_32perDay; /** MCACALFIR[3] * Err detected in the MBA debug WAT logic @@ -266,7 +266,7 @@ group gMCACALFIR /** MCACALFIR[5] * Calibration complete indication xout */ - (rMCACALFIR, bit(5)) ? self_th_32perDay; + (rMCACALFIR, bit(5)) ? nvdimm_self_th_32perDay; /** MCACALFIR[6] * Emergency Throttle @@ -533,7 +533,7 @@ group gMCAECCFIR /** MCAECCFIR[42] * SCOM_PARITY_CLASS_RECOVERABLE */ - (rMCAECCFIR, bit(42)) ? self_th_1; + (rMCAECCFIR, bit(42)) ? nvdimm_self_th_1; /** MCAECCFIR[43] * SCOM_PARITY_CLASS_UNRECOVERABLE @@ -548,7 +548,7 @@ group gMCAECCFIR /** MCAECCFIR[45] * WRITE_RMW_CE */ - (rMCAECCFIR, bit(45)) ? self_th_32perDay; + (rMCAECCFIR, bit(45)) ? nvdimm_self_th_32perDay; /** MCAECCFIR[46] * WRITE_RMW_UE @@ -686,12 +686,12 @@ group gDDRPHYFIR /** DDRPHYFIR[60] * Register PE 4 bit impact */ - (rDDRPHYFIR, bit(60)) ? self_th_1; + (rDDRPHYFIR, bit(60)) ? nvdimm_self_th_1; /** DDRPHYFIR[61] * Register PE 1 bit impact */ - (rDDRPHYFIR, bit(61)) ? self_th_1; + (rDDRPHYFIR, bit(61)) ? nvdimm_self_th_1; }; diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule index da3a73f82..e0529afd5 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule @@ -70,6 +70,7 @@ actionclass rcd_parity_error calloutSelfLowNoGard; # Self LOW # Thresholding done in plugin funccall("RcdParityError"); # Run TPS on TH for all MCA ranks + funccall("ClearNvdimmGardState"); # Clear gard for NVDIMMs }; /** Handle Mainline IUEs */ @@ -125,7 +126,7 @@ actionclass maintenance_iaue_handling /** MCA/UE algroithm, threshold 5 per day */ actionclass mca_ue_algorithm_th_5perDay { - calloutSelfMed; + try( funccall("CheckForNvdimms"), calloutSelfMed ); threshold5pday; funccall("mcaUeAlgorithm"); # must be called last }; @@ -133,12 +134,29 @@ actionclass mca_ue_algorithm_th_5perDay /** MCA/UE algroithm, threshold 1 */ actionclass mca_ue_algorithm_th_1 { - calloutSelfMed; + try( funccall("CheckForNvdimms"), calloutSelfMed ); threshold1; funccall("mcaUeAlgorithm"); # must be called last }; ################################################################################ +# NVDIMM callouts # +################################################################################ + +# Simple callouts that will avoid gard for NVDIMMs +actionclass nvdimm_self_th_1 +{ + try( funccall("CheckForNvdimms"), calloutSelfMed ); + threshold1; +}; + +actionclass nvdimm_self_th_32perDay +{ + try( funccall("CheckForNvdimms"), calloutSelfMed ); + threshold32pday; +}; + +################################################################################ # Analyze groups ################################################################################ diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist.rule index 1f61719a7..0a3301e2a 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist.rule @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2016,2018 +# Contributors Listed Below - COPYRIGHT 2016,2019 # [+] International Business Machines Corp. # # @@ -599,7 +599,7 @@ group gMCBISTFIR /** MCBISTFIR[13] * SCOM_RECOVERABLE_REG_PE */ - (rMCBISTFIR, bit(13)) ? self_th_1; + (rMCBISTFIR, bit(13)) ? nvdimm_self_th_1; /** MCBISTFIR[14] * SCOM_FATAL_REG_PE diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist_actions.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist_actions.rule index 9b2127f3f..b71610835 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist_actions.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcbist_actions.rule @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2016,2018 +# Contributors Listed Below - COPYRIGHT 2016,2019 # [+] International Business Machines Corp. # # @@ -36,6 +36,17 @@ actionclass command_addr_timeout funccall("commandAddrTimeout"); }; +################################################################################ +# NVDIMM callouts # +################################################################################ + +# Simple callouts that will avoid gard for NVDIMMs +actionclass nvdimm_self_th_1 +{ + try( funccall("CheckForNvdimms"), calloutSelfMed ); + threshold1; +}; + ############################################################################### # Analyze groups ############################################################################### diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs.rule index 71a0342ab..987d68afb 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs.rule @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2016,2018 +# Contributors Listed Below - COPYRIGHT 2016,2019 # [+] International Business Machines Corp. # # @@ -148,7 +148,7 @@ group gMCFIR /** MCFIR[0] * mc internal recoverable eror */ - (rMCFIR, bit(0)) ? self_th_1; + (rMCFIR, bit(0)) ? nvdimm_self_th_1; /** MCFIR[1] * mc internal non recovervable error diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs_actions.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs_actions.rule index 1497cdccb..839a9dc44 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs_actions.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mcs_actions.rule @@ -5,7 +5,7 @@ # # OpenPOWER HostBoot Project # -# Contributors Listed Below - COPYRIGHT 2018 +# Contributors Listed Below - COPYRIGHT 2018,2019 # [+] International Business Machines Corp. # # @@ -24,6 +24,17 @@ # IBM_PROLOG_END_TAG ################################################################################ +# NVDIMM callouts # +################################################################################ + +# Simple callouts that will avoid gard for NVDIMMs +actionclass nvdimm_self_th_1 +{ + try( funccall("CheckForNvdimms"), calloutSelfMed ); + threshold1; +}; + +################################################################################ # Analyze groups ################################################################################ diff --git a/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule index 174009192..2e7e32869 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule @@ -279,4 +279,3 @@ actionclass chip_to_chip calloutSelfMed; threshold1; }; - diff --git a/src/usr/diag/prdf/common/plat/p9/prdfCommonPlugins.C b/src/usr/diag/prdf/common/plat/p9/prdfCommonPlugins.C index ece3fc1a8..77cecfb9f 100644 --- a/src/usr/diag/prdf/common/plat/p9/prdfCommonPlugins.C +++ b/src/usr/diag/prdf/common/plat/p9/prdfCommonPlugins.C @@ -127,6 +127,66 @@ PRDF_PLUGIN_DEFINE_NS(nimbus_proc, CommonPlugins, ClearServiceCallFlag_mnfgInfo PRDF_PLUGIN_DEFINE_NS(cumulus_proc, CommonPlugins, ClearServiceCallFlag_mnfgInfo); PRDF_PLUGIN_DEFINE_NS(axone_proc, CommonPlugins, ClearServiceCallFlag_mnfgInfo); +/** + * @brief Will change the gard state of any NVDIMMs in the callout list to + * NO_GARD. + * @param i_chip The chip. + * @param io_sc The step code data struct. + * @returns SUCCESS + */ +int32_t ClearNvdimmGardState( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #ifdef __HOSTBOOT_MODULE + + // Call the sdc to clear the NVDIMM mru list. + io_sc.service_data->clearNvdimmMruListGard(); + + #endif + + return SUCCESS; +} +PRDF_PLUGIN_DEFINE_NS(nimbus_mcs, CommonPlugins, ClearNvdimmGardState); +PRDF_PLUGIN_DEFINE_NS(nimbus_mca, CommonPlugins, ClearNvdimmGardState); +PRDF_PLUGIN_DEFINE_NS(nimbus_mcbist, CommonPlugins, ClearNvdimmGardState); + +/** + * @brief Will check if any of the DIMMs connected to this chip are NVDIMMs + * and callout self, no gard if there are. + * @param i_chip The chip of the DIMM parent. + * @param io_sc The step code data struct. + * @returns SUCCESS if NVDIMMs found, PRD_SCAN_COMM_REGISTER_ZERO if not. + */ +int32_t CheckForNvdimms( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + int32_t rc = PRD_SCAN_COMM_REGISTER_ZERO; + + #ifdef __HOSTBOOT_MODULE + + TargetHandleList dimmList = getConnected( i_chip->getTrgt(), TYPE_DIMM ); + + for ( auto & dimm : dimmList ) + { + if ( isNVDIMM(dimm) ) + { + // Callout self, no gard + io_sc.service_data->SetCallout(i_chip->getTrgt(), MRU_MED, NO_GARD); + + // No need for other actions, so return SUCCESS + rc = SUCCESS; + break; + } + } + + #endif + + return rc; +} +PRDF_PLUGIN_DEFINE_NS(nimbus_mcs, CommonPlugins, CheckForNvdimms); +PRDF_PLUGIN_DEFINE_NS(nimbus_mca, CommonPlugins, CheckForNvdimms); +PRDF_PLUGIN_DEFINE_NS(nimbus_mcbist, CommonPlugins, CheckForNvdimms); + } // namespace CommonPlugins ends }// namespace PRDF ends diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C index 0cf4bfa7c..9286a31ee 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C @@ -1351,6 +1351,20 @@ int32_t dimmList( TargetHandleList & i_dimmList ) sendPredDeallocRequest( ssAddr, seAddr ); PRDF_TRAC( PRDF_FUNC "Predictive dealloc for start addr: 0x%016llx " "end addr: 0x%016llx", ssAddr, seAddr ); + + // If the DIMM is an NVDIMM, send a message to PHYP that a save/restore + // may work. + if ( isNVDIMM(*it) ) + { + uint32_t l_rc = PlatServices::nvdimmNotifyPhypProtChange( *it, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != l_rc ) + { + PRDF_TRAC( PRDF_FUNC "nvdimmNotifyPhypProtChange(0x%08x) " + "failed.", getHuid(*it) ); + continue; + } + } } return o_rc; diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 5f7efa274..b8367ee4d 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -782,9 +782,9 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // and make the log predictive. io_sc.service_data->SetThresholdMaskId(0); - // Send persistency lost message to PHYP + // Send message to PHYP that save/restore may work l_rc = PlatServices::nvdimmNotifyPhypProtChange( dimm, - NVDIMM::UNPROTECTED_BECAUSE_ERROR ); + NVDIMM::NVDIMM_RISKY_HW_ERROR ); if ( SUCCESS != l_rc ) continue; // Analyze Health Status0 Reg, Health Status1 Reg, diff --git a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C index ef3a143eb..04eff661e 100644 --- a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C +++ b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C @@ -109,9 +109,24 @@ void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt, PRDF_ASSERT( nullptr != i_dimmTrgt ); PRDF_ASSERT( TYPE_DIMM == getTargetType(i_dimmTrgt) ); - // Callout the DIMM. + HWAS::DeconfigEnum deconfigPolicy = HWAS::DELAYED_DECONFIG; + HWAS::GARD_ErrorType gardPolicy = HWAS::GARD_Predictive; + + // If the DIMM is an NVDIMM, change the gard and deconfig options to no + // gard/deconfig and set the appropriate attribute to indicate a + // save/restore may work + if ( isNVDIMM(i_dimmTrgt) ) + { + deconfigPolicy = HWAS::NO_DECONFIG; + gardPolicy = HWAS::GARD_NULL; + + i_dimmTrgt->setAttr<ATTR_NV_STATUS_FLAG>(0x40); + } + + io_errl->addHwCallout( i_dimmTrgt, HWAS::SRCI_PRIORITY_HIGH, - HWAS::DELAYED_DECONFIG, HWAS::GARD_Predictive ); + deconfigPolicy, gardPolicy ); + // Clear the VPD on this DIMM. The DIMM has been garded, but it is possible // the customer will want to ungard the DIMM. Without clearing the VPD, the |