diff options
Diffstat (limited to 'src/import/chips/p9')
13 files changed, 432 insertions, 26 deletions
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/ccs/ccs.C b/src/import/chips/p9/procedures/hwp/memory/lib/ccs/ccs.C index b82951221..669048513 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/ccs/ccs.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/ccs/ccs.C @@ -37,6 +37,7 @@ #include <mss.H> #include <lib/ccs/ccs.H> +#include <lib/fir/check.H> using fapi2::TARGET_TYPE_MCBIST; using fapi2::TARGET_TYPE_MCA; @@ -83,6 +84,7 @@ fapi2::ReturnCode fail_type( const fapi2::Target<TARGET_TYPE_MCBIST>& i_target, const uint64_t& i_type, const fapi2::Target<TARGET_TYPE_MCA>& i_mca ) { + fapi2::ReturnCode l_failing_rc(fapi2::FAPI2_RC_SUCCESS); // Including the MCA_TARGET here and below at CAL_TIMEOUT since these problems likely lie at the MCA level // So we disable the PORT and hopefully that's it // If the problem lies with the MCBIST, it'll just have to loop @@ -112,7 +114,10 @@ fapi2::ReturnCode fail_type( const fapi2::Target<TARGET_TYPE_MCBIST>& i_target, fapi2::MSS_CCS_HUNG().set_MCBIST_TARGET(i_target), "%s CCS appears hung", mss::c_str(i_target)); fapi_try_exit: - return fapi2::current_err; + // Due to the PRD update, we need to check for FIR's + // If any FIR's have lit up, this CCS fail could have been caused by the FIR + // So, let PRD retrigger this step to see if we can resolve the issue + return mss::check::fir_or_pll_fail(i_target, fapi2::current_err); } /// diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/ddr4/mrs_load_ddr4.C b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/ddr4/mrs_load_ddr4.C index 43694ac2d..cec455f6a 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/ddr4/mrs_load_ddr4.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/ddr4/mrs_load_ddr4.C @@ -64,7 +64,7 @@ fapi2::ReturnCode mrs_engine( const fapi2::Target<fapi2::TARGET_TYPE_DIMM>& i_ta const uint64_t i_rank, std::vector< ccs::instruction_t<fapi2::TARGET_TYPE_MCBIST> >& io_inst ) { - FAPI_TRY( mrs_engine(i_target, i_data, i_rank, i_data.iv_delay, io_inst) ); + FAPI_TRY( mrs_engine(i_target, i_data, i_rank, i_data.iv_delay, io_inst) ); fapi_try_exit: return fapi2::current_err; diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.C b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.C index 6404adf0b..f2edb7873 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.C @@ -233,7 +233,6 @@ fapi_try_exit: /// /// @brief Return a vector of rank numbers which represent the primary rank pairs for this port -/// @tparam T the target type /// @param[in] i_target TARGET_TYPE_MCA /// @param[out] o_rps a vector of rank_pairs /// @return FAPI2_RC_SUCCESS iff all is ok @@ -251,7 +250,7 @@ fapi2::ReturnCode primary_ranks( const fapi2::Target<TARGET_TYPE_MCA>& i_target, FAPI_TRY( mss::eff_num_master_ranks_per_dimm(d, l_rank_count[mss::index(d)]) ); } - FAPI_DBG("ranks: %d, %d", l_rank_count[0], l_rank_count[1]); + FAPI_DBG("%s ranks: %d, %d", mss::c_str(i_target), l_rank_count[0], l_rank_count[1]); // Walk through rank pair table and skip empty pairs o_rps.clear(); @@ -264,13 +263,15 @@ fapi2::ReturnCode primary_ranks( const fapi2::Target<TARGET_TYPE_MCA>& i_target, } } + // Returning success in case no DIMM's are configured + return fapi2::FAPI2_RC_SUCCESS; + fapi_try_exit: return fapi2::current_err; } /// /// @brief Return a vector of rank numbers which represent the primary rank pairs for this dimm -/// @tparam T the target type /// @param[in] i_target TARGET_TYPE_DIMM /// @param[out] o_rps a vector of rank_pairs /// @return FAPI2_RC_SUCCESS iff all is ok @@ -344,7 +345,6 @@ fapi_try_exit: /// /// @brief Given a target, get the rank pair assignments, based on DIMMs -/// @tparam T the fapi2::TargetType /// @param[in] i_target the target (MCA or MBA?) /// @param[out] o_registers the regiter settings for the appropriate rank pairs /// @return FAPI2_RC_SUCCESS if and only if ok @@ -382,8 +382,7 @@ fapi_try_exit: /// /// @brief Setup the rank information in the port -/// @tparam T the fapi2::TargetType -/// @param[in] i_target the target (MCA or MBA?) +/// @param[in] i_target the target (MCA) /// @return FAPI2_RC_SUCCESS if and only if ok /// template<> @@ -485,7 +484,6 @@ fapi_try_exit: /// /// @brief Get a vector of configured rank pairs. /// Returns a vector of ordinal values of the configured rank pairs. e.g., for a 2R DIMM, {0, 1} -/// @tparam T the fapi2::TargetType /// @param[in]i_target the target (MCA or MBA?) /// @param[out] o_pairs std::vector of rank pairs configured /// @return FAPI2_RC_SUCCESS if and only if ok @@ -565,7 +563,6 @@ fapi_try_exit: /// /// @brief Get a rank-pair id from a physical rank /// Returns a number representing which rank-pair this rank is a part of -/// @tparam T the fapi2::TargetType /// @param[in] i_target the target (MCA or MBA?) /// @param[in] i_rank the physical rank number /// @param[out] o_pairs the rank pair @@ -573,7 +570,8 @@ fapi_try_exit: /// template<> fapi2::ReturnCode get_pair_from_rank(const fapi2::Target<TARGET_TYPE_MCA>& i_target, - uint64_t i_rank, uint64_t& o_pair) + uint64_t i_rank, + uint64_t& o_pair) { // Sort of brute-force, but no real good other way to do it. Given the // rank-pair configuration we walk the config looking for our rank, and diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H index 34310cc56..e5b3b9041 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H @@ -1060,7 +1060,7 @@ inline fapi2::ReturnCode set_pair_valid( const fapi2::Target<T>& i_target, fapi2::MSS_INVALID_RANK() .set_RANK(i_rank) .set_MCA_TARGET(i_target) - .set_FUNCTION(GET_RANKS_IN_PAIR), + .set_FUNCTION(SET_PAIR_VALID), "%s Invalid rank (%d) in get_ranks_in_pair", mss::c_str(i_target), i_rank); @@ -1231,7 +1231,7 @@ fapi2::ReturnCode get_ranks_in_pair( const fapi2::Target<T>& i_target, // Get data for (uint64_t l_ordinal = 0; l_ordinal < TT::NUM_RANKS_IN_PAIR; ++l_ordinal) { - // Check to make sure rank is vlaid + // Check to make sure rank is valid FAPI_ASSERT( l_ordinal < MAX_RANK_PER_DIMM, fapi2::MSS_INVALID_RANK() .set_RANK(l_ordinal) diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.C b/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.C index 58f1f0d94..7a329aaed 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.C @@ -36,6 +36,8 @@ #include <fapi2.H> #include <p9_mc_scom_addresses.H> #include <p9_mc_scom_addresses_fld.H> +#include <p9_perv_scom_addresses.H> +#include <p9_perv_scom_addresses_fld.H> #include <generic/memory/lib/utils/scom.H> #include <lib/fir/fir.H> @@ -205,6 +207,9 @@ fapi2::ReturnCode during_draminit_training( const fapi2::Target<fapi2::TARGET_TY fapi2::buffer<uint64_t> l_phyfir_data; fapi2::buffer<uint64_t> l_phyfir_masked; + // If we have a FIR that is lit up, we want to see if it could have been caused by a more drastic FIR + bool l_check_fir = false; + FAPI_TRY( mss::getScom(l_mca, MCA_IOM_PHY0_DDRPHY_FIR_REG, l_phyfir_data) ); l_phyfir_masked = l_phyfir_data & l_phyfir_mask; @@ -213,6 +218,8 @@ fapi2::ReturnCode during_draminit_training( const fapi2::Target<fapi2::TARGET_TY // We'll have the error log to know what fir bit triggered and when, so we should be fine clearing here FAPI_TRY( mss::putScom(l_mca, MCA_IOM_PHY0_DDRPHY_FIR_REG_AND, l_phyfir_mask.invert()) ); + // Check the FIR here + l_check_fir = true; FAPI_ASSERT( l_phyfir_masked == 0, fapi2::MSS_DRAMINIT_TRAINING_PORT_FIR() .set_PHY_FIR(l_phyfir_masked) @@ -222,8 +229,203 @@ fapi2::ReturnCode during_draminit_training( const fapi2::Target<fapi2::TARGET_TY mss::c_str(i_target), l_phyfir_masked); fapi_try_exit: + + // Handle any fails seen above accordingly + return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_check_fir); +} + +// Declares FIR registers that are re-used between multiple functions +// Vectors of FIR and mask registers to read through +// As check_fir can be called in multiple places, we don't know what the mask may hold +// In order to tell if a FIR is legit or not, we read the FIR and check it against the mask reg +// Note: using a vector here in case we need to expand +static const std::vector<std::pair<uint64_t, uint64_t>> MCBIST_FIR_REGS = +{ + // MCBIST FIR + {MCBIST_MCBISTFIRQ, MCBIST_MCBISTFIRMASK}, +}; + +static const std::vector<std::pair<uint64_t, uint64_t>> MCA_FIR_REGS = +{ + // MCA ECC FIR + {MCA_FIR, MCA_MASK}, + // MCA CAL FIR + {MCA_MBACALFIRQ, MCA_MBACALFIR_MASK}, + // DDRPHY FIR + {MCA_IOM_PHY0_DDRPHY_FIR_REG, MCA_IOM_PHY0_DDRPHY_FIR_MASK_REG}, +}; + +/// +/// @brief Checks whether any of the PLL unlock values are set +/// @param[in] i_local_fir - the overall FIR register +/// @param[in] i_perv_fir - the pervasive PLL FIR +/// @param[in] i_mc_fir - the memory controller FIR +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +bool pll_unlock( const fapi2::buffer<uint64_t>& i_local_fir, + const fapi2::buffer<uint64_t>& i_perv_fir, + const fapi2::buffer<uint64_t>& i_mc_fir ) +{ + // Note: the following registers did not have the scom fields defined, so we're constexpr'ing them here + constexpr uint64_t PERV_TP_ERROR_START = 25; + constexpr uint64_t PERV_TP_ERROR_LEN = 4; + constexpr uint64_t PERV_MC_ERROR_START = 25; + + // No overall FIR (bit 21) was set, so just exit + if(!i_local_fir.getBit<PERV_1_LOCAL_FIR_IN21>()) + { + FAPI_INF("Did not have the PERV_LOCAL_FIR bit set. No PLL error, exiting"); + return false; + } + + // Now, identify whether a PLL unlock caused the FIR bit to fail + FAPI_INF("PERV_TP_ERROR_REG %s PERV_MC01_ERROR_REG %s", + i_perv_fir.getBit<PERV_TP_ERROR_START, PERV_TP_ERROR_LEN>() ? "PLL lock fail" : "PLL ok", + i_mc_fir.getBit<PERV_MC_ERROR_START>() ? "PLL lock fail" : "PLL ok"); + + // We have a PLL unlock if the MC PLL unlock FIR bit is on or any of the TP PLL unlock bits are on + return (i_mc_fir.getBit<PERV_MC_ERROR_START>()) || (i_perv_fir.getBit<PERV_TP_ERROR_START, PERV_TP_ERROR_LEN>()); +} + +/// +/// @brief Checks whether any PLL FIRs have been set on a target +/// @param[in] i_target - the target on which to operate +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +fapi2::ReturnCode pll_fir( const fapi2::Target<fapi2::TARGET_TYPE_MCBIST>& i_target, bool& o_fir_error ) +{ + // Sets o_fir_error to false to begin with, just in case we have scom issues + o_fir_error = false; + + // Gets the processor target + const auto& l_proc = mss::find_target<fapi2::TARGET_TYPE_PROC_CHIP>(i_target); + + // Gets the register data + fapi2::buffer<uint64_t> l_local_fir; + fapi2::buffer<uint64_t> l_perv_fir; + fapi2::buffer<uint64_t> l_mc_fir; + + FAPI_TRY(mss::getScom(l_proc, PERV_TP_LOCAL_FIR, l_local_fir), "%s failed to get 0x%016llx", mss::c_str(i_target), + PERV_TP_LOCAL_FIR); + FAPI_TRY(mss::getScom(l_proc, PERV_TP_ERROR_REG, l_perv_fir), "%s failed to get 0x%016llx", mss::c_str(i_target), + PERV_TP_ERROR_REG); + FAPI_TRY(mss::getScom(i_target, PERV_MC01_ERROR_REG, l_mc_fir), "%s failed to get 0x%016llx", mss::c_str(i_target), + PERV_MC01_ERROR_REG); + + // Checks the data + o_fir_error = pll_unlock(l_local_fir, l_perv_fir, l_mc_fir); + +fapi_try_exit: return fapi2::current_err; } +/// +/// @brief Checks whether any FIR have lit up +/// @param[in] i_target - the target on which to operate - MCBIST specialization +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +template< > +fapi2::ReturnCode bad_fir_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCBIST>& i_target, bool& o_fir_error ) +{ + // Start by assuming we do not have a FIR + o_fir_error = false; + + // Loop, check the scoms, and check the FIR + // Note: we return out if any FIR is bad + for(const auto& l_fir_reg : MCBIST_FIR_REGS) + { + FAPI_TRY(fir_with_mask(i_target, l_fir_reg, o_fir_error)); + + // Exit if we found a FIR + if(o_fir_error) + { + return fapi2::FAPI2_RC_SUCCESS; + } + } + + // Loop through all MCA's and all MCA FIR's + for(const auto& l_mca : mss::find_targets<fapi2::TARGET_TYPE_MCA>(i_target)) + { + for(const auto& l_fir_reg : MCA_FIR_REGS) + { + FAPI_TRY(fir_with_mask(l_mca, l_fir_reg, o_fir_error)); + + // Exit if we found a FIR + if(o_fir_error) + { + return fapi2::FAPI2_RC_SUCCESS; + } + } + } + + // Lastly, check for PLL unlocks + FAPI_TRY(pll_fir(i_target, o_fir_error)); + +fapi_try_exit: + return fapi2::current_err; +} + + +/// +/// @brief Checks whether any FIR have lit up +/// @param[in] i_target - the target on which to operate - MCA specialization +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +template< > +fapi2::ReturnCode bad_fir_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target, bool& o_fir_error ) +{ + const auto& l_mcbist = mss::find_target<fapi2::TARGET_TYPE_MCBIST>(i_target); + // Start by assuming we do not have a FIR + o_fir_error = false; + + // Loop, check the scoms, and check the FIR + // Note: we return out if any FIR is bad + for(const auto& l_fir_reg : MCBIST_FIR_REGS) + { + FAPI_TRY(fir_with_mask(l_mcbist, l_fir_reg, o_fir_error)); + + // Exit if we found a FIR + if(o_fir_error) + { + return fapi2::FAPI2_RC_SUCCESS; + } + } + + // Loop through all MCA FIR's + for(const auto& l_fir_reg : MCA_FIR_REGS) + { + FAPI_TRY(fir_with_mask(i_target, l_fir_reg, o_fir_error)); + + // Exit if we found a FIR + if(o_fir_error) + { + return fapi2::FAPI2_RC_SUCCESS; + } + } + + // Lastly, check for PLL unlocks + FAPI_TRY(pll_fir(l_mcbist, o_fir_error)); + +fapi_try_exit: + return fapi2::current_err; +} + + +/// +/// @brief Checks whether any FIR have lit up +/// @param[in] i_target - the target on which to operate - DIMM specialization +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +template< > +fapi2::ReturnCode bad_fir_bits( const fapi2::Target<fapi2::TARGET_TYPE_DIMM>& i_target, bool& o_fir_error ) +{ + const auto l_mca = mss::find_target<fapi2::TARGET_TYPE_MCA>(i_target); + return bad_fir_bits(l_mca, o_fir_error); +} + } } diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.H b/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.H index ded638e49..fc82aaed1 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/fir/check.H @@ -27,7 +27,7 @@ /// @file check.H /// @brief Subroutines for checking MSS FIR /// -// *HWP HWP Owner: Brian Silver <bsilver@us.ibm.com> +// *HWP HWP Owner: Andre Marin <aamarin@us.ibm.com> // *HWP HWP Backup: Marc Gollub <gollub@us.ibm.com> // *HWP Team: Memory // *HWP Level: 2 @@ -37,6 +37,7 @@ #define _MSS_CHECK_FIR_H_ #include <fapi2.H> +#include <generic/memory/lib/utils/scom.H> namespace mss { @@ -58,6 +59,7 @@ fapi2::ReturnCode during_phy_reset( const fapi2::Target<T>& i_target ); /// /// @brief Check FIR bits during draminit training +/// @tparam T the fapi2::TargetType which hold the FIR bits /// @param[in] i_target the dimm that was trained /// @note We check for fir errors after training each rank /// to see if there was a problem with the engine @@ -69,6 +71,149 @@ fapi2::ReturnCode during_phy_reset( const fapi2::Target<T>& i_target ); template< fapi2::TargetType T > fapi2::ReturnCode during_draminit_training( const fapi2::Target<T>& i_target ); +/// +/// @brief Checks whether any of the PLL unlock values are set +/// @param[in] i_local_fir - the overall FIR register +/// @param[in] i_perv_fir - the pervasive PLL FIR +/// @param[in] i_mc_fir - the memory controller FIR +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +bool pll_unlock( const fapi2::buffer<uint64_t>& i_local_fir, + const fapi2::buffer<uint64_t>& i_perv_fir, + const fapi2::buffer<uint64_t>& i_mc_fir ); + +/// +/// @brief Checks whether any PLL FIRs have been set on a target +/// @param[in] i_target - the target on which to operate +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +fapi2::ReturnCode pll_fir( const fapi2::Target<fapi2::TARGET_TYPE_MCBIST>& i_target, bool& o_fir_error ); + +/// +/// @brief Checks whether any FIRs have lit up on a target +/// @tparam T the fapi2::TargetType which hold the FIR bits +/// @param[in] i_target - the target on which to operate +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +template< fapi2::TargetType T > +fapi2::ReturnCode bad_fir_bits( const fapi2::Target<T>& i_target, bool& o_fir_error ); + +/// +/// @brief Checks whether the passed in FIRs have any un-masked errors set +/// @tparam T the fapi2::TargetType which hold the FIR bits +/// @param[in] i_target - the target on which to operate +/// @param[in] i_fir_regs - FIR register and mask register +/// @param[out] o_fir_error - true iff a FIR was hit +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +template< fapi2::TargetType T > +inline fapi2::ReturnCode fir_with_mask( const fapi2::Target<T>& i_target, + const std::pair<uint64_t, uint64_t>& i_fir_regs, + bool& o_fir_error ) +{ + // Temporary variables to make the code a bit more readable + const auto FIR_REG = i_fir_regs.first; + const auto FIR_MASK = i_fir_regs.second; + + fapi2::buffer<uint64_t> l_fir; + fapi2::buffer<uint64_t> l_fir_mask; + + // Read the registers + FAPI_TRY(mss::getScom(i_target, FIR_REG, l_fir)); + FAPI_TRY(mss::getScom(i_target, FIR_MASK, l_fir_mask)); + + + // The mask register will need to be inverted as a 0 in the mask register means the FIR is legit + // A bitwise and works the opposite way + l_fir_mask.invert(); + + // If we have any unmasked bit, set that we have a FIR error and exit out with success + // Note: we want to set success here as PRD will find the FIR as "new" and retrigger the procedure this way + o_fir_error = ((l_fir & l_fir_mask) != 0); + + // And print the information for debuggability + FAPI_INF("%s %s on reg 0x%016lx value 0x%016lx and mask 0x%016lx value 0x%016lx", mss::c_str(i_target), + o_fir_error ? "has FIR's set" : "has no FIR's set", FIR_REG, l_fir, FIR_MASK, l_fir_mask.invert()); + +fapi_try_exit: + return fapi2::current_err; +} + +/// +/// @brief Checks whether a FIR or unlocked PLL could be the root cause of another failure +/// @tparam T the fapi2::TargetType which hold the FIR bits +/// @param[in] i_target - the target on which to operate +/// @param[in] i_rc - the return code for the function - cannot be const due to a HB compile issue +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// @note This is a helper function to enable unit testing +/// +template< fapi2::TargetType T > +fapi2::ReturnCode hostboot_fir_or_pll_fail( const fapi2::Target<T>& i_target, fapi2::ReturnCode& i_rc) +{ + // We didn't have an error, so return success + if(i_rc == fapi2::FAPI2_RC_SUCCESS) + { + FAPI_INF("%s has a good return code, returning success", mss::c_str(i_target)); + return fapi2::FAPI2_RC_SUCCESS; + } + + fapi2::ReturnCode l_fircheck_scom_err(fapi2::FAPI2_RC_SUCCESS); + bool l_fir_error = false; + + FAPI_ERR("%s has a bad return code, time to check some firs!", mss::c_str(i_target)); + + l_fircheck_scom_err = bad_fir_bits(i_target, l_fir_error); + + FAPI_ERR("%s took a fail. FIR was %s", mss::c_str(i_target), + l_fir_error ? "set - returning FIR RC" : "unset - returning inputted RC"); + + // If we had a FIR error, log the original error and return success + // PRD will handle the original error + if(l_fir_error) + { + fapi2::log_related_error(i_target, i_rc, fapi2::FAPI2_ERRL_SEV_RECOVERED); + fapi2::current_err = fapi2::FAPI2_RC_SUCCESS; + } + else + { + fapi2::current_err = i_rc; + } + + return fapi2::current_err; +} + +/// +/// @brief Checks whether a FIR or unlocked PLL could be the root cause of another failure, if a check fir boolean is passed in +/// @tparam T the fapi2::TargetType which hold the FIR bits +/// @param[in] i_target - the target on which to operate +/// @param[in] i_rc - the return code for the function - cannot be const due to a HB compile issue +/// @param[in] i_check_fir - true IFF the FIR needs to be checked - defaults to true +/// @return fapi2::ReturnCode FAPI2_RC_SUCCESS iff ok +/// +template< fapi2::TargetType T > +fapi2::ReturnCode fir_or_pll_fail( const fapi2::Target<T>& i_target, fapi2::ReturnCode& i_rc, + const bool i_check_fir = true) +{ +#ifdef __HOSTBOOT_MODULE + + fapi2::ReturnCode l_rc(i_rc); + + // If need be, check the FIR below + if(i_check_fir) + { + // Handle any issues according to PRD FIR scheme, as a FIR could have caused this issue + l_rc = hostboot_fir_or_pll_fail(i_target, l_rc); + } + + return l_rc; + +#else + return i_rc; +#endif +} + } } #endif diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/mc/port.H b/src/import/chips/p9/procedures/hwp/memory/lib/mc/port.H index 876a83909..b6c2ece01 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/mc/port.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/mc/port.H @@ -964,7 +964,7 @@ fapi2::ReturnCode reset_zqcal_config( const fapi2::Target<T>& i_target ) for (const auto r : l_ranks) { - l_phy_zqcal_config.setBit(TT::PER_ZCAL_ENA_RANK + rank::map_rank_ordinal_to_phy(i_target, r)); + FAPI_TRY(l_phy_zqcal_config.setBit(TT::PER_ZCAL_ENA_RANK + rank::map_rank_ordinal_to_phy(i_target, r))); } // Write the ZQCAL periodic config diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/mc/xlate.C b/src/import/chips/p9/procedures/hwp/memory/lib/mc/xlate.C index 17563fc83..bdee48e3c 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/mc/xlate.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/mc/xlate.C @@ -936,7 +936,7 @@ fapi2::ReturnCode xlate_dimm_2R2T8Gbx4( const dimm::kind& i_kind, // We're basically a 2R 4Gbx4 with an extra row. So lets setup like we're one of those, // add row 16 and shift the D bit as needed. - xlate_dimm_2R2T4Gbx4(i_kind, i_offset, i_largest, io_xlate0, io_xlate1, io_xlate2); + FAPI_TRY(xlate_dimm_2R2T4Gbx4(i_kind, i_offset, i_largest, io_xlate0, io_xlate1, io_xlate2)); // Tell the MC which of the row bits are valid, and map the DIMM selector // We're a 17 row DIMM, so ROW16 is valid. @@ -1941,7 +1941,7 @@ fapi2::ReturnCode setup_xlate_map_helper( std::vector<dimm::kind>& io_dimm_kinds set_DIMM_TYPE(k.iv_dimm_type). set_ROWS(k.iv_rows). set_SIZE(k.iv_size), - "no address translation funtion for DIMM %s %dMR (%d total ranks) %dGbx%d (%dGB) %d rows in slot %d", + "no address translation function for DIMM %s %dMR (%d total ranks) %dGbx%d (%dGB) %d rows in slot %d", mss::c_str(k.iv_target), k.iv_master_ranks, k.iv_total_ranks, diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C index 86a8621fa..e1e63fec5 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C @@ -521,6 +521,11 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_DI uint64_t l_rank_pairs = 0; uint8_t cal_abort_on_error = 0; + // This boolean tells the code whether we took a training fail or a scom fail reading the status registers + // It starts as false, given that we need to read out the registers + // When we start checking all of the values of the status registers, it gets set to true + bool l_check_firs = false; + const auto& l_mca = mss::find_target<fapi2::TARGET_TYPE_MCA>(i_target); fapi2::buffer<uint64_t> l_err_data; @@ -550,6 +555,9 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_DI } // Error information from other registers is gathered in the FFDC from the XML + // From here on out, check the FIRs + // Using this boolean to avoid having to check the FIR's after each assert below + l_check_firs = true; // So we can do a few things here. If we're aborting on the first calibration error, // we only expect to have one error bit set. If we ran all the calibrations, we can @@ -692,7 +700,8 @@ fapi_try_exit: (fapi2::current_err == fapi2::FAPI2_RC_SUCCESS ? "success" : "errors reported"), mss::c_str(l_mca)); - return fapi2::current_err; + // Checks the FIR's, if need be + return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_check_firs); } /// diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C index 0e346881a..129c37515 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C @@ -52,6 +52,7 @@ #include <generic/memory/lib/utils/c_str.H> #include <lib/workarounds/dp16_workarounds.H> +#include <lib/fir/check.H> #include <generic/memory/lib/utils/mss_math.H> using fapi2::TARGET_TYPE_MCS; @@ -3260,6 +3261,22 @@ fapi_try_exit: /// fapi2::ReturnCode record_bad_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target ) { + // If we have a FIR set that could have caused our training fail, then skip checking bad bits in FW + // PRD will handle the FIR and retrigger the procedure +#ifdef __HOSTBOOT_MODULE + bool l_fir_error = false; + FAPI_TRY(mss::check::bad_fir_bits(i_target, l_fir_error), "%s took an error while checking FIR's", + mss::c_str(i_target)); + + // Exit if we took a FIR error - PRD will handle bad bits + if(l_fir_error) + { + FAPI_INF("%s has FIR's set, exiting to let PRD handle it", mss::c_str(i_target)); + return fapi2::FAPI2_RC_SUCCESS; + } + +#endif + for( const auto& d : mss::find_targets<fapi2::TARGET_TYPE_DIMM>(i_target) ) { uint8_t l_data[MAX_RANK_PER_DIMM][BAD_DQ_BYTE_COUNT] = {}; @@ -3367,11 +3384,17 @@ fapi2::ReturnCode process_rdvref_cal_errors( const fapi2::Target<fapi2::TARGET_T size_t l_index = 0; std::vector<fapi2::buffer<uint64_t>> l_data; + // Boolean to keep track of if a fail was calibration related, or scom related + bool l_cal_fail = false; + // Suck all the cal error bits out ... FAPI_TRY( mss::scom_suckah(l_mca, TT::RD_VREF_CAL_ERROR_REG, l_data) ); FAPI_INF("%s Processing RD_VREF_CAL_ERROR", mss::c_str(i_target)); + // From here on out, the FIR's are all cal fails + l_cal_fail = true; + for (const auto& v : l_data) { // They should all be 0's. If they're not, we have a problem. @@ -3383,14 +3406,17 @@ fapi2::ReturnCode process_rdvref_cal_errors( const fapi2::Target<fapi2::TARGET_T .set_VALUE(v), "DP16 failed read vref calibration on %s. register 0x%016lx value 0x%016lx", mss::c_str(l_mca), TT::RD_VREF_CAL_ERROR_REG[l_index], v); + ++l_index; } - FAPI_INF("RD_VREF_CAL_ERROR complete"); + FAPI_INF("%s RD_VREF_CAL_ERROR complete", mss::c_str(i_target)); return fapi2::FAPI2_RC_SUCCESS; fapi_try_exit: - return fapi2::current_err; + + // If the FIR's are cal fails, then check to see if FIRs or PLL fails were the cause + return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_cal_fail); } /// @@ -3412,10 +3438,16 @@ fapi2::ReturnCode process_wrvref_cal_errors( const fapi2::Target<fapi2::TARGET_T std::vector<std::pair<fapi2::buffer<uint64_t>, fapi2::buffer<uint64_t>>> l_data; std::vector<std::pair<fapi2::buffer<uint64_t>, fapi2::buffer<uint64_t>>> l_mask; + // Boolean to keep track of if a fail was calibration related, or scom related + bool l_cal_fail = false; + // Suck all the cal error bits out ... FAPI_TRY( mss::scom_suckah(l_mca, TT::WR_VREF_ERROR_REG, l_data) ); FAPI_TRY( mss::scom_suckah(l_mca, TT::WR_VREF_ERROR_MASK_REG, l_mask) ); + // From here on out, the FIR's are all cal fails + l_cal_fail = true; + // Loop through both data and mask { // Note: ideally these would be cbegin/cend, but HB doesn't support constant iterators for vectors @@ -3480,11 +3512,13 @@ fapi2::ReturnCode process_wrvref_cal_errors( const fapi2::Target<fapi2::TARGET_T } } - FAPI_INF("WRVREF_CAL_ERROR complete"); + FAPI_INF("%s WRVREF_CAL_ERROR complete", mss::c_str(i_target)); return fapi2::FAPI2_RC_SUCCESS; fapi_try_exit: - return fapi2::current_err; + + // If the FIR's are cal fails, then check to see if FIR's were the cause + return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_cal_fail); } /// diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H b/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H index cf6a871e5..d6e5c4f53 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H @@ -199,6 +199,7 @@ enum ffdc_functions RD_CTR_WORKAROUND_READ_DATA = 7, OVERRIDE_ODT_WR_CONFIG = 8, RECORD_BAD_BITS_HELPER = 9, + SET_PAIR_VALID = 10, }; // Static consts describing the bits used in the cal_step_enable attribute // These are bit positions. 0 is the left most bit. diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/workarounds/dp16_workarounds.C b/src/import/chips/p9/procedures/hwp/memory/lib/workarounds/dp16_workarounds.C index 95dbe26f8..a5fa507bb 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/workarounds/dp16_workarounds.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/workarounds/dp16_workarounds.C @@ -47,6 +47,7 @@ #include <lib/phy/phy_cntrl.H> #include <lib/dimm/rank.H> #include <lib/utils/bit_count.H> +#include <lib/fir/check.H> namespace mss { @@ -547,10 +548,13 @@ fapi2::ReturnCode dqs_align_workaround(const fapi2::Target<fapi2::TARGET_TYPE_MC // If we can't, exit with success if (! chip_ec_feature_mss_dqs_workaround(i_target) ) { - FAPI_DBG("Skipping DQS workaround because of ec feature attribute"); + FAPI_DBG("%s Skipping DQS workaround because of ec feature attribute", mss::c_str(i_target)); return fapi2::FAPI2_RC_SUCCESS; } + // Boolean to keep track of if a fail was calibration related, or scom related + bool l_cal_fail = false; + FAPI_TRY( eff_dram_width( i_target, l_dram_width) ); l_is_x8 = ((l_dram_width[0] == fapi2::ENUM_ATTR_EFF_DRAM_WIDTH_X8) || @@ -603,6 +607,8 @@ fapi2::ReturnCode dqs_align_workaround(const fapi2::Target<fapi2::TARGET_TYPE_MC // Clear all disable bits - this will cause calibration to re-run everything that failed, including WR LVL fails FAPI_TRY(mss::workarounds::dp16::dqs_align::reset_disables(i_target, i_rp)); + // Next, we're checking for CAL fails, so make sure to check the FIR's below + l_cal_fail = true; // If the loop timed out, bomb out // If this is firmware, they'll log it as info and run to memdiags @@ -617,11 +623,16 @@ fapi2::ReturnCode dqs_align_workaround(const fapi2::Target<fapi2::TARGET_TYPE_MC "%s i_rp %lu DQS workaround failed! 10 loops reached without everything passing", mss::c_str(i_target), i_rp); + // Below, the errors are scom related, no need to check the FIR's + l_cal_fail = false; + // Now plop the delays back in to the registers FAPI_TRY(mss::workarounds::dp16::dqs_align::set_passing_values( i_target, i_rp, l_passing_values)); fapi_try_exit: - return fapi2::current_err; + + // If the FIR's are cal fails, then check to see if FIR's or PLL's could be the cause + return mss::check::fir_or_pll_fail(i_target, fapi2::current_err, l_cal_fail); } /// @@ -777,7 +788,8 @@ fapi_try_exit: /// @param[in,out] io_passing_values - the passing values, a map from the DQS number to the value /// @return fapi2::ReturnCode FAPI2_RC_SUCCESS if ok /// -fapi2::ReturnCode record_passing_values( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target, const uint64_t i_rp, +fapi2::ReturnCode record_passing_values( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target, + const uint64_t i_rp, std::map<uint64_t, uint64_t>& io_passing_values) { // Traits declaration diff --git a/src/import/chips/p9/procedures/hwp/memory/p9_mss_draminit_training.C b/src/import/chips/p9/procedures/hwp/memory/p9_mss_draminit_training.C index 533a53905..b4de8bd90 100644 --- a/src/import/chips/p9/procedures/hwp/memory/p9_mss_draminit_training.C +++ b/src/import/chips/p9/procedures/hwp/memory/p9_mss_draminit_training.C @@ -68,7 +68,7 @@ extern "C" std::vector<fapi2::ReturnCode> l_fails; - FAPI_INF("Start draminit training"); + FAPI_INF("%s Start draminit training", mss::c_str(i_target)); // If there are no DIMM we don't need to bother. In fact, we can't as we didn't setup // attributes for the PHY, etc. |