diff options
author | Jacob Harvey <jlharvey@us.ibm.com> | 2017-08-01 16:11:59 -0500 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2017-08-19 22:12:27 -0400 |
commit | 11108f43887202522217b92d448880df0fef05e5 (patch) | |
tree | e48dcecf359d3ba9fa663fe7604545f8ed1a0d61 /src/import/chips/p9/procedures/hwp/memory/lib | |
parent | bb97f80565ac3074de838e2773d1d08e91040775 (diff) | |
download | talos-hostboot-11108f43887202522217b92d448880df0fef05e5.tar.gz talos-hostboot-11108f43887202522217b92d448880df0fef05e5.zip |
Fix draminit_training error logging and unit test
Change-Id: Ie0e00595a9e4a50e8b5aa0a2017e6c6ed2e548c5
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44193
Tested-by: Hostboot CI <hostboot-ci+hostboot@us.ibm.com>
Dev-Ready: JACOB L. HARVEY <jlharvey@us.ibm.com>
Reviewed-by: ANDRE A. MARIN <aamarin@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: MATTHEW I. HICKMAN <matthew.hickman@ibm.com>
Reviewed-by: Jennifer A. Stofer <stofer@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44198
Reviewed-by: Hostboot Team <hostboot@us.ibm.com>
Reviewed-by: JACOB L. HARVEY <jlharvey@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/import/chips/p9/procedures/hwp/memory/lib')
5 files changed, 194 insertions, 33 deletions
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H index 9aa3f8e74..34310cc56 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H @@ -44,6 +44,7 @@ #include <lib/utils/num.H> #include <lib/utils/count_dimm.H> #include <lib/shared/mss_const.H> +#include <lib/phy/phy_cntrl.H> namespace mss { @@ -428,6 +429,39 @@ inline uint64_t map_rank_ordinal_from_phy( const fapi2::Target<fapi2::TARGET_TYP } /// +/// @brief Maps a ordinal rank pair to INIT_CAL_ERROR register encoding +/// @param[in] i_target the fapi2 target of the MCA +/// @param[in] i_rp rank pair 0-3 +/// @param[out] o_encoding +/// @return the mapped value +// i_rank passed by value to save a local +/// +inline fapi2::ReturnCode map_rp_primary_to_init_cal( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target, + const uint64_t i_rp, + uint64_t& o_encoding) +{ + typedef pcTraits<fapi2::TARGET_TYPE_MCA> TT; + fapi2::buffer<uint64_t> l_temp; + + FAPI_ASSERT( i_rp < MAX_RANK_PAIRS, + fapi2::MSS_INVALID_RANK_PAIR() + .set_RANK_PAIR(i_rp) + .set_MCA_TARGET(i_target) + .set_FUNCTION(MAP_RP_PRIMARY_TO_INIT_CAL), + "Error in map_rp_primary_init_cal for %s on rp %d", + mss::c_str(i_target), + i_rp); + + FAPI_INF("Setting bit for rp %d", i_rp); + //INIT_CAL_ERROR == 60, the bit position offset for the register + FAPI_TRY( l_temp.setBit(i_rp + TT::INIT_CAL_ERROR_RANK_PAIR) ); + + o_encoding = l_temp; + return fapi2::FAPI2_RC_SUCCESS; +fapi_try_exit: + return fapi2::current_err; +} +/// /// @brief Convert rank indexes in a rank_pair reg value from MC perspective to PHY perspective /// @tparam T fapi2 Target Type the type of the MC target /// @param[in] i_target the fapi2 target of the MCA @@ -1327,8 +1361,8 @@ inline fapi2::ReturnCode get_ranks_in_pair( const fapi2::Target<T>& i_target, FAPI_ASSERT( false, fapi2::MSS_INVALID_RANK_PAIR() .set_RANK_PAIR(i_rp) - .set_FUNCTION(GET_RANKS_IN_PAIR) - .set_MCA_TARGET(i_target), + .set_MCA_TARGET(i_target) + .set_FUNCTION(GET_RANKS_IN_PAIR), "%s Invalid number of rankpairs entered. num: %lu max: %lu", mss::c_str(i_target), i_rp, diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C index 3d19b1764..5d8aaf71f 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C @@ -531,7 +531,7 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_MC FAPI_INF("initial cal err: 0x%016llx, rp: 0x%016llx (0x%016llx)", l_errors, l_rank_pairs, uint64_t(l_err_data)); // Check for RDVREF calibration errors. This fapi_try catches scom errors. Any errors from the - // RDVREF itself are assumed errors only after read centering + // RDVREF itself are assumed errors only after read centering (caught in general error above) FAPI_TRY( dp16::process_rdvref_cal_errors(i_target) ); // WR VREF error processing acts the same as the RD VREF processing @@ -539,7 +539,7 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_MC if ((l_rank_pairs == 0) || (l_errors == 0)) { - FAPI_INF("Initial cal - no errors reported"); + FAPI_INF("Initial cal - no errors reported %s", mss::c_str(i_target)); return fapi2::FAPI2_RC_SUCCESS; } @@ -550,38 +550,16 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_MC // fails ...) Note first_bit_set gives a bit position (0 being left most.) So, the rank // in question is the bit postion minus the position of the 0th rank in the register. // (the rank bits are bits 60:63, for example, so rank 0 is in position 60) - FAPI_TRY( mss::rank_pair_primary_to_dimm(i_target, mss::first_bit_set(l_rank_pairs) - TT::INIT_CAL_ERROR_RANK_PAIR, + FAPI_TRY( mss::rank_pair_primary_to_dimm(i_target, + mss::first_bit_set(l_rank_pairs) - TT::INIT_CAL_ERROR_RANK_PAIR, l_failed_dimm) ); - FAPI_ERR("initial cal failed for %s", mss::c_str(l_failed_dimm)); - // If we aborted on error, the disable bits aren't complete so we can't make a determination about - // whether the port is repairable. - if (cal_abort_on_error == fapi2::ENUM_ATTR_MSS_CAL_ABORT_ON_ERROR_YES) - { - FAPI_INF("can't process disable bits as we aborted on error - disable bits might be incomplete"); - } - else - { - // Process the disable bits. The PHY will disable bits that it finds don't work for whatever reason. - // However, we can handle a number of bad bits without resorting to killing the DIMM. Do the bad - // bit processing here, and if we can go on and ignore these bad bits, we'll see a succcess here. - if (dp16::process_bad_bits(i_target, l_failed_dimm, l_rank_pairs) == fapi2::FAPI2_RC_SUCCESS) - { - FAPI_INF("Initial cal - errors reported, but only 1 nibble + 1 bit marked %s", mss::c_str(l_failed_dimm)); - - // If we're on a pre-DD1.02 Nimbus, Anuwat requests we 'pass' training with 1 nibble + 1 bit. - if (mss::chip_ec_feature_mss_training_bad_bits(i_target)) - { - return fapi2::FAPI2_RC_SUCCESS; - } - } - } // So we can do a few things here. If we're aborting on the first calibration error, // we only expect to have one error bit set. If we ran all the calibrations, we can // either have one bit set or more than one bit set. If we have more than one bit set - // the result is the same - a broken DIMM which will be deconfigured. So put enough - // information in the FFDC for the lab but we don't need one error for every cal fail. + // the result is the same - a broken DIMM. + // So put enough information in the FFDC for the lab but we don't need one error for every cal fail. FAPI_ASSERT(mss::bit_count(l_errors) == 1, fapi2::MSS_DRAMINIT_TRAINING_MULTIPLE_ERRORS() .set_FAILED_STEPS(uint64_t(l_err_data)) @@ -718,6 +696,128 @@ fapi_try_exit: } /// +/// @brief Finds the calibration errors from draminit training +/// @param[in] i_target the port target +/// @param[in] i_rp the rank pair we are calibrating +/// @param[in] i_cal_abort_on_error denoting if we aborted on first fail +/// @param[in,out] io_fails a vector storing all of our cal fails +/// @return FAPI2_RC_SUCCESS iff all of the scoms and functionality were good +/// +template<> +fapi2::ReturnCode find_and_log_cal_errors(const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target, + const uint64_t i_rp, + const uint64_t i_cal_abort_on_error, + std::vector<fapi2::ReturnCode>& io_fails) +{ + fapi2::ReturnCode l_rc (fapi2::FAPI2_RC_SUCCESS); + fapi2::Target<TARGET_TYPE_DIMM> l_dimm; + + // Let's get the DIMM since we train per rank pair (primary rank pair) + FAPI_TRY( mss::rank_pair_primary_to_dimm(i_target, + i_rp, + l_dimm), + "Failed getting the DIMM for %s", mss::c_str(i_target) ); + + // Let's keep track of the error. + // We don't want to error out here because we want to run on the other ports/ranks + // We'll add this to io_fails if we fail too many DQ's + l_rc = mss::process_initial_cal_errors(i_target); + + if (l_rc != fapi2::FAPI2_RC_SUCCESS) + { + // If we're aborting on error we jump to the end and error out. + // We don't care about other ports or ranks because the hardware stopped when it saw the error + if (i_cal_abort_on_error) + { + FAPI_TRY( l_rc, "Training failed for %s. Set to abort on error, so cal didn't finish", + mss::c_str(l_dimm) ); + } + + // Process the disable bits. The PHY will disable bits that it finds don't work for whatever reason. + // However, we can handle a number of bad bits without resorting to killing the DIMM. Do the bad + // bit processing here, and if we can go on and ignore these bad bits, we'll see a succcess here. + // Needs to be bit representation for process_bad_bits (it can handle fails for multiple rp for 1 dimm) + uint64_t l_encoding = 0; + FAPI_TRY( mss::rank::map_rp_primary_to_init_cal(i_target, i_rp, l_encoding) ); + + if (dp16::process_bad_bits(i_target, l_dimm, l_encoding) == fapi2::FAPI2_RC_SUCCESS) + { + // If we're on a Nimbus, lab team requests we 'pass' training with 1 nibble + 1 bit + if (mss::chip_ec_feature_mss_training_bad_bits(i_target)) + { + FAPI_INF("p9_mss_draminit_training: errors reported, but 1 nibble + 1 bit or less was marked.%s", + mss::c_str(l_dimm)); + + // Let's log the error as RECOVERED (logs should be hidden and no deconfigs take place) - JLH + fapi2::logError(l_rc, fapi2::FAPI2_ERRL_SEV_RECOVERED); + // Set l_rc to success so we will still record the bad bits into the attribute + l_rc = fapi2::FAPI2_RC_SUCCESS; + } + } + + FAPI_ERR("Seeing calibration errors for p9_mss_draminit_training %s: Keep running? %s", + mss::c_str(l_dimm), + (l_rc == fapi2::FAPI2_RC_SUCCESS) ? "Yes" : "no"); + + // Let's update the attribute with the failing DQ bits since we had a training error + // The only fail we get here is a scom error, so we should error out + // We only want to update the attribute for hostboot runs though + // Updating the attribute updates the DIMM's VPD and actually disabled those DQ bits for good + // Commenting out until PRD has the backside implementation complete +#ifdef __HOSTBOOT_MODULE + // TODO RTC:178400 Come back and use the ATTR_BAD_BITS accessor functions from PRD when available + //FAPI_TRY( mss::dp16::record_bad_bits(i_target) ); +#endif + + // Let's add the error to our vector for later processing (if it didn't affect too many DQ bits) + if (l_rc != fapi2::FAPI2_RC_SUCCESS) + { + io_fails.push_back(l_rc); + } + } + + // Calling process_bad_bits above sets fapi2::current_err; Need to explicitly return SUCCESS here + return fapi2::FAPI2_RC_SUCCESS; +fapi_try_exit: + return fapi2::current_err; +} + +/// +/// @brief Handle draminit_training cal fails +/// @param[in] i_fails vector holding the return codes for calibration failures +/// @note We handle errors differently depending on if we're HB or cronus +/// If we're cronus, we want to error out. +/// If we're hostboot, we want to log the error as hidden and let PRD choose to deconfigure +/// +fapi2::ReturnCode draminit_training_error_handler( const std::vector<fapi2::ReturnCode>& i_fails) +{ +// If we're in hostboot, we want to log all of the errors as hidden +// and let PRD deconfigure based off of ATTR_BAD_DQ_BITMAP +#ifdef __HOSTBOOT_MODULE + for (auto l_iter : i_fails) + { + fapi2::logError(l_iter, fapi2::FAPI2_ERRL_SEV_RECOVERED); + } + + return fapi2::current_err; + +// If we're cronus, let's bomb out +#else + + if (i_fails.size() != 0) + { + // We can't log errors in cronus, so let's take the first one and end the IPL + FAPI_ERR("Failed p9_mss_draminit_training"); + return i_fails[0]; + } + +#endif + // Need this for compiler/ if i_fails is empty + return fapi2::FAPI2_RC_SUCCESS; +} + + +/// /// @brief Sets up the IO impedances (ADR DRV's and DP DRV's/RCV's) - MCA specialization /// @tparam T the fapi2::TargetType /// @param[in] i_target the target (MCA/MCBIST or MBA?) diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H index 1c27efccc..39db82147 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H @@ -426,6 +426,30 @@ fapi2::ReturnCode rank_pair_primary_to_dimm(const fapi2::Target<T>& i_target, co fapi2::Target<fapi2::TARGET_TYPE_DIMM>& o_dimm); /// +/// @brief Handle draminit_training cal fails +/// @param[in] i_fails vector holding the return codes for calibration failures +/// @note We handle errors differently depending on if we're HB or cronus +/// If we're cronus, we want to error out. +/// If we're hostboot, we want to log the error as hidden and let PRD choose to deconfigure +/// +fapi2::ReturnCode draminit_training_error_handler ( const std::vector<fapi2::ReturnCode>& i_fails); + +/// +/// @brief Finds the calibration errors from draminit training +/// @tparam T fapi2::TargetType of the port target +/// @param[in] i_target the port target +/// @param[in] i_rp the rank pair we are calibrating +/// @param[in] i_cal_abort_on_error denoting if we aborted on first fail +/// @param[in,out] io_fails a vector storing all of our cal fails +/// @return FAPI2_RC_SUCCESS iff all of the scoms and functionality were good +/// +template<fapi2::TargetType T> +fapi2::ReturnCode find_and_log_cal_errors(const fapi2::Target<T>& i_target, + const uint64_t i_rp, + const uint64_t i_cal_abort_on_error, + std::vector<fapi2::ReturnCode>& io_fails); + +/// /// @brief Sets up the IO impedances (ADR DRV's and DP DRV's/RCV's) /// @tparam T the fapi2::TargetType /// @param[in] i_target the target (MCA/MCBIST or MBA?) diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C index d9a7fb8e4..547fd3d54 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C +++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C @@ -2662,12 +2662,12 @@ fapi_try_exit: fapi2::ReturnCode record_bad_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target ) { const auto& l_mcs = mss::find_target<TARGET_TYPE_MCS>(i_target); - uint8_t l_value[PORTS_PER_MCS][MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][10] = { 0 }; + uint8_t l_value[PORTS_PER_MCS][MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][BAD_DQ_BYTE_COUNT] = { 0 }; // Process the bad bits into an array. We copy these in to their own array // as it allows the compiler to check indexes where a passed pointer wouldn't // otherwise do. - uint8_t l_data[MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][10] = { 0 }; + uint8_t l_data[MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][BAD_DQ_BYTE_COUNT] = { 0 }; FAPI_TRY( mss::dp16::record_bad_bits_helper(i_target, l_data) ); // Read the attribute @@ -2732,7 +2732,7 @@ fapi2::ReturnCode record_bad_bits_helper( const fapi2::Target<fapi2::TARGET_TYPE l_bad_bits[l_byte_index] = (v.first & 0xFF00) >> 8; l_bad_bits[l_byte_index + 1] = v.first & 0x00FF; - FAPI_DBG("writing %s value 0x%0lX to 0x%X, 0x%X from 0x%016lx", + FAPI_DBG("%s Recording ATTR_BAD_DQ_BITMAP value 0x%0lX to 0x%X, 0x%X from 0x%016lx", mss::c_str(i_target), v.first, l_bad_bits[l_byte_index], diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H b/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H index 2214abb40..82fe0bc61 100644 --- a/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H +++ b/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H @@ -145,6 +145,9 @@ enum ffdc_function_codes // MSS_INVALID_INDEX_PASSED SYMBOL_COUNT_READ = 50, SYMBOL_COUNT_WRITE = 51, + + // Used in rank.H + MAP_RP_PRIMARY_TO_INIT_CAL = 60, }; enum states |