Fix draminit_training error logging and unit test

Change-Id: Ie0e00595a9e4a50e8b5aa0a2017e6c6ed2e548c5 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44193 Tested-by: Hostboot CI <hostboot-ci+hostboot@us.ibm.com> Dev-Ready: JACOB L. HARVEY <jlharvey@us.ibm.com> Reviewed-by: ANDRE A. MARIN <aamarin@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: MATTHEW I. HICKMAN <matthew.hickman@ibm.com> Reviewed-by: Jennifer A. Stofer <stofer@us.ibm.com> Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44198 Reviewed-by: Hostboot Team <hostboot@us.ibm.com> Reviewed-by: JACOB L. HARVEY <jlharvey@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
author: Jacob Harvey <jlharvey@us.ibm.com> 2017-08-01 16:11:59 -0500
committer: Daniel M. Crowell <dcrowell@us.ibm.com> 2017-08-19 22:12:27 -0400
commit: 11108f43887202522217b92d448880df0fef05e5 (patch)
tree: e48dcecf359d3ba9fa663fe7604545f8ed1a0d61 /src/import/chips/p9/procedures/hwp/memory/lib
parent: bb97f80565ac3074de838e2773d1d08e91040775 (diff)
download: talos-hostboot-11108f43887202522217b92d448880df0fef05e5.tar.gz
talos-hostboot-11108f43887202522217b92d448880df0fef05e5.zip
5 files changed, 194 insertions, 33 deletions
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H
index 9aa3f8e74..34310cc56 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/dimm/rank.H
@@ -44,6 +44,7 @@
 #include <lib/utils/num.H>
 #include <lib/utils/count_dimm.H>
 #include <lib/shared/mss_const.H>
+#include <lib/phy/phy_cntrl.H>
 
 namespace mss
 {
@@ -428,6 +429,39 @@ inline uint64_t map_rank_ordinal_from_phy( const fapi2::Target<fapi2::TARGET_TYP
 }
 
 ///
+/// @brief Maps an ordinal rank pair to its INIT_CAL_ERROR register encoding
+/// @param[in] i_target the fapi2 target of the MCA
+/// @param[in] i_rp the rank pair to encode, must be less than MAX_RANK_PAIRS
+/// @param[out] o_encoding the value with the bit at i_rp + TT::INIT_CAL_ERROR_RANK_PAIR set
+/// @return FAPI2_RC_SUCCESS iff i_rp is a valid rank pair and its bit could be set
+/// @note o_encoding is only written on success
+///
+inline fapi2::ReturnCode map_rp_primary_to_init_cal( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target,
+        const uint64_t i_rp,
+        uint64_t& o_encoding)
+{
+    typedef pcTraits<fapi2::TARGET_TYPE_MCA> TT;
+    fapi2::buffer<uint64_t> l_temp;
+
+    FAPI_ASSERT( i_rp < MAX_RANK_PAIRS,
+                 fapi2::MSS_INVALID_RANK_PAIR()
+                 .set_RANK_PAIR(i_rp)
+                 .set_MCA_TARGET(i_target)
+                 .set_FUNCTION(MAP_RP_PRIMARY_TO_INIT_CAL),
+                 "Error in map_rp_primary_to_init_cal for %s on rp %lu",
+                 mss::c_str(i_target),
+                 i_rp);
+
+    FAPI_INF("Setting bit for rp %lu", i_rp);
+    // TT::INIT_CAL_ERROR_RANK_PAIR is the bit position offset of the rank pair field in the register
+    FAPI_TRY( l_temp.setBit(i_rp + TT::INIT_CAL_ERROR_RANK_PAIR) );
+
+    o_encoding = l_temp;
+    return fapi2::FAPI2_RC_SUCCESS;
+fapi_try_exit:
+    return fapi2::current_err;
+}
+///
 /// @brief Convert rank indexes in a rank_pair reg value from MC perspective to PHY perspective
 /// @tparam T fapi2 Target Type the type of the MC target
 /// @param[in] i_target the fapi2 target of the MCA
@@ -1327,8 +1361,8 @@ inline fapi2::ReturnCode get_ranks_in_pair( const fapi2::Target<T>& i_target,
             FAPI_ASSERT( false,
                          fapi2::MSS_INVALID_RANK_PAIR()
                          .set_RANK_PAIR(i_rp)
-                         .set_FUNCTION(GET_RANKS_IN_PAIR)
-                         .set_MCA_TARGET(i_target),
+                         .set_MCA_TARGET(i_target)
+                         .set_FUNCTION(GET_RANKS_IN_PAIR),
                          "%s Invalid number of rankpairs entered. num: %lu max: %lu",
                          mss::c_str(i_target),
                          i_rp,
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C
index 3d19b1764..5d8aaf71f 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C
@@ -531,7 +531,7 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_MC
     FAPI_INF("initial cal err: 0x%016llx, rp: 0x%016llx (0x%016llx)", l_errors, l_rank_pairs, uint64_t(l_err_data));
 
     // Check for RDVREF calibration errors. This fapi_try catches scom errors. Any errors from the
-    // RDVREF itself are assumed errors only after read centering
+    // RDVREF itself are assumed errors only after read centering (caught in general error above)
     FAPI_TRY( dp16::process_rdvref_cal_errors(i_target) );
 
     // WR VREF error processing acts the same as the RD VREF processing
@@ -539,7 +539,7 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_MC
 
     if ((l_rank_pairs == 0) || (l_errors == 0))
     {
-        FAPI_INF("Initial cal - no errors reported");
+        FAPI_INF("Initial cal - no errors reported %s", mss::c_str(i_target));
         return fapi2::FAPI2_RC_SUCCESS;
     }
 
@@ -550,38 +550,16 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_MC
     // fails ...) Note first_bit_set gives a bit position (0 being left most.) So, the rank
     // in question is the bit postion minus the position of the 0th rank in the register.
     // (the rank bits are bits 60:63, for example, so rank 0 is in position 60)
-    FAPI_TRY( mss::rank_pair_primary_to_dimm(i_target, mss::first_bit_set(l_rank_pairs) - TT::INIT_CAL_ERROR_RANK_PAIR,
+    FAPI_TRY( mss::rank_pair_primary_to_dimm(i_target,
+              mss::first_bit_set(l_rank_pairs) - TT::INIT_CAL_ERROR_RANK_PAIR,
               l_failed_dimm) );
-    FAPI_ERR("initial cal failed for %s", mss::c_str(l_failed_dimm));
 
-    // If we aborted on error, the disable bits aren't complete so we can't make a determination about
-    // whether the port is repairable.
-    if (cal_abort_on_error == fapi2::ENUM_ATTR_MSS_CAL_ABORT_ON_ERROR_YES)
-    {
-        FAPI_INF("can't process disable bits as we aborted on error - disable bits might be incomplete");
-    }
-    else
-    {
-        // Process the disable bits. The PHY will disable bits that it finds don't work for whatever reason.
-        // However, we can handle a number of bad bits without resorting to killing the DIMM. Do the bad
-        // bit processing here, and if we can go on and ignore these bad bits, we'll see a succcess here.
-        if (dp16::process_bad_bits(i_target, l_failed_dimm, l_rank_pairs) == fapi2::FAPI2_RC_SUCCESS)
-        {
-            FAPI_INF("Initial cal - errors reported, but only 1 nibble + 1 bit marked %s", mss::c_str(l_failed_dimm));
-
-            // If we're on a pre-DD1.02 Nimbus, Anuwat requests we 'pass' training with 1 nibble + 1 bit.
-            if (mss::chip_ec_feature_mss_training_bad_bits(i_target))
-            {
-                return fapi2::FAPI2_RC_SUCCESS;
-            }
-        }
-    }
 
     // So we can do a few things here. If we're aborting on the first calibration error,
     // we only expect to have one error bit set. If we ran all the calibrations, we can
     // either have one bit set or more than one bit set. If we have more than one bit set
-    // the result is the same - a broken DIMM which will be deconfigured. So put enough
-    // information in the FFDC for the lab but we don't need one error for every cal fail.
+    // the result is the same - a broken DIMM.
+    // So put enough information in the FFDC for the lab but we don't need one error for every cal fail.
     FAPI_ASSERT(mss::bit_count(l_errors) == 1,
                 fapi2::MSS_DRAMINIT_TRAINING_MULTIPLE_ERRORS()
                 .set_FAILED_STEPS(uint64_t(l_err_data))
@@ -718,6 +696,128 @@ fapi_try_exit:
 }
 
 ///
+/// @brief Finds the calibration errors from draminit training
+/// @param[in] i_target the port target
+/// @param[in] i_rp the rank pair we are calibrating
+/// @param[in] i_cal_abort_on_error non-zero if calibration was set to abort on the first fail
+/// @param[in,out] io_fails a vector collecting the cal fails which were not downgraded to RECOVERED
+/// @return FAPI2_RC_SUCCESS iff the scoms and functionality were good (cal fails only return when aborting)
+///
+template<>
+fapi2::ReturnCode find_and_log_cal_errors(const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target,
+        const uint64_t i_rp,
+        const uint64_t i_cal_abort_on_error,
+        std::vector<fapi2::ReturnCode>& io_fails)
+{
+    fapi2::ReturnCode l_rc (fapi2::FAPI2_RC_SUCCESS);
+    fapi2::Target<TARGET_TYPE_DIMM> l_dimm;
+
+    // Let's get the DIMM since we train per rank pair (primary rank pair)
+    FAPI_TRY( mss::rank_pair_primary_to_dimm(i_target,
+              i_rp,
+              l_dimm),
+              "Failed getting the DIMM for %s", mss::c_str(i_target) );
+
+    // Keep the cal result in a local instead of wrapping the call in FAPI_TRY.
+    // We don't want to error out here because we want to run on the other ports/ranks
+    // We'll add this to io_fails if we fail too many DQ's
+    l_rc = mss::process_initial_cal_errors(i_target);
+
+    if (l_rc != fapi2::FAPI2_RC_SUCCESS)
+    {
+        // If we're aborting on error we jump to the end and error out.
+        // We don't care about other ports or ranks because the hardware stopped when it saw the error
+        if (i_cal_abort_on_error)
+        {
+            FAPI_TRY( l_rc, "Training failed for %s. Set to abort on error, so cal didn't finish",
+                      mss::c_str(l_dimm) );
+        }
+
+        // Process the disable bits. The PHY will disable bits that it finds don't work for whatever reason.
+        // However, we can handle a number of bad bits without resorting to killing the DIMM. Do the bad
+        // bit processing here, and if we can go on and ignore these bad bits, we'll see a success here.
+        // Needs to be bit representation for process_bad_bits (it can handle fails for multiple rp for 1 dimm)
+        uint64_t l_encoding = 0;
+        FAPI_TRY( mss::rank::map_rp_primary_to_init_cal(i_target, i_rp, l_encoding) );
+
+        if (dp16::process_bad_bits(i_target, l_dimm, l_encoding) == fapi2::FAPI2_RC_SUCCESS)
+        {
+            // If we're on a Nimbus, lab team requests we 'pass' training with 1 nibble + 1 bit
+            if (mss::chip_ec_feature_mss_training_bad_bits(i_target))
+            {
+                FAPI_INF("p9_mss_draminit_training: errors reported, but 1 nibble + 1 bit or less was marked.%s",
+                         mss::c_str(l_dimm));
+
+                // Let's log the error as RECOVERED (logs should be hidden and no deconfigs take place) - JLH
+                fapi2::logError(l_rc, fapi2::FAPI2_ERRL_SEV_RECOVERED);
+                // Set l_rc to success so this fail is not pushed onto io_fails below
+                l_rc = fapi2::FAPI2_RC_SUCCESS;
+            }
+        }
+
+        FAPI_ERR("Seeing calibration errors for p9_mss_draminit_training %s: Keep running? %s",
+                 mss::c_str(l_dimm),
+                 (l_rc == fapi2::FAPI2_RC_SUCCESS) ? "Yes" : "no");
+
+        // Let's update the attribute with the failing DQ bits since we had a training error
+        // The only fail we get here is a scom error, so we should error out
+        // We only want to update the attribute for hostboot runs though
+        // Updating the attribute updates the DIMM's VPD and actually disables those DQ bits for good
+        // Commented out until PRD has the backside implementation complete
+#ifdef __HOSTBOOT_MODULE
+        // TODO RTC:178400 Come back and use the ATTR_BAD_BITS accessor functions from PRD when available
+        //FAPI_TRY( mss::dp16::record_bad_bits(i_target) );
+#endif
+
+        // Let's add the error to our vector for later processing (it was not downgraded to RECOVERED above)
+        if (l_rc != fapi2::FAPI2_RC_SUCCESS)
+        {
+            io_fails.push_back(l_rc);
+        }
+    }
+
+    // Calling process_bad_bits above sets fapi2::current_err; Need to explicitly return SUCCESS here
+    return fapi2::FAPI2_RC_SUCCESS;
+fapi_try_exit:
+    return fapi2::current_err;
+}
+
+///
+/// @brief Handle draminit_training cal fails
+/// @param[in] i_fails vector holding the return codes for calibration failures
+/// @return FAPI2_RC_SUCCESS in hostboot or if i_fails is empty, otherwise (cronus) the first entry of i_fails
+/// @note We handle errors differently depending on if we're HB or cronus
+/// If we're cronus, we want to error out.
+/// If we're hostboot, we want to log the error as hidden and let PRD choose to deconfigure
+///
+fapi2::ReturnCode draminit_training_error_handler( const std::vector<fapi2::ReturnCode>& i_fails)
+{
+// If we're in hostboot, we want to log all of the errors as hidden
+// and let PRD deconfigure based off of ATTR_BAD_DQ_BITMAP
+#ifdef __HOSTBOOT_MODULE
+    for (auto l_iter : i_fails)
+    {
+        fapi2::logError(l_iter, fapi2::FAPI2_ERRL_SEV_RECOVERED);
+    }
+
+    // Every fail has been logged, so fall through to the explicit SUCCESS below rather than
+    // returning fapi2::current_err, which may still hold a stale value from an earlier call
+// If we're cronus, let's bomb out
+#else
+
+    if (!i_fails.empty())
+    {
+        // We can't log errors in cronus, so let's take the first one and end the IPL
+        FAPI_ERR("Failed p9_mss_draminit_training");
+        return i_fails[0];
+    }
+
+#endif
+    return fapi2::FAPI2_RC_SUCCESS;
+}
+
+
+///
 /// @brief Sets up the IO impedances (ADR DRV's and DP DRV's/RCV's) - MCA specialization
 /// @tparam T the fapi2::TargetType
 /// @param[in] i_target the target (MCA/MCBIST or MBA?)
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H
index 1c27efccc..39db82147 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.H
@@ -426,6 +426,30 @@ fapi2::ReturnCode rank_pair_primary_to_dimm(const fapi2::Target<T>& i_target, co
         fapi2::Target<fapi2::TARGET_TYPE_DIMM>& o_dimm);
 
 ///
+/// @brief Handle draminit_training cal fails
+/// @param[in] i_fails vector holding the return codes for calibration failures
+/// @return a fapi2::ReturnCode; for cronus, the first entry of i_fails (SUCCESS if i_fails is empty)
+/// @note We handle errors differently depending on if we're HB or cronus:
+/// cronus errors out, hostboot logs the errors as hidden and lets PRD choose to deconfigure
+///
+fapi2::ReturnCode draminit_training_error_handler ( const std::vector<fapi2::ReturnCode>& i_fails);
+
+///
+/// @brief Finds the calibration errors from draminit training
+/// @tparam T fapi2::TargetType of the port target
+/// @param[in] i_target the port target
+/// @param[in] i_rp the rank pair we are calibrating
+/// @param[in] i_cal_abort_on_error non-zero if calibration was set to abort on the first fail
+/// @param[in,out] io_fails a vector collecting the return codes of our cal fails
+/// @return FAPI2_RC_SUCCESS iff all of the scoms and functionality were good
+///
+template<fapi2::TargetType T>
+fapi2::ReturnCode find_and_log_cal_errors(const fapi2::Target<T>& i_target,
+        const uint64_t i_rp,
+        const uint64_t i_cal_abort_on_error,
+        std::vector<fapi2::ReturnCode>& io_fails);
+
+///
 /// @brief Sets up the IO impedances (ADR DRV's and DP DRV's/RCV's)
 /// @tparam T the fapi2::TargetType
 /// @param[in] i_target the target (MCA/MCBIST or MBA?)
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C
index d9a7fb8e4..547fd3d54 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C
@@ -2662,12 +2662,12 @@ fapi_try_exit:
 fapi2::ReturnCode record_bad_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target )
 {
     const auto& l_mcs = mss::find_target<TARGET_TYPE_MCS>(i_target);
-    uint8_t l_value[PORTS_PER_MCS][MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][10] = { 0 };
+    uint8_t l_value[PORTS_PER_MCS][MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][BAD_DQ_BYTE_COUNT] = { 0 };
 
     // Process the bad bits into an array. We copy these in to their own array
     // as it allows the compiler to check indexes where a passed pointer wouldn't
     // otherwise do.
-    uint8_t l_data[MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][10] = { 0 };
+    uint8_t l_data[MAX_DIMM_PER_PORT][MAX_RANK_PER_DIMM][BAD_DQ_BYTE_COUNT] = { 0 };
     FAPI_TRY( mss::dp16::record_bad_bits_helper(i_target, l_data) );
 
     // Read the attribute
@@ -2732,7 +2732,7 @@ fapi2::ReturnCode record_bad_bits_helper( const fapi2::Target<fapi2::TARGET_TYPE
                 l_bad_bits[l_byte_index]    = (v.first & 0xFF00) >> 8;
                 l_bad_bits[l_byte_index + 1] = v.first & 0x00FF;
 
-                FAPI_DBG("writing %s value 0x%0lX to 0x%X, 0x%X from 0x%016lx",
+                FAPI_DBG("%s Recording ATTR_BAD_DQ_BITMAP value 0x%0lX to 0x%X, 0x%X from 0x%016lx",
                          mss::c_str(i_target),
                          v.first,
                          l_bad_bits[l_byte_index],
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H b/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H
index 2214abb40..82fe0bc61 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/shared/mss_const.H
@@ -145,6 +145,9 @@ enum ffdc_function_codes
     // MSS_INVALID_INDEX_PASSED
     SYMBOL_COUNT_READ = 50,
     SYMBOL_COUNT_WRITE = 51,
+
+    // Used in rank.H
+    MAP_RP_PRIMARY_TO_INIT_CAL = 60,
 };
 
 enum states
author	Jacob Harvey <jlharvey@us.ibm.com>	2017-08-01 16:11:59 -0500
committer	Daniel M. Crowell <dcrowell@us.ibm.com>	2017-08-19 22:12:27 -0400
commit	11108f43887202522217b92d448880df0fef05e5 (patch)
tree	e48dcecf359d3ba9fa663fe7604545f8ed1a0d61 /src/import/chips/p9/procedures/hwp/memory/lib
parent	bb97f80565ac3074de838e2773d1d08e91040775 (diff)
download	talos-hostboot-11108f43887202522217b92d448880df0fef05e5.tar.gz talos-hostboot-11108f43887202522217b92d448880df0fef05e5.zip