Updates error paths for PRD FIR checking

FIRs can cause errors within hardware procedures, and PRD has the capability to retrigger a procedure if it sees an error. Relying on that retrigger may let us avoid IPL issues: if a FIR has been hit during hardware-enabled code (CCS or calibration), log the error and let PRD find the "new" FIR that could have caused the hardware engine to have an issue. If there is some other problem, the retriggered HWP will find it. Change-Id: I81599d1d0c4b4c256b79820b4a7e2eafc09e206b Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46571 Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: JACOB L. HARVEY <jlharvey@us.ibm.com> Reviewed-by: Louis Stermole <stermole@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: HWSV CI <hwsv-ci+hostboot@us.ibm.com> Reviewed-by: ANDRE A. MARIN <aamarin@us.ibm.com> Tested-by: Hostboot CI <hostboot-ci+hostboot@us.ibm.com> Reviewed-by: Jennifer A. Stofer <stofer@us.ibm.com> Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46584 Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
author: Stephen Glancy <sglancy@us.ibm.com> 2017-09-21 10:14:04 -0500
committer: Daniel M. Crowell <dcrowell@us.ibm.com> 2017-10-02 23:45:51 -0400
commit: 3890040afa1dc93d58476d68df35cb44d49c57b2 (patch)
tree: 1c1538e4b8bb40a8a9d65bc8e900a84bb2caa586 /src/import/chips/p9/procedures/hwp/memory/lib/phy
parent: f21a18e501c28d932ee24f11a7a3ffaa93228735 (diff)
download: talos-hostboot-3890040afa1dc93d58476d68df35cb44d49c57b2.tar.gz
talos-hostboot-3890040afa1dc93d58476d68df35cb44d49c57b2.zip
2 files changed, 48 insertions, 5 deletions
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C
index 86a8621fa..e1e63fec5 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/ddr_phy.C
@@ -521,6 +521,11 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_DI
     uint64_t l_rank_pairs = 0;
     uint8_t cal_abort_on_error = 0;
 
+    // This boolean tells the code whether we took a training fail or a scom fail reading the status registers
+    // It starts as false, given that we need to read out the registers
+    // When we start checking all of the values of the status registers, it gets set to true
+    bool l_check_firs = false;
+
     const auto& l_mca = mss::find_target<fapi2::TARGET_TYPE_MCA>(i_target);
     fapi2::buffer<uint64_t> l_err_data;
 
@@ -550,6 +555,9 @@ fapi2::ReturnCode process_initial_cal_errors( const fapi2::Target<TARGET_TYPE_DI
     }
 
     // Error information from other registers is gathered in the FFDC from the XML
+    // From here on out, check the FIRs
+    // Using this boolean to avoid having to check the FIR's after each assert below
+    l_check_firs = true;
 
     // So we can do a few things here. If we're aborting on the first calibration error,
     // we only expect to have one error bit set. If we ran all the calibrations, we can
@@ -692,7 +700,8 @@ fapi_try_exit:
              (fapi2::current_err == fapi2::FAPI2_RC_SUCCESS ? "success" : "errors reported"),
              mss::c_str(l_mca));
 
-    return fapi2::current_err;
+    // Checks the FIR's, if need be
+    return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_check_firs);
 }
 
 ///
diff --git a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C
index 0e346881a..129c37515 100644
--- a/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C
+++ b/src/import/chips/p9/procedures/hwp/memory/lib/phy/dp16.C
@@ -52,6 +52,7 @@
 #include <generic/memory/lib/utils/c_str.H>
 
 #include <lib/workarounds/dp16_workarounds.H>
+#include <lib/fir/check.H>
 #include <generic/memory/lib/utils/mss_math.H>
 
 using fapi2::TARGET_TYPE_MCS;
@@ -3260,6 +3261,22 @@ fapi_try_exit:
 ///
 fapi2::ReturnCode record_bad_bits( const fapi2::Target<fapi2::TARGET_TYPE_MCA>& i_target )
 {
+    // If we have a FIR set that could have caused our training fail, then skip checking bad bits in FW
+    // PRD will handle the FIR and retrigger the procedure
+#ifdef __HOSTBOOT_MODULE
+    bool l_fir_error = false;
+    FAPI_TRY(mss::check::bad_fir_bits(i_target, l_fir_error), "%s took an error while checking FIR's",
+             mss::c_str(i_target));
+
+    // Exit if we took a FIR error - PRD will handle bad bits
+    if(l_fir_error)
+    {
+        FAPI_INF("%s has FIR's set, exiting to let PRD handle it", mss::c_str(i_target));
+        return fapi2::FAPI2_RC_SUCCESS;
+    }
+
+#endif
+
     for( const auto& d : mss::find_targets<fapi2::TARGET_TYPE_DIMM>(i_target) )
     {
         uint8_t l_data[MAX_RANK_PER_DIMM][BAD_DQ_BYTE_COUNT] = {};
@@ -3367,11 +3384,17 @@ fapi2::ReturnCode process_rdvref_cal_errors( const fapi2::Target<fapi2::TARGET_T
     size_t l_index = 0;
     std::vector<fapi2::buffer<uint64_t>> l_data;
 
+    // Boolean to keep track of if a fail was calibration related, or scom related
+    bool l_cal_fail = false;
+
     // Suck all the cal error bits out ...
     FAPI_TRY( mss::scom_suckah(l_mca, TT::RD_VREF_CAL_ERROR_REG, l_data) );
 
     FAPI_INF("%s Processing RD_VREF_CAL_ERROR", mss::c_str(i_target));
 
+    // From here on out, the FIR's are all cal fails
+    l_cal_fail = true;
+
     for (const auto& v : l_data)
     {
         // They should all be 0's. If they're not, we have a problem.
@@ -3383,14 +3406,17 @@ fapi2::ReturnCode process_rdvref_cal_errors( const fapi2::Target<fapi2::TARGET_T
                     .set_VALUE(v),
                     "DP16 failed read vref calibration on %s. register 0x%016lx value 0x%016lx",
                     mss::c_str(l_mca), TT::RD_VREF_CAL_ERROR_REG[l_index], v);
+
         ++l_index;
     }
 
-    FAPI_INF("RD_VREF_CAL_ERROR complete");
+    FAPI_INF("%s RD_VREF_CAL_ERROR complete", mss::c_str(i_target));
     return fapi2::FAPI2_RC_SUCCESS;
 
 fapi_try_exit:
-    return fapi2::current_err;
+
+    // If the FIR's are cal fails, then check to see if FIRs or PLL fails were the cause
+    return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_cal_fail);
 }
 
 ///
@@ -3412,10 +3438,16 @@ fapi2::ReturnCode process_wrvref_cal_errors( const fapi2::Target<fapi2::TARGET_T
     std::vector<std::pair<fapi2::buffer<uint64_t>, fapi2::buffer<uint64_t>>> l_data;
     std::vector<std::pair<fapi2::buffer<uint64_t>, fapi2::buffer<uint64_t>>> l_mask;
 
+    // Boolean to keep track of if a fail was calibration related, or scom related
+    bool l_cal_fail = false;
+
     // Suck all the cal error bits out ...
     FAPI_TRY( mss::scom_suckah(l_mca, TT::WR_VREF_ERROR_REG, l_data) );
     FAPI_TRY( mss::scom_suckah(l_mca, TT::WR_VREF_ERROR_MASK_REG, l_mask) );
 
+    // From here on out, the FIR's are all cal fails
+    l_cal_fail = true;
+
     // Loop through both data and mask
     {
         // Note: ideally these would be cbegin/cend, but HB doesn't support constant iterators for vectors
@@ -3480,11 +3512,13 @@ fapi2::ReturnCode process_wrvref_cal_errors( const fapi2::Target<fapi2::TARGET_T
         }
     }
 
-    FAPI_INF("WRVREF_CAL_ERROR complete");
+    FAPI_INF("%s WRVREF_CAL_ERROR complete", mss::c_str(i_target));
     return fapi2::FAPI2_RC_SUCCESS;
 
 fapi_try_exit:
-    return fapi2::current_err;
+
+    // If the FIR's are cal fails, then check to see if FIR's were the cause
+    return mss::check::fir_or_pll_fail( i_target, fapi2::current_err, l_cal_fail);
 }
 
 ///
author	Stephen Glancy <sglancy@us.ibm.com>	2017-09-21 10:14:04 -0500
committer	Daniel M. Crowell <dcrowell@us.ibm.com>	2017-10-02 23:45:51 -0400
commit	3890040afa1dc93d58476d68df35cb44d49c57b2 (patch)
tree	1c1538e4b8bb40a8a9d65bc8e900a84bb2caa586 /src/import/chips/p9/procedures/hwp/memory/lib/phy
parent	f21a18e501c28d932ee24f11a7a3ffaa93228735 (diff)
download	talos-hostboot-3890040afa1dc93d58476d68df35cb44d49c57b2.tar.gz talos-hostboot-3890040afa1dc93d58476d68df35cb44d49c57b2.zip