PRD: RCD parity error handling

Change-Id: I291ca299249e6b18760959fdc3fed2747d3d4f46
RTC: 165385
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/38123
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/38264
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
author: Zane Shelley <zshelle@us.ibm.com> 2017-03-07 10:57:35 -0600
committer: Zane C. Shelley <zshelle@us.ibm.com> 2017-03-22 17:54:11 -0400
commit: 98de8e60e8395033bf1deed9ede0929ecb796841 (patch)
tree: 2bbdb4f6b2f13e03a9e2a2a95955d174d2b4b72d /src/usr
parent: 070a02c9f75530fd5c559456255500e36dcb2792 (diff)
download: talos-hostboot-98de8e60e8395033bf1deed9ede0929ecb796841.tar.gz
talos-hostboot-98de8e60e8395033bf1deed9ede0929ecb796841.zip
4 files changed, 52 insertions, 10 deletions
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca.rule
index 7aeba8b66..9946489d3 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_mca.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_mca.rule
@@ -219,7 +219,7 @@ rule rMCACALFIR
     MCACALFIR & ~MCACALFIR_MASK &  MCACALFIR_ACT0 &  MCACALFIR_ACT1;
 };
 
-group gMCACALFIR filter priority( 13 ), cs_root_cause( 13 )
+group gMCACALFIR filter priority( 13 ), cs_root_cause( 4, 13, 14 )
 {
     /** MCACALFIR[0]
      *  A MBA recoverable error has occurred.
@@ -294,7 +294,7 @@ group gMCACALFIR filter priority( 13 ), cs_root_cause( 13 )
     /** MCACALFIR[14]
      *  RCD during periodic cal
      */
-    (rMCACALFIR, bit(14)) ? threshold_and_mask;
+    (rMCACALFIR, bit(14)) ? rcd_parity_error;
 
     /** MCACALFIR[15]
      *  scom error
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule
index cfcf39300..746ca2e73 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_mca_actions.rule
@@ -73,7 +73,7 @@ actionclass rcd_parity_error
     callout(connected(TYPE_DIMM,0), MRU_HIGH); # DIMM 0 HIGH
     callout(connected(TYPE_DIMM,1), MRU_HIGH); # DIMM 1 HIGH
     calloutSelfLow;                            # Self LOW
-    threshold32pday;                           # Threshold 32/day
+    # Thresholding done in plugin
     funccall("RcdParityError");                # Run TPS on TH for all MCA ranks
 };
 
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule b/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule
index a52eb54b5..bf2fd3fd1 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_mca_regs.rule
@@ -209,3 +209,14 @@
         capture     group PllFIRs;
     };
 
+    ############################################################################
+    # Misc
+    ############################################################################
+
+    register FARB0
+    {
+        name     "MCP.PORT0.SRQ.MBA_FARB0Q";
+        scomaddr 0x07010913;
+        capture  group default;
+    };
+
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
index 1a2f7792a..cda2226c7 100644
--- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
+++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
@@ -29,6 +29,7 @@
 #include <prdfPluginMap.H>
 
 // Platform includes
+#include <prdfP9McaDataBundle.H>
 #include <prdfP9McbistDataBundle.H>
 #include <prdfPlatServices.H>
 #ifdef __HOSTBOOT_RUNTIME
@@ -63,16 +64,14 @@ int32_t RcdParityError( ExtensibleChip * i_mcaChip,
 {
     #define PRDF_FUNC "[p9_mca::RcdParityError] "
 
-    // The callouts have already been made in the rule code. All we need to do
-    // now is start TPS on all slave ranks behind the MCA. This can only be done
-    // at runtime because it is too complicated to handle during Memory
-    // Diagnostics and we don't have time to complete the procedures at any
-    // other point during the IPL. The DIMMs will be deconfigured during the IPL
-    // anyways. So not really much benefit except for extra FFDC.
+    // The callouts have already been made in the rule code. All other actions
+    // documented below.
 
     #ifdef __HOSTBOOT_RUNTIME // TPS only supported at runtime.
 
-    if ( io_sc.service_data->IsAtThreshold() )
+    // Recovery is always enabled during runtime. Start TPS on all slave ranks
+    // behind the MCA if the recovery threshold is reached.
+    if ( getMcaDataBundle(i_mcaChip)->iv_rcdParityTh.inc(io_sc) )
     {
         ExtensibleChip * mcbChip = getConnectedParent( i_mcaChip, TYPE_MCBIST );
 
@@ -96,6 +95,38 @@ int32_t RcdParityError( ExtensibleChip * i_mcaChip,
         }
     }
 
+    #else // IPL
+
+    SCAN_COMM_REGISTER_CLASS * farb0 = i_mcaChip->getRegister("FARB0");
+    if ( SUCCESS != farb0->Read() )
+    {
+        PRDF_ERR( PRDF_FUNC "Read() failed on FARB0: i_mcaChip=0x%08x",
+                  i_mcaChip->getHuid() );
+
+        // Ensure the reg is zero so that we will use the recovery threshold and
+        // guarantee we don't try to do a reconfig.
+        farb0->clearAllBits();
+    }
+
+    if ( farb0->IsBitSet(54) )
+    {
+        // Recovery is disabled. Issue a reconfig loop. Make the error log
+        // predictive if threshold is reached.
+        if ( rcdParityErrorReconfigLoop() )
+            io_sc.service_data->setServiceCall();
+    }
+    else
+    {
+        // Make the error log predictive if the recovery threshold is reached.
+        // Don't bother with TPS on all ranks because it is too complicated to
+        // handle during Memory Diagnostics and we don't have time to complete
+        // the procedures at any other point during the IPL. The DIMMs will be
+        // deconfigured during the IPL anyways. So not really much benefit
+        // except for extra FFDC.
+        if ( getMcaDataBundle(i_mcaChip)->iv_rcdParityTh.inc(io_sc) )
+            io_sc.service_data->setServiceCall();
+    }
+
     #endif
 
     return SUCCESS;
author	Zane Shelley <zshelle@us.ibm.com>	2017-03-07 10:57:35 -0600
committer	Zane C. Shelley <zshelle@us.ibm.com>	2017-03-22 17:54:11 -0400
commit	98de8e60e8395033bf1deed9ede0929ecb796841 (patch)
tree	2bbdb4f6b2f13e03a9e2a2a95955d174d2b4b72d /src/usr
parent	070a02c9f75530fd5c559456255500e36dcb2792 (diff)
download	talos-hostboot-98de8e60e8395033bf1deed9ede0929ecb796841.tar.gz talos-hostboot-98de8e60e8395033bf1deed9ede0929ecb796841.zip