From 8941c2cd6251da7e090ef18201f6e3a0a8f5a597 Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Fri, 5 Apr 2019 08:26:33 -0500 Subject: PRD: NVDIMM mask EVENT_N bit on persistency lost Change-Id: I49d56221729e9a1a5e63544527c10a0f48a81d4a CQ: SW461975 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/75604 Tested-by: Jenkins Server Reviewed-by: Paul Greenwood Reviewed-by: Benjamen G. Tyner Reviewed-by: Zane C. Shelley Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/75793 Tested-by: Jenkins OP Build CI Tested-by: FSP CI Jenkins Tested-by: Jenkins OP HW --- .../prdf/common/plat/nimbus/nimbus_mca_actions.rule | 2 +- src/usr/diag/prdf/plat/mem/prdfP9Mca.C | 21 ++++++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule index 09f85eef0..da3a73f82 100644 --- a/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule +++ b/src/usr/diag/prdf/common/plat/nimbus/nimbus_mca_actions.rule @@ -36,8 +36,8 @@ actionclass verify_chip_mark_7 { funccall("AnalyzeFetchMpe_7"); }; /** Analyze NVDIMM Health Registers */ actionclass analyzeNvdimms { - funccall("AnalyzeNvdimmHealthStatRegs"); threshold32pday; + funccall("AnalyzeNvdimmHealthStatRegs"); }; /** Mainline NCE/TCE handling */ diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 1722314f1..5f7efa274 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -775,8 +775,12 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // BIT 0: Persistency Lost if ( bitList.count(0) ) { - // Make the log predictive - io_sc.service_data->setServiceCall(); + // EVENT_N cannot be retriggered on a new PERSISTENCY_LOST_ERROR + // if a previous PERSISTENCY_LOST_ERROR still exists. Meaning, we + // cannot detect/report multiple errors that happen at different + // points in time. As such, mask the EVENT_N bit here (MCACALFIR[8]) + // and make the log predictive. + io_sc.service_data->SetThresholdMaskId(0); // Send persistency lost message to PHYP l_rc = PlatServices::nvdimmNotifyPhypProtChange( dimm, @@ -796,8 +800,19 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // BIT 2: Persistency Restored if ( bitList.count(2) ) { - // hidden log + // It would be rare to have an intermittent error that comes and + // goes so fast we only see PERSISTENCY_RESTORED and not + // PERSISTENCY_LOST_ERROR. Set predictive on threshold of 32 + // per day (rule code handles the thresholding), else just keep + // as a hidden log. io_sc.service_data->AddSignatureList( dimm, PRDFSIG_NvdimmPersRes ); + + // callout NVDIMM high, cable high, BPM high, no gard + io_sc.service_data->SetCallout( dimm, MRU_HIGH, NO_GARD ); + l_rc = __addBpmCallout( dimm, HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != l_rc ) continue; + l_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != l_rc ) continue; } // BIT 3: Below Warning Threshold -- ignore // BIT 4: Hardware Failure -- ignore -- cgit v1.2.3