diff options
Diffstat (limited to 'src/usr/diag/prdf/plat/mem/prdfP9Mca.C')
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfP9Mca.C | 835 |
1 files changed, 755 insertions, 80 deletions
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 5f7efa274..fac29fce3 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -27,7 +27,6 @@ #include <iipServiceDataCollector.h> #include <prdfExtensibleChip.H> #include <prdfPluginMap.H> -#include <isteps/nvdimm/nvdimm.H> // Platform includes #include <prdfMemDbUtils.H> @@ -38,6 +37,10 @@ #include <prdfMemTps.H> #endif +#ifdef CONFIG_NVDIMM + #include <nvdimm.H> +#endif + using namespace TARGETING; namespace PRDF @@ -296,18 +299,9 @@ PRDF_PLUGIN_DEFINE( nimbus_mca, MemPortFailure ); // //############################################################################## +#ifdef CONFIG_NVDIMM #ifdef __HOSTBOOT_RUNTIME -enum nvdimmRegOffset -{ - NVDIMM_MGT_CMD1 = 0x041, - MODULE_HEALTH = 0x0A0, - MODULE_HEALTH_STATUS0 = 0x0A1, - MODULE_HEALTH_STATUS1 = 0x0A2, - ERROR_THRESHOLD_STATUS = 0x0A5, - WARNING_THRESHOLD_STATUS = 0x0A7, -}; - /** * @brief Gets a map list of which bits are set from a uint8_t bit list (7:0) * @param i_data uint8_t bit list (7:0) @@ -349,6 +343,7 @@ uint32_t __addBpmCallout( TargetHandle_t i_dimm, break; } + // addPartCallout will default to GARD_NULL, NO_DECONFIG mainErrl->addPartCallout( i_dimm, HWAS::BPM_PART_TYPE, i_priority ); @@ -362,10 +357,12 @@ uint32_t __addBpmCallout( TargetHandle_t i_dimm, /** * @brief Adds a callout of the cable connecting an NVDIMM to its * backup power module (BPM) + * @param i_dimm The target dimm. * @param i_priority The callout priority. * @return FAIL if unable to get the global error log, else SUCCESS */ -uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority ) +uint32_t __addNvdimmCableCallout( TargetHandle_t i_dimm, + HWAS::callOutPriority i_priority ) { #define PRDF_FUNC "[__addNvdimmCableCallout] " @@ -382,7 +379,9 @@ uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority ) break; } - mainErrl->addProcedureCallout( HWAS::EPUB_PRC_NVDIMM_ERR, i_priority ); + // addPartCallout will default to GARD_NULL, NO_DECONFIG + mainErrl->addPartCallout( i_dimm, HWAS::BPM_CABLE_PART_TYPE, + i_priority ); }while(0); @@ -391,21 +390,45 @@ uint32_t __addNvdimmCableCallout( HWAS::callOutPriority i_priority ) #undef PRDF_FUNC } +/** + * @brief If a previous error has been found, add a signature to the + * multi-signature list, else set the primary signature. + * @param io_sc The step code data struct. + * @param i_trgt The target. + * @param i_errFound Whether an error has already been found or not. + * @param i_sig The signature to be set. + */ +void __addSignature( STEP_CODE_DATA_STRUCT & io_sc, TargetHandle_t i_trgt, + bool i_errFound, uint32_t i_sig ) +{ + if ( i_errFound ) + { + io_sc.service_data->AddSignatureList( i_trgt, i_sig ); + } + else + { + io_sc.service_data->setSignature( getHuid(i_trgt), i_sig ); + } +} /** * @brief Analyze NVDIMM Health Status0 Register for errors - * @param io_sc The step code data struct. - * @param i_dimm The target dimm. + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. * @return FAIL if unable to read register, else SUCCESS */ -uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm ) +uint32_t __analyzeHealthStatus0Reg(STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, bool & io_errFound) { #define PRDF_FUNC "[__analyzeHealthStatus0Reg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); + do { // NVDIMM health status registers size = 1 byte @@ -413,7 +436,7 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, // Read the Health Status0 Register (0xA1) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH_STATUS0) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH_STATUS0) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Health Status0 Register. " @@ -427,58 +450,66 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 0: Voltage Regulator Fail if ( bitList.count(0) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VoltRegFail ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VoltRegFail ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 1: VDD Lost if ( bitList.count(1) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VddLost ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VddLost ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 2: VPP Lost if ( bitList.count(2) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VppLost ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VppLost ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 3: VTT Lost if ( bitList.count(3) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_VttLost ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_VttLost ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 4: DRAM not Self Refresh if ( bitList.count(4) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NotSelfRefr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotSelfRefr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 5: Controller HW Error if ( bitList.count(5) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_CtrlHwErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_CtrlHwErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 6: NVM Controller Error if ( bitList.count(6) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NvmCtrlErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmCtrlErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 7: NVM Lifetime Error if ( bitList.count(7) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_NvmLifeErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } }while(0); @@ -491,18 +522,22 @@ uint32_t __analyzeHealthStatus0Reg( STEP_CODE_DATA_STRUCT & io_sc, /** * @brief Analyze NVDIMM Health Status1 Register for errors - * @param io_sc The step code data struct. - * @param i_dimm The target dimm. + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. * @return FAIL if unable to read register, else SUCCESS */ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm ) + TargetHandle_t i_dimm, bool & io_errFound ) { #define PRDF_FUNC "[__analyzeHealthStatus1Reg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); + do { // NVDIMM health status registers size = 1 byte @@ -510,7 +545,7 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, // Read the Health Status1 Register (0xA2) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH_STATUS1) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH_STATUS1) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Health Status1 Register. " @@ -524,83 +559,90 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 0: Insufficient Energy if ( bitList.count(0) ) { - io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_InsuffEnergy); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_InsuffEnergy ); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 1: Invalid Firmware if ( bitList.count(1) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_InvFwErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_InvFwErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 2: Configuration Data Error if ( bitList.count(2) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_CnfgDataErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_CnfgDataErr ); // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_HIGH, NO_GARD ); + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + io_errFound = true; } // BIT 3: No Energy Source if ( bitList.count(3) ) { - io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_NoEsPres); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NoEsPres ); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 4: Energy Policy Not Set if ( bitList.count(4) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_EsPolNotSet ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsPolNotSet ); // Callout FW (Level2 Support) High io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD ); // Callout NVDIMM low on 1st, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 5: Energy Source HW Error if ( bitList.count(5) ) { - io_sc.service_data->AddSignatureList ( i_dimm, PRDFSIG_EsHwFail ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsHwFail ); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 6: Energy Source Health Assessment Error if ( bitList.count(6) ) { - io_sc.service_data->AddSignatureList(i_dimm, PRDFSIG_EsHlthAssess); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsHlthAssess); // Callout BPM (backup power module) high, cable high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; - o_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + o_rc = __addNvdimmCableCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != o_rc ) break; // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 7: Reserved @@ -613,18 +655,105 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc, } /** + * @brief Reads and merges the data from two ES_TEMP registers to get the + * correct temperature format. + * @param i_dimm The target nvdimm. + * @param i_tempMsbReg The address of the register that contains the most + * significant byte of the temperature data. + * @param i_tempLsbReg The address of the register that contains the least + * significant byte of the temperature data. + * @param o_tempData The 16 bit temperature data. + * @return FAIL if unable to read register, else SUCCESS + */ +uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg, + uint16_t i_tempLsbReg, uint16_t & o_tempData ) +{ + #define PRDF_FUNC "[__readTemp] " + + /* + * -NOTE: Example showing how to read the temperature format: + * ES_TEMP1 = 0x03 (MSB: bits 15-8) + * ES_TEMP0 = 0x48 (LSB: bits 7-0) + * + * 0x0348 = 0000 0011 0100 1000 = 52.5 C + * + * -NOTE: bit definition: + * [15:13]Reserved + * [12]Sign 0 = positive, 1 = negative; 0°C should be expressed as positive + * [11] 128°C + * [10] 64°C + * [9] 32°C + * [8] 16°C + * [7] 8°C + * [6] 4°C + * [5] 2°C + * [4] 1°C + * [3] 0.5°C + * [2] 0.25°C + * [1] 0.125°C Optional for temp fields; not used for temp th fields + * [0]0.0625°C Optional for temp fields; not used for temp th fields + */ + uint32_t o_rc = SUCCESS; + + do + { + // NVDIMM health status registers size = 1 byte + size_t NVDIMM_SIZE = 1; + uint8_t msbData = 0; + uint8_t lsbData = 0; + + // Read the two inputted temperature registers. + errlHndl_t errl = deviceRead( i_dimm, &msbData, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_tempMsbReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature MSB Register. " + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + errl = deviceRead( i_dimm, &lsbData, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_tempLsbReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature LSB Register. " + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + o_tempData = ((uint16_t)msbData << 8) | lsbData; + + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +/** * @brief Analyze NVDIMM Error Threshold Status Register for errors - * @param io_sc The step code data struct. - * @param i_dimm The target dimm. + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. + * @param o_esTempErr A flag for whether we hit an ES TEMP error or not. * @return FAIL if unable to read register, else SUCCESS */ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm ) + TargetHandle_t i_dimm, bool & io_errFound, + bool & o_esTempErr ) { #define PRDF_FUNC "[__analyzeErrorThrStatusReg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + o_esTempErr = false; + + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); do { @@ -633,7 +762,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Read the Error Threshold Status Register (0xA5) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(ERROR_THRESHOLD_STATUS) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::ERROR_THRESHOLD_STATUS) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Status Reg. " @@ -648,7 +777,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 1: ES Lifetime Error if ( bitList.count(1) ) { - io_sc.service_data->AddSignatureList ( i_dimm, PRDFSIG_EsLifeErr ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsLifeErr ); // Callout BPM (backup power module) high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); @@ -656,11 +785,60 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + io_errFound = true; } // BIT 2: ES Temperature Error if ( bitList.count(2) ) { - io_sc.service_data->AddSignatureList( i_dimm, PRDFSIG_EsTmpErr ); + // Sleep two seconds to avoid exiting PRD analysis faster than the + // ES_TEMP sample rate. + PlatServices::milliSleep( 2, 0 ); + + // Read the ES_TEMP and ES_TEMP_ERROR_HIGH_THRESHOLD values + uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1; + uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0; + uint16_t esTemp = 0; + o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp ); + if ( SUCCESS != o_rc ) break; + + uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD1; + uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD0; + uint16_t esTempHighTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh ); + if ( SUCCESS != o_rc ) break; + + msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD1; + lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD0; + uint16_t esTempLowTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh ); + if ( SUCCESS != o_rc ) break; + + // Check to see if the ES_TEMP is negative (bit 12) + bool esTempNeg = false; + if ( esTemp & 0x1000 ) esTempNeg = true; + + // If ES_TEMP is equal or above ES_TEMP_ERROR_HIGH_THRESHOLD + // Just in case ES_TEMP has moved before we read it out, we'll add + // a 2°C margin when comparing to the threshold. + if ( (esTemp >= (esTempHighTh - 0x0020)) && !esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrHigh ); + } + // Else check if the error hit the low threshold, again with the + // same 2°C margin. + else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrLow ); + } + // Else the temperature must have gone back to a normal value, so + // we will label this as a false alarm case. + else + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrFa ); + } // Callout BPM (backup power module) high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); @@ -668,6 +846,9 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + + o_esTempErr = true; + io_errFound = true; } // BIT 3:7: Reserved @@ -680,6 +861,419 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, } /** + * @brief Adjusts the warning threshold so that future warnings are allowed + * to report. + * @param io_sc The step code data struct. + * @param i_dimm The target nvdimm. + * @param i_warnThReg The address of the relevant warning threshold register. + * @param i_errThReg The address of the relevant error threshold register. + * @param o_firstWarn Flag if this is the first warning of this type. + * @param o_statusErr Flag to tell if we found an error from checking the + * notification status register. + * @return FAIL if unable to read register, else SUCCESS + */ +uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, uint16_t i_warnThReg, + uint16_t i_errThReg, bool & o_firstWarn, + bool & o_statusErr ) +{ + #define PRDF_FUNC "[__adjustThreshold] " + + uint32_t o_rc = SUCCESS; + uint16_t notifCmdReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD; + uint16_t notifStatusReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_STATUS; + o_firstWarn = false; + o_statusErr = false; + + do + { + // NVDIMM health status registers size = 1 byte + size_t NVDIMM_SIZE = 1; + + // Read the corresponding warning threshold + uint8_t warnTh = 0; + errlHndl_t errl = deviceRead( i_dimm, &warnTh, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_warnThReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Reg. HUID: " + "0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Read the corresponding error threshold + uint8_t errTh = 0; + errl = deviceRead( i_dimm, &errTh, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_errThReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Reg. HUID: " + "0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // If the warning threshold is not set to the error threshold+1, + // move the threshold. + if ( warnTh != (errTh+1) ) + { + o_firstWarn = true; + + // SET_EVENT_NOTIFICATION_CMD is a write only register that is + // used to change the SET_EVENT_NOTIFICATION_STATUS register. + // The only bits within it that are used are bits 0 and 1, as such + // we can safely set the rest to 0. It is defined as: + // [0]: Persistency Notification + // [1]: Warning Threshold Notification + // [2]: Obsolete + // [3]: Firmware Activation Notification (Not Used) + // [4:7]: Reserved + + // Clear SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set + uint8_t notifCmd = 0x01; + errl = deviceWrite( i_dimm, ¬ifCmd, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifCmdReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification " + "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Check SET_EVENT_NOTIFICATION_STATUS to ensure everything is set + // as we expect and we don't see any errors. + uint8_t notifStat = 0; + errl = deviceRead( i_dimm, ¬ifStat, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifStatusReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + "Status Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat ); + + // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1 + // or Bit [2]: PERSISTENCY_ENABLED = 0 + // or Bit [3]: WARNING_THRESHOLD_ENABLED = 1 + if ( bitList.count(1) || !bitList.count(2) || bitList.count(3) ) + { + o_statusErr = true; + + // Make the log predictive and mask the fir + io_sc.service_data->SetThresholdMaskId(0); + + // Callout the NVDIMM, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + + break; + } + + + // Set the warning threshold to error threshold + 1 + warnTh = errTh+1; + errl = deviceWrite( i_dimm, &warnTh, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(i_warnThReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to write Warning Threshold Reg. " + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Set SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set + notifCmd = 0x03; + errl = deviceWrite( i_dimm, ¬ifCmd, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifCmdReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification " + "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Recheck SET_EVENT_NOTIFICATION_STATUS to ensure everything is set + // as we expect and we don't see any errors. + errl = deviceRead( i_dimm, ¬ifStat, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifStatusReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + "Status Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + bitList = __nvdimmGetActiveBits( notifStat ); + + // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1 + // or Bit [2]: PERSISTENCY_ENABLED = 0 + // or Bit [3]: WARNING_THRESHOLD_ENABLED = 0 + if ( bitList.count(1) || !bitList.count(2) || !bitList.count(3) ) + { + o_statusErr = true; + + // Make the log predictive and mask the fir + io_sc.service_data->SetThresholdMaskId(0); + + // Callout the NVDIMM, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + + break; + } + } + // Note: moving the threshold should clear the warning from + // WARNING_THRESHOLD_STATUS, which allows future warnings to report. + + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +/** + * @brief Analyze NVDIMM Warning Threshold Status Register for errors + * @param io_sc The step code data struct. + * @param i_dimm The target dimm. + * @param io_errFound Whether an error has already been found or not. + * @return FAIL if unable to read register, else SUCCESS + */ +uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, bool & io_errFound) +{ + #define PRDF_FUNC "[__analyzeWarningThrStatusReg] " + + uint32_t o_rc = SUCCESS; + uint8_t data = 0; + + // Get MCA, for signatures + TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); + + do + { + // NVDIMM health status registers size = 1 byte + size_t NVDIMM_SIZE = 1; + + // Read the Warning Threshold Status Register (0xA7) 7:0 + errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::WARNING_THRESHOLD_STATUS) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Status Reg. " + "HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data ); + + // Analyze Bit 2 First + // BIT 2: ES_TEMP_WARNING + if ( bitList.count(2) ) + { + // Sleep two seconds to avoid exiting PRD analysis faster than the + // ES_TEMP sample rate. + PlatServices::milliSleep( 2, 0 ); + + // Read the ES_TEMP and ES_TEMP_WARNING_HIGH_THRESHOLD values + uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1; + uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0; + uint16_t esTemp = 0; + o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp ); + if ( SUCCESS != o_rc ) break; + + uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD1; + uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD0; + uint16_t esTempHighTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh ); + if ( SUCCESS != o_rc ) break; + + msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD1; + lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD0; + uint16_t esTempLowTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh ); + if ( SUCCESS != o_rc ) break; + + // Check to see if the ES_TEMP is negative (bit 12) + bool esTempNeg = false; + if ( esTemp & 0x1000 ) esTempNeg = true; + + // If ES_TEMP is equal or above ES_TEMP_WARNING_HIGH_THRESHOLD + // Just in case ES_TEMP has moved before we read it out, we'll add + // a 2°C margin when comparing to the threshold. + if ( (esTemp >= (esTempHighTh - 0x0020)) && !esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnHigh ); + } + // Else check if the warning hit the low threshold, again with the + // same 2°C margin. + else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg ) + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnLow ); + } + // Else the temperature must have gone back to a normal value, so + // we will label this as a false alarm case. + else + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnFa ); + } + + // Callout BPM (backup power module) high + o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != o_rc ) break; + + // Callout NVDIMM low, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + + // Because of the possibility of intermittent ES temperature + // false alarm readings, we will keep the log hidden. If there is + // an actual ES temperature problem, we assume we will continue + // to be called to handle the temperature warning and hit threshold. + + // Only send the save/restore message to PHYP if we hit threshold. + if ( io_sc.service_data->IsAtThreshold() ) + { + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + } + + io_errFound = true; + } + // BIT 0: NVM_LIFETIME_WARNING + if ( bitList.count(0) ) + { + // Adjust warning threshold. + uint16_t warnThReg = NVDIMM::i2cReg::NVM_LIFETIME_WARNING_THRESHOLD; + uint16_t errThReg = NVDIMM::i2cReg::NVM_LIFETIME_ERROR_THRESHOLD; + bool firstWarn = false; + bool statusErr = false; + o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg, + firstWarn, statusErr ); + if ( SUCCESS != o_rc ) break; + + // Make the log predictive, but do not mask the FIR + io_sc.service_data->setServiceCall(); + + // If we got a set event notification status error, add the + // signature for that before adding the signature for the warning. + // Also do not take our normal callout action since we already will + // have called out the NVDIMM because of the status error. + if ( statusErr ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr ); + + // Need to set io_errFound here so the warning signature is + // added to the multi-signature list instead of as the primary + // signature. + io_errFound = true; + } + else + { + // Callout NVDIMM on 1st, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + } + + // Update signature depending on whether this is the first or second + // warning of this type. + if ( firstWarn ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn1 ); + } + else + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn2 ); + } + + + io_errFound = true; + } + // BIT 1: ES_LIFETIME_WARNING + if ( bitList.count(1) ) + { + // Adjust warning threshold. + uint16_t warnThReg = NVDIMM::i2cReg::ES_LIFETIME_WARNING_THRESHOLD; + uint16_t errThReg = NVDIMM::i2cReg::ES_LIFETIME_ERROR_THRESHOLD; + bool firstWarn = false; + bool statusErr = false; + o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg, + firstWarn, statusErr ); + if ( SUCCESS != o_rc ) break; + + // Make the log predictive, but do not mask the FIR + io_sc.service_data->setServiceCall(); + + // If we got a set event notification status error, add the + // signature for that before adding the signature for the warning. + // Also do not take our normal callout action since we already will + // have called out the NVDIMM because of the status error. + if ( statusErr ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr ); + + // Need to set io_errFound here so the warning signature is + // added to the multi-signature list instead of as the primary + // signature. + io_errFound = true; + } + else + { + // Callout BPM (backup power module) high + o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != o_rc ) break; + + // Callout NVDIMM low, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + } + + // Update signature depending on whether this is the first or second + // warning of this type. + if ( firstWarn ) + { + __addSignature(io_sc, mca, io_errFound, PRDFSIG_EsLifeWarn1); + } + else + { + __addSignature(io_sc, mca, io_errFound, PRDFSIG_EsLifeWarn2); + } + + io_errFound = true; + } + + }while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +/** * @brief De-assert the EVENT_N pin by setting bit 2 in NVDIMM_MGT_CMD1 (0x41) * @param i_dimm The target dimm. * @return FAIL if unable to read/write register, else SUCCESS @@ -698,7 +1292,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm ) // Read the NVDIMM_MGT_CMD1 register (0x41) 7:0 errlHndl_t errl = deviceRead( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(NVDIMM_MGT_CMD1) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::NVDIMM_MGT_CMD1) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read NVDIMM_MGT_CMD1. " @@ -713,7 +1307,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm ) // Write the updated data back to NVDIMM_MGT_CMD1 errl = deviceWrite( i_dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(NVDIMM_MGT_CMD1) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::NVDIMM_MGT_CMD1) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to write NVDIMM_MGT_CMD1. " @@ -732,6 +1326,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm ) } #endif // HOSTBOOT_RUNTIME +#endif // CONFIG_NVDIMM /** * @brief MCACALFIR[8] - Error from NVDIMM health status registers @@ -744,13 +1339,28 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, { #define PRDF_FUNC "[nimbus_mca::AnalyzeNvdimmHealthStatRegs] " + #ifdef CONFIG_NVDIMM #ifdef __HOSTBOOT_RUNTIME uint32_t l_rc = SUCCESS; + bool errFound = false; // We need to check both dimms for errors for ( auto & dimm : getConnected(i_chip->getTrgt(), TYPE_DIMM) ) { + // Skip any non-NVDIMMs + if ( !isNVDIMM(dimm) ) continue; + + // Add SMART-specific, page 4 registers to FFDC + errlHndl_t mainErrl = nullptr; + mainErrl = ServiceGeneratorClass::ThisServiceGenerator().getErrl(); + if ( nullptr == mainErrl ) + { + PRDF_ERR( PRDF_FUNC "Failed to get the global error log." ); + continue; + } + PlatServices::nvdimmAddFfdc( dimm, mainErrl ); + // De-assert the EVENT_N pin by setting bit 2 in NVDIMM_MGT_CMD1 l_rc = __deassertEventN( dimm ); if ( SUCCESS != l_rc ) continue; @@ -762,7 +1372,7 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // Read the Module Health Register (0xA0) 7:0 errlHndl_t errl = deviceRead( dimm, &data, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(MODULE_HEALTH) ); + DEVICE_NVDIMM_ADDRESS(NVDIMM::i2cReg::MODULE_HEALTH) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to read Module Health Register. " @@ -775,6 +1385,30 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // BIT 0: Persistency Lost if ( bitList.count(0) ) { + // Analyze Health Status0 Reg, Health Status1 Reg, + // and Error Theshold Status Reg + l_rc = __analyzeHealthStatus0Reg( io_sc, dimm, errFound ); + if ( SUCCESS != l_rc ) continue; + l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound ); + if ( SUCCESS != l_rc ) continue; + bool esTempErr = false; + l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr); + if ( SUCCESS != l_rc ) continue; + + // If we hit an ES temperature error and have not yet hit threshold, + // then keep the log hidden. + if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue; + + // If we didn't find any error, then keep the log hidden. + if ( !errFound ) + { + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_FirEvntGone ); + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + continue; + } + // EVENT_N cannot be retriggered on a new PERSISTENCY_LOST_ERROR // if a previous PERSISTENCY_LOST_ERROR still exists. Meaning, we // cannot detect/report multiple errors that happen at different @@ -782,43 +1416,77 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, // and make the log predictive. io_sc.service_data->SetThresholdMaskId(0); - // Send persistency lost message to PHYP - l_rc = PlatServices::nvdimmNotifyPhypProtChange( dimm, - NVDIMM::UNPROTECTED_BECAUSE_ERROR ); + // Send message to PHYP that save/restore may work + l_rc = PlatServices::nvdimmNotifyProtChange( dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); if ( SUCCESS != l_rc ) continue; - // Analyze Health Status0 Reg, Health Status1 Reg, - // and Error Theshold Status Reg - l_rc = __analyzeHealthStatus0Reg( io_sc, dimm ); - if ( SUCCESS != l_rc ) continue; - l_rc = __analyzeHealthStatus1Reg( io_sc, dimm ); - if ( SUCCESS != l_rc ) continue; - l_rc = __analyzeErrorThrStatusReg( io_sc, dimm ); + } + // BIT 1: Warning Threshold Exceeded + else if ( bitList.count(1) ) + { + l_rc = __analyzeWarningThrStatusReg( io_sc, dimm, errFound ); if ( SUCCESS != l_rc ) continue; + + if ( !errFound ) + { + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_FirEvntGone ); + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + continue; + } } - // BIT 1: Warning Threshold Exceeded -- ignore // BIT 2: Persistency Restored - if ( bitList.count(2) ) + else if ( bitList.count(2) ) { // It would be rare to have an intermittent error that comes and // goes so fast we only see PERSISTENCY_RESTORED and not // PERSISTENCY_LOST_ERROR. Set predictive on threshold of 32 // per day (rule code handles the thresholding), else just keep // as a hidden log. - io_sc.service_data->AddSignatureList( dimm, PRDFSIG_NvdimmPersRes ); + __addSignature( io_sc, i_chip->getTrgt(), errFound, + PRDFSIG_NvdimmPersRes ); + + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + } + // BIT 3: Below Warning Threshold + else if ( bitList.count(3) ) + { + // Much like the persistency restored bit above, we don't expect + // to see this, so just make a hidden log. + __addSignature( io_sc, i_chip->getTrgt(), errFound, + PRDFSIG_BelowWarnTh ); + + // Callout NVDIMM + io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD ); + } + // BIT 4: Hardware Failure -- ignore - no logic feeding this + // BIT 5: EVENT_N_LOW -- ignore + // BIT 6:7: Unused + + // If we reach a threshold on MCACALFIR[8] of 32 per day, we assume + // some intermittent error must be triggering the FIR that isn't a + // persistency lost error which would cause us to mask. The rule code + // handles the actual thresholding here. + if ( io_sc.service_data->IsAtThreshold() && !errFound ) + { + io_sc.service_data->setSignature( i_chip->getHuid(), + PRDFSIG_IntNvdimmErr ); // callout NVDIMM high, cable high, BPM high, no gard io_sc.service_data->SetCallout( dimm, MRU_HIGH, NO_GARD ); l_rc = __addBpmCallout( dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != l_rc ) continue; - l_rc = __addNvdimmCableCallout( HWAS::SRCI_PRIORITY_HIGH ); + l_rc = __addNvdimmCableCallout( dimm, HWAS::SRCI_PRIORITY_HIGH ); if ( SUCCESS != l_rc ) continue; - } - // BIT 3: Below Warning Threshold -- ignore - // BIT 4: Hardware Failure -- ignore - // BIT 5: EVENT_N_LOW -- ignore - // BIT 6:7: Unused + // Send message to PHYP that save/restore may work + l_rc = PlatServices::nvdimmNotifyProtChange( dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != l_rc ) continue; + } } #else // IPL only @@ -826,7 +1494,14 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, PRDF_ERR( PRDF_FUNC "Unexpected call to analyze NVDIMMs at IPL." ); io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD ); - #endif + #endif // end runtime vs IPL check + + #else // CONFIG_NVDIMM not defined + + PRDF_ERR( PRDF_FUNC "CONFIG_NVDIMM not defined." ); + io_sc.service_data->SetCallout( LEVEL2_SUPPORT, MRU_HIGH, NO_GARD ); + + #endif // end CONFIG_NVDIMM check return SUCCESS; // nothing to return to rule code |