diff options
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H | 2 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfP9Mca.C | 80 |
2 files changed, 66 insertions, 16 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H index 05cf5d0d8..7bcf0e573 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H @@ -112,6 +112,8 @@ PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Thresh PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error"); PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error"); PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present"); +PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm"); +PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm"); #endif // __prdfMemExtraSig_H diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 19e90c01b..fc32c557b 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -739,15 +739,18 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg, * @param io_sc The step code data struct. * @param i_dimm The target dimm. * @param io_errFound Whether an error has already been found or not. + * @param o_esTempErr A flag for whether we hit an ES TEMP error or not. * @return FAIL if unable to read register, else SUCCESS */ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, - TargetHandle_t i_dimm, bool & io_errFound ) + TargetHandle_t i_dimm, bool & io_errFound, + bool & o_esTempErr ) { #define PRDF_FUNC "[__analyzeErrorThrStatusReg] " uint32_t o_rc = SUCCESS; uint8_t data = 0; + o_esTempErr = false; // Get MCA, for signatures TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA ); @@ -787,21 +790,29 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // BIT 2: ES Temperature Error if ( bitList.count(2) ) { + // Sleep two seconds to avoid exiting PRD analysis faster than the + // ES_TEMP sample rate. + PlatServices::milliSleep( 2, 0 ); + // Read the ES_TEMP and ES_TEMP_ERROR_HIGH_THRESHOLD values uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1; uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0; - uint16_t esTemp = 0; o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp ); if ( SUCCESS != o_rc ) break; uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD1; uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD0; - uint16_t esTempHighTh = 0; o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh ); if ( SUCCESS != o_rc ) break; + msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD1; + lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD0; + uint16_t esTempLowTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh ); + if ( SUCCESS != o_rc ) break; + // Check to see if the ES_TEMP is negative (bit 12) bool esTempNeg = false; if ( esTemp & 0x1000 ) esTempNeg = true; @@ -814,12 +825,20 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsTmpErrHigh ); } - // Else assume the warning is because of a low threshold. - else + // Else check if the error hit the low threshold, again with the + // same 2°C margin. + else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg ) { __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsTmpErrLow ); } + // Else the temperature must have gone back to a normal value, so + // we will label this as a false alarm case. + else + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpErrFa ); + } // Callout BPM (backup power module) high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); @@ -827,6 +846,8 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + + o_esTempErr = true; io_errFound = true; } // BIT 3:7: Reserved @@ -1072,21 +1093,29 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, // BIT 2: ES_TEMP_WARNING if ( bitList.count(2) ) { + // Sleep two seconds to avoid exiting PRD analysis faster than the + // ES_TEMP sample rate. + PlatServices::milliSleep( 2, 0 ); + // Read the ES_TEMP and ES_TEMP_WARNING_HIGH_THRESHOLD values uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1; uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0; - uint16_t esTemp = 0; o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp ); if ( SUCCESS != o_rc ) break; uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD1; uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD0; - uint16_t esTempHighTh = 0; o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh ); if ( SUCCESS != o_rc ) break; + msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD1; + lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD0; + uint16_t esTempLowTh = 0; + o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh ); + if ( SUCCESS != o_rc ) break; + // Check to see if the ES_TEMP is negative (bit 12) bool esTempNeg = false; if ( esTemp & 0x1000 ) esTempNeg = true; @@ -1099,12 +1128,20 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsTmpWarnHigh ); } - // Else assume the warning is because of a low threshold. - else + // Else check if the warning hit the low threshold, again with the + // same 2°C margin. + else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg ) { __addSignature( io_sc, mca, io_errFound, PRDFSIG_EsTmpWarnLow ); } + // Else the temperature must have gone back to a normal value, so + // we will label this as a false alarm case. + else + { + __addSignature( io_sc, mca, io_errFound, + PRDFSIG_EsTmpWarnFa ); + } // Callout BPM (backup power module) high o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); @@ -1113,13 +1150,19 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, // Callout NVDIMM low, no gard io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); - // Make the log predictive and mask the FIR. - io_sc.service_data->SetThresholdMaskId(0); + // Because of the possibility of intermittent ES temperature + // false alarm readings, we will keep the log hidden. If there is + // an actual ES temperature problem, we assume we will continue + // to be called to handle the temperature warning and hit threshold. - // Send message to PHYP that save/restore may work - o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, - NVDIMM::NVDIMM_RISKY_HW_ERROR ); - if ( SUCCESS != o_rc ) break; + // Only send the save/restore message to PHYP if we hit threshold. + if ( io_sc.service_data->IsAtThreshold() ) + { + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + } io_errFound = true; } @@ -1348,9 +1391,14 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip, if ( SUCCESS != l_rc ) continue; l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound ); if ( SUCCESS != l_rc ) continue; - l_rc = __analyzeErrorThrStatusReg( io_sc, dimm, errFound ); + bool esTempErr = false; + l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr); if ( SUCCESS != l_rc ) continue; + // If we hit an ES temperature error and have not yet hit threshold, + // then keep the log hidden. + if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue; + // If we didn't find any error, then keep the log hidden. if ( !errFound ) { |