summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H 2
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfP9Mca.C 80
2 files changed, 66 insertions, 16 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
index 05cf5d0d8..7bcf0e573 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
@@ -112,6 +112,8 @@ PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Thresh
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present");
+PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm");
+PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm");
#endif // __prdfMemExtraSig_H
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
index 19e90c01b..fc32c557b 100644
--- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
+++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
@@ -739,15 +739,18 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
* @param io_sc The step code data struct.
* @param i_dimm The target dimm.
* @param io_errFound Whether an error has already been found or not.
+ * @param o_esTempErr A flag for whether we hit an ES TEMP error or not.
* @return FAIL if unable to read register, else SUCCESS
*/
uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
- TargetHandle_t i_dimm, bool & io_errFound )
+ TargetHandle_t i_dimm, bool & io_errFound,
+ bool & o_esTempErr )
{
#define PRDF_FUNC "[__analyzeErrorThrStatusReg] "
uint32_t o_rc = SUCCESS;
uint8_t data = 0;
+ o_esTempErr = false;
// Get MCA, for signatures
TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA );
@@ -787,21 +790,29 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// BIT 2: ES Temperature Error
if ( bitList.count(2) )
{
+ // Sleep two seconds to avoid exiting PRD analysis faster than the
+ // ES_TEMP sample rate.
+ PlatServices::milliSleep( 2, 0 );
+
// Read the ES_TEMP and ES_TEMP_ERROR_HIGH_THRESHOLD values
uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1;
uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0;
-
uint16_t esTemp = 0;
o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp );
if ( SUCCESS != o_rc ) break;
uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD1;
uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD0;
-
uint16_t esTempHighTh = 0;
o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh );
if ( SUCCESS != o_rc ) break;
+ msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD1;
+ lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD0;
+ uint16_t esTempLowTh = 0;
+ o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh );
+ if ( SUCCESS != o_rc ) break;
+
// Check to see if the ES_TEMP is negative (bit 12)
bool esTempNeg = false;
if ( esTemp & 0x1000 ) esTempNeg = true;
@@ -814,12 +825,20 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpErrHigh );
}
- // Else assume the warning is because of a low threshold.
- else
+ // Else check if the error hit the low threshold, again with the
+ // same 2°C margin.
+ else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg )
{
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpErrLow );
}
+ // Else the temperature must have gone back to a normal value, so
+ // we will label this as a false alarm case.
+ else
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpErrFa );
+ }
// Callout BPM (backup power module) high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
@@ -827,6 +846,8 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+
+ o_esTempErr = true;
io_errFound = true;
}
// BIT 3:7: Reserved
@@ -1072,21 +1093,29 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
// BIT 2: ES_TEMP_WARNING
if ( bitList.count(2) )
{
+ // Sleep two seconds to avoid exiting PRD analysis faster than the
+ // ES_TEMP sample rate.
+ PlatServices::milliSleep( 2, 0 );
+
// Read the ES_TEMP and ES_TEMP_WARNING_HIGH_THRESHOLD values
uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1;
uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0;
-
uint16_t esTemp = 0;
o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp );
if ( SUCCESS != o_rc ) break;
uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD1;
uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD0;
-
uint16_t esTempHighTh = 0;
o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh );
if ( SUCCESS != o_rc ) break;
+ msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD1;
+ lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD0;
+ uint16_t esTempLowTh = 0;
+ o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh );
+ if ( SUCCESS != o_rc ) break;
+
// Check to see if the ES_TEMP is negative (bit 12)
bool esTempNeg = false;
if ( esTemp & 0x1000 ) esTempNeg = true;
@@ -1099,12 +1128,20 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpWarnHigh );
}
- // Else assume the warning is because of a low threshold.
- else
+ // Else check if the warning hit the low threshold, again with the
+ // same 2°C margin.
+ else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg )
{
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpWarnLow );
}
+ // Else the temperature must have gone back to a normal value, so
+ // we will label this as a false alarm case.
+ else
+ {
+ __addSignature( io_sc, mca, io_errFound,
+ PRDFSIG_EsTmpWarnFa );
+ }
// Callout BPM (backup power module) high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
@@ -1113,13 +1150,19 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
- // Make the log predictive and mask the FIR.
- io_sc.service_data->SetThresholdMaskId(0);
+ // Because of the possibility of intermittent ES temperature
+ // false alarm readings, we will keep the log hidden. If there is
+ // an actual ES temperature problem, we assume we will continue
+ // to be called to handle the temperature warning and hit threshold.
- // Send message to PHYP that save/restore may work
- o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
- NVDIMM::NVDIMM_RISKY_HW_ERROR );
- if ( SUCCESS != o_rc ) break;
+ // Only send the save/restore message to PHYP if we hit threshold.
+ if ( io_sc.service_data->IsAtThreshold() )
+ {
+ // Send message to PHYP that save/restore may work
+ o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != o_rc ) break;
+ }
io_errFound = true;
}
@@ -1348,9 +1391,14 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
if ( SUCCESS != l_rc ) continue;
l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
if ( SUCCESS != l_rc ) continue;
- l_rc = __analyzeErrorThrStatusReg( io_sc, dimm, errFound );
+ bool esTempErr = false;
+ l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr);
if ( SUCCESS != l_rc ) continue;
+ // If we hit an ES temperature error and have not yet hit threshold,
+ // then keep the log hidden.
+ if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue;
+
// If we didn't find any error, then keep the log hidden.
if ( !errFound )
{
OpenPOWER on IntegriCloud