summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Caleb Palmer <cnpalmer@us.ibm.com> 2019-09-06 16:28:31 -0500
committer Zane C Shelley <zshelle@us.ibm.com> 2019-09-11 22:14:41 -0500
commit b08fe621d31fd7aeaca6ef27cb8b11ce5b07e259 (patch)
tree 7f07a27abcec22e74e1483c61a37fdd74fdd3ebd
parent f5de75d9d9b1b648ed0ad6d782390b0a793017ff (diff)
download talos-hostboot-b08fe621d31fd7aeaca6ef27cb8b11ce5b07e259.tar.gz
talos-hostboot-b08fe621d31fd7aeaca6ef27cb8b11ce5b07e259.zip
PRD: Update NVDIMM warning threshold adjustment
Change-Id: I6ea62e85eddba8eec04a47478462d3493db13cfb
CQ: SW474559
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/83434
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Reviewed-by: Brian J Stegmiller <bjs@us.ibm.com>
Reviewed-by: Paul Greenwood <paul.greenwood@ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/83457
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
-rw-r--r-- src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H 1
-rw-r--r-- src/usr/diag/prdf/plat/mem/prdfP9Mca.C 182
2 files changed, 145 insertions, 38 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
index b9604f56d..7f078957b 100644
--- a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
+++ b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
@@ -110,6 +110,7 @@ PRDR_ERROR_SIGNATURE(EsTmpWarnHigh, 0xffff0097, "", "NVDIMM Energy Source Temper
PRDR_ERROR_SIGNATURE(EsTmpWarnLow, 0xffff0098, "", "NVDIMM Energy Source Temperature Warning - Low Temp Threshold");
PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Threshold");
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
+PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
#endif // __prdfMemExtraSig_H
diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
index 2292b4b22..c7a16e60f 100644
--- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
+++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C
@@ -842,20 +842,27 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
/**
* @brief Adjusts the warning threshold so that future warnings are allowed
* to report.
+ * @param io_sc The step code data struct.
* @param i_dimm The target nvdimm.
* @param i_warnThReg The address of the relevant warning threshold register.
* @param i_errThReg The address of the relevant error threshold register.
* @param o_firstWarn Flag if this is the first warning of this type.
+ * @param o_statusErr Flag to tell if we found an error from checking the
+ * notification status register.
* @return FAIL if unable to read register, else SUCCESS
*/
-uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg,
- uint16_t i_errThReg, bool & o_firstWarn )
+uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
+ TargetHandle_t i_dimm, uint16_t i_warnThReg,
+ uint16_t i_errThReg, bool & o_firstWarn,
+ bool & o_statusErr )
{
#define PRDF_FUNC "[__adjustThreshold] "
uint32_t o_rc = SUCCESS;
- uint16_t notifReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD;
+ uint16_t notifCmdReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD;
+ uint16_t notifStatusReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_STATUS;
o_firstWarn = false;
+ o_statusErr = false;
do
{
@@ -894,33 +901,65 @@ uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg,
{
o_firstWarn = true;
- // Set SET_EVENT_NOTIFICATION_CMD[1]: Warning Threshold
- // Notification = 0
- // First read the register.
- uint8_t warnThNotif = 0;
- errl = deviceRead( i_dimm, &warnThNotif, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(notifReg) );
+ // SET_EVENT_NOTIFICATION_CMD is a write only register that is
+ // used to change the SET_EVENT_NOTIFICATION_STATUS register.
+ // The only bits within it that are used are bits 0 and 1, as such
+ // we can safely set the rest to 0. It is defined as:
+ // [0]: Persistency Notification
+ // [1]: Warning Threshold Notification
+ // [2]: Obsolete
+ // [3]: Firmware Activation Notification (Not Used)
+ // [4:7]: Reserved
+
+ // Clear SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set
+ uint8_t notifCmd = 0x01;
+ errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifCmdReg) );
if ( errl )
{
- PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
+ PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification "
"Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
break;
}
- // Clear bit 1
- warnThNotif &= 0xfd;
- errl = deviceWrite( i_dimm, &warnThNotif, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(notifReg) );
+ // Check SET_EVENT_NOTIFICATION_STATUS to ensure everything is set
+ // as we expect and we don't see any errors.
+ uint8_t notifStat = 0;
+ errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifStatusReg) );
if ( errl )
{
- PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification "
- "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
+ "Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
break;
}
+ std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat );
+
+ // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1
+ // or Bit [2]: PERSISTENCY_ENABLED = 0
+ // or Bit [3]: WARNING_THRESHOLD_ENABLED = 1
+ if ( bitList.count(1) || !bitList.count(2) || bitList.count(3) )
+ {
+ o_statusErr = true;
+
+ // Make the log predictive and mask the fir
+ io_sc.service_data->SetThresholdMaskId(0);
+
+ // Callout the NVDIMM, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+
+ // Send message to PHYP that save/restore may work
+ o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != o_rc ) break;
+
+ break;
+ }
+
// Set the warning threshold to error threshold + 1
warnTh = errTh+1;
@@ -935,12 +974,10 @@ uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg,
break;
}
- // Set SET_EVENT_NOTIFICATION_CMD[1]: Warning Threshold
- // Notification = 1
- // Set bit 1
- warnThNotif |= 0x02;
- errl = deviceWrite( i_dimm, &warnThNotif, NVDIMM_SIZE,
- DEVICE_NVDIMM_ADDRESS(notifReg) );
+ // Set SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set
+ notifCmd = 0x03;
+ errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifCmdReg) );
if ( errl )
{
PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification "
@@ -950,6 +987,40 @@ uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg,
break;
}
+ // Recheck SET_EVENT_NOTIFICATION_STATUS to ensure everything is set
+ // as we expect and we don't see any errors.
+ errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE,
+ DEVICE_NVDIMM_ADDRESS(notifStatusReg) );
+ if ( errl )
+ {
+ PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
+ "Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
+ PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
+ o_rc = FAIL;
+ break;
+ }
+ bitList = __nvdimmGetActiveBits( notifStat );
+
+ // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1
+ // or Bit [2]: PERSISTENCY_ENABLED = 0
+ // or Bit [3]: WARNING_THRESHOLD_ENABLED = 0
+ if ( bitList.count(1) || !bitList.count(2) || !bitList.count(3) )
+ {
+ o_statusErr = true;
+
+ // Make the log predictive and mask the fir
+ io_sc.service_data->SetThresholdMaskId(0);
+
+ // Callout the NVDIMM, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+
+ // Send message to PHYP that save/restore may work
+ o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
+ NVDIMM::NVDIMM_RISKY_HW_ERROR );
+ if ( SUCCESS != o_rc ) break;
+
+ break;
+ }
}
// Note: moving the threshold should clear the warning from
// WARNING_THRESHOLD_STATUS, which allows future warnings to report.
@@ -1058,29 +1129,46 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
// Make the log predictive, but do not mask the FIR
io_sc.service_data->setServiceCall();
- // Callout NVDIMM on 1st, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
-
// Adjust warning threshold.
uint16_t warnThReg = NVDIMM::i2cReg::NVM_LIFETIME_WARNING_THRESHOLD;
uint16_t errThReg = NVDIMM::i2cReg::NVM_LIFETIME_ERROR_THRESHOLD;
bool firstWarn = false;
- o_rc = __adjustThreshold( i_dimm, warnThReg, errThReg, firstWarn );
+ bool statusErr = false;
+ o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg,
+ firstWarn, statusErr );
if ( SUCCESS != o_rc ) break;
+ // If we got a set event notification status error, add the
+ // signature for that before adding the signature for the warning.
+ // Also do not take our normal callout action since we already will
+ // have called out the NVDIMM because of the status error.
+ if ( statusErr )
+ {
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr );
+
+ // Need to set io_errFound here so the warning signature is
+ // added to the multi-signature list instead of as the primary
+ // signature.
+ io_errFound = true;
+ }
+ else
+ {
+ // Callout NVDIMM on 1st, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD );
+ }
+
// Update signature depending on whether this is the first or second
// warning of this type.
if ( firstWarn )
{
- __addSignature( io_sc, mca, io_errFound,
- PRDFSIG_NvmLifeWarn1 );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn1 );
}
else
{
- __addSignature( io_sc, mca, io_errFound,
- PRDFSIG_NvmLifeWarn2 );
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn2 );
}
+
io_errFound = true;
}
// BIT 1: ES_LIFETIME_WARNING
@@ -1089,20 +1177,38 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
// Make the log predictive, but do not mask the FIR
io_sc.service_data->setServiceCall();
- // Callout BPM (backup power module) high
- o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
- if ( SUCCESS != o_rc ) break;
-
- // Callout NVDIMM low, no gard
- io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
-
// Adjust warning threshold.
uint16_t warnThReg = NVDIMM::i2cReg::ES_LIFETIME_WARNING_THRESHOLD;
uint16_t errThReg = NVDIMM::i2cReg::ES_LIFETIME_ERROR_THRESHOLD;
bool firstWarn = false;
- o_rc = __adjustThreshold( i_dimm, warnThReg, errThReg, firstWarn );
+ bool statusErr = false;
+ o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg,
+ firstWarn, statusErr );
if ( SUCCESS != o_rc ) break;
+ // If we got a set event notification status error, add the
+ // signature for that before adding the signature for the warning.
+ // Also do not take our normal callout action since we already will
+ // have called out the NVDIMM because of the status error.
+ if ( statusErr )
+ {
+ __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr );
+
+ // Need to set io_errFound here so the warning signature is
+ // added to the multi-signature list instead of as the primary
+ // signature.
+ io_errFound = true;
+ }
+ else
+ {
+ // Callout BPM (backup power module) high
+ o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
+ if ( SUCCESS != o_rc ) break;
+
+ // Callout NVDIMM low, no gard
+ io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );
+ }
+
// Update signature depending on whether this is the first or second
// warning of this type.
if ( firstWarn )
OpenPOWER on IntegriCloud