diff options
author | Caleb Palmer <cnpalmer@us.ibm.com> | 2019-09-06 16:28:31 -0500 |
---|---|---|
committer | Zane C Shelley <zshelle@us.ibm.com> | 2019-09-11 22:14:41 -0500 |
commit | b08fe621d31fd7aeaca6ef27cb8b11ce5b07e259 (patch) | |
tree | 7f07a27abcec22e74e1483c61a37fdd74fdd3ebd | |
parent | f5de75d9d9b1b648ed0ad6d782390b0a793017ff (diff) | |
download | talos-hostboot-b08fe621d31fd7aeaca6ef27cb8b11ce5b07e259.tar.gz talos-hostboot-b08fe621d31fd7aeaca6ef27cb8b11ce5b07e259.zip |
PRD: Update NVDIMM warning threshold adjustment
Change-Id: I6ea62e85eddba8eec04a47478462d3493db13cfb
CQ: SW474559
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/83434
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Reviewed-by: Brian J Stegmiller <bjs@us.ibm.com>
Reviewed-by: Paul Greenwood <paul.greenwood@ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/83457
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H | 1 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfP9Mca.C | 182 |
2 files changed, 145 insertions, 38 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H index b9604f56d..7f078957b 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H @@ -110,6 +110,7 @@ PRDR_ERROR_SIGNATURE(EsTmpWarnHigh, 0xffff0097, "", "NVDIMM Energy Source Temper PRDR_ERROR_SIGNATURE(EsTmpWarnLow, 0xffff0098, "", "NVDIMM Energy Source Temperature Warning - Low Temp Threshold"); PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Threshold"); PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error"); +PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error"); #endif // __prdfMemExtraSig_H diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 2292b4b22..c7a16e60f 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -842,20 +842,27 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc, /** * @brief Adjusts the warning threshold so that future warnings are allowed * to report. + * @param io_sc The step code data struct. * @param i_dimm The target nvdimm. * @param i_warnThReg The address of the relevant warning threshold register. * @param i_errThReg The address of the relevant error threshold register. * @param o_firstWarn Flag if this is the first warning of this type. + * @param o_statusErr Flag to tell if we found an error from checking the + * notification status register. 
* @return FAIL if unable to read register, else SUCCESS */ -uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg, - uint16_t i_errThReg, bool & o_firstWarn ) +uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc, + TargetHandle_t i_dimm, uint16_t i_warnThReg, + uint16_t i_errThReg, bool & o_firstWarn, + bool & o_statusErr ) { #define PRDF_FUNC "[__adjustThreshold] " uint32_t o_rc = SUCCESS; - uint16_t notifReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD; + uint16_t notifCmdReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_CMD; + uint16_t notifStatusReg = NVDIMM::i2cReg::SET_EVENT_NOTIFICATION_STATUS; o_firstWarn = false; + o_statusErr = false; do { @@ -894,33 +901,65 @@ uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg, { o_firstWarn = true; - // Set SET_EVENT_NOTIFICATION_CMD[1]: Warning Threshold - // Notification = 0 - // First read the register. - uint8_t warnThNotif = 0; - errl = deviceRead( i_dimm, &warnThNotif, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(notifReg) ); + // SET_EVENT_NOTIFICATION_CMD is a write only register that is + // used to change the SET_EVENT_NOTIFICATION_STATUS register. + // The only bits within it that are used are bits 0 and 1, as such + // we can safely set the rest to 0. It is defined as: + // [0]: Persistency Notification + // [1]: Warning Threshold Notification + // [2]: Obsolete + // [3]: Firmware Activation Notification (Not Used) + // [4:7]: Reserved + + // Clear SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set + uint8_t notifCmd = 0x01; + errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifCmdReg) ); if ( errl ) { - PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification " "Cmd Reg. 
HUID: 0x%08x", getHuid(i_dimm) ); PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); o_rc = FAIL; break; } - // Clear bit 1 - warnThNotif &= 0xfd; - errl = deviceWrite( i_dimm, &warnThNotif, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(notifReg) ); + // Check SET_EVENT_NOTIFICATION_STATUS to ensure everything is set + // as we expect and we don't see any errors. + uint8_t notifStat = 0; + errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifStatusReg) ); if ( errl ) { - PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification " - "Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + "Status Reg. HUID: 0x%08x", getHuid(i_dimm) ); PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); o_rc = FAIL; break; } + std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat ); + + // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1 + // or Bit [2]: PERSISTENCY_ENABLED = 0 + // or Bit [3]: WARNING_THRESHOLD_ENABLED = 1 + if ( bitList.count(1) || !bitList.count(2) || bitList.count(3) ) + { + o_statusErr = true; + + // Make the log predictive and mask the fir + io_sc.service_data->SetThresholdMaskId(0); + + // Callout the NVDIMM, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + + break; + } + // Set the warning threshold to error threshold + 1 warnTh = errTh+1; @@ -935,12 +974,10 @@ uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg, break; } - // Set SET_EVENT_NOTIFICATION_CMD[1]: Warning Threshold - // Notification = 1 - // Set bit 1 - warnThNotif |= 0x02; - errl = deviceWrite( i_dimm, &warnThNotif, NVDIMM_SIZE, - DEVICE_NVDIMM_ADDRESS(notifReg) ); + // Set SET_EVENT_NOTIFICATION_CMD bit 1 and keep bit 0 set + notifCmd = 0x03; + errl = deviceWrite( i_dimm, &notifCmd, NVDIMM_SIZE, + 
DEVICE_NVDIMM_ADDRESS(notifCmdReg) ); if ( errl ) { PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification " @@ -950,6 +987,40 @@ uint32_t __adjustThreshold( TargetHandle_t i_dimm, uint16_t i_warnThReg, break; } + // Recheck SET_EVENT_NOTIFICATION_STATUS to ensure everything is set + // as we expect and we don't see any errors. + errl = deviceRead( i_dimm, &notifStat, NVDIMM_SIZE, + DEVICE_NVDIMM_ADDRESS(notifStatusReg) ); + if ( errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification " + "Status Reg. HUID: 0x%08x", getHuid(i_dimm) ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + bitList = __nvdimmGetActiveBits( notifStat ); + + // if Bit [1]: SET_EVENT_NOTIFICATION_ERROR = 1 + // or Bit [2]: PERSISTENCY_ENABLED = 0 + // or Bit [3]: WARNING_THRESHOLD_ENABLED = 0 + if ( bitList.count(1) || !bitList.count(2) || !bitList.count(3) ) + { + o_statusErr = true; + + // Make the log predictive and mask the fir + io_sc.service_data->SetThresholdMaskId(0); + + // Callout the NVDIMM, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + + // Send message to PHYP that save/restore may work + o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm, + NVDIMM::NVDIMM_RISKY_HW_ERROR ); + if ( SUCCESS != o_rc ) break; + + break; + } } // Note: moving the threshold should clear the warning from // WARNING_THRESHOLD_STATUS, which allows future warnings to report. @@ -1058,29 +1129,46 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, // Make the log predictive, but do not mask the FIR io_sc.service_data->setServiceCall(); - // Callout NVDIMM on 1st, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); - // Adjust warning threshold. 
uint16_t warnThReg = NVDIMM::i2cReg::NVM_LIFETIME_WARNING_THRESHOLD; uint16_t errThReg = NVDIMM::i2cReg::NVM_LIFETIME_ERROR_THRESHOLD; bool firstWarn = false; - o_rc = __adjustThreshold( i_dimm, warnThReg, errThReg, firstWarn ); + bool statusErr = false; + o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg, + firstWarn, statusErr ); if ( SUCCESS != o_rc ) break; + // If we got a set event notification status error, add the + // signature for that before adding the signature for the warning. + // Also do not take our normal callout action since we already will + // have called out the NVDIMM because of the status error. + if ( statusErr ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr ); + + // Need to set io_errFound here so the warning signature is + // added to the multi-signature list instead of as the primary + // signature. + io_errFound = true; + } + else + { + // Callout NVDIMM on 1st, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_MED, NO_GARD ); + } + // Update signature depending on whether this is the first or second // warning of this type. if ( firstWarn ) { - __addSignature( io_sc, mca, io_errFound, - PRDFSIG_NvmLifeWarn1 ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn1 ); } else { - __addSignature( io_sc, mca, io_errFound, - PRDFSIG_NvmLifeWarn2 ); + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NvmLifeWarn2 ); } + io_errFound = true; } // BIT 1: ES_LIFETIME_WARNING @@ -1089,20 +1177,38 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc, // Make the log predictive, but do not mask the FIR io_sc.service_data->setServiceCall(); - // Callout BPM (backup power module) high - o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); - if ( SUCCESS != o_rc ) break; - - // Callout NVDIMM low, no gard - io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); - // Adjust warning threshold. 
uint16_t warnThReg = NVDIMM::i2cReg::ES_LIFETIME_WARNING_THRESHOLD; uint16_t errThReg = NVDIMM::i2cReg::ES_LIFETIME_ERROR_THRESHOLD; bool firstWarn = false; - o_rc = __adjustThreshold( i_dimm, warnThReg, errThReg, firstWarn ); + bool statusErr = false; + o_rc = __adjustThreshold( io_sc, i_dimm, warnThReg, errThReg, + firstWarn, statusErr ); if ( SUCCESS != o_rc ) break; + // If we got a set event notification status error, add the + // signature for that before adding the signature for the warning. + // Also do not take our normal callout action since we already will + // have called out the NVDIMM because of the status error. + if ( statusErr ) + { + __addSignature( io_sc, mca, io_errFound, PRDFSIG_NotifStatErr ); + + // Need to set io_errFound here so the warning signature is + // added to the multi-signature list instead of as the primary + // signature. + io_errFound = true; + } + else + { + // Callout BPM (backup power module) high + o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH ); + if ( SUCCESS != o_rc ) break; + + // Callout NVDIMM low, no gard + io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD ); + } + // Update signature depending on whether this is the first or second // warning of this type. if ( firstWarn ) |