From ce0d29c96c4619e07819760b84c32f0eb7812b5c Mon Sep 17 00:00:00 2001 From: Corey Swenson Date: Fri, 6 Sep 2019 01:34:02 -0500 Subject: Add vendor log data to FFDC for all NVDIMM HW errors Read vendor log data from NVDIMM Do checksum compare Add to error log as string Change-Id: I41a295bf54d031c978b59fe1f59c98507fbeec81 CQ:SW473585 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/83383 Tested-by: Jenkins Server Tested-by: FSP CI Jenkins Reviewed-by: Daniel M Crowell --- src/usr/isteps/nvdimm/bpm_update.C | 23 ++ src/usr/isteps/nvdimm/nvdimm.C | 658 ++++++++++++++++++++++++++++++ src/usr/isteps/nvdimm/nvdimm_update.C | 13 + src/usr/isteps/nvdimm/runtime/nvdimm_rt.C | 7 + 4 files changed, 701 insertions(+) (limited to 'src/usr/isteps/nvdimm') diff --git a/src/usr/isteps/nvdimm/bpm_update.C b/src/usr/isteps/nvdimm/bpm_update.C index 76704c1c7..7910c1e5b 100644 --- a/src/usr/isteps/nvdimm/bpm_update.C +++ b/src/usr/isteps/nvdimm/bpm_update.C @@ -739,6 +739,7 @@ errlHndl_t Bpm::issueCommand(const uint8_t i_command, HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -867,6 +868,7 @@ errlHndl_t Bpm::issueCommand(const uint8_t i_command, HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } } @@ -1163,6 +1165,7 @@ errlHndl_t Bpm::inUpdateMode() TRACFCOMP(g_trac_bpm, "Bpm::inUpdateMode(): " "Failed to read error status register"); errl->collectTrace(BPM_COMP_NAME); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -1189,6 +1192,7 @@ errlHndl_t Bpm::inUpdateMode() HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -1252,6 +1256,7 @@ errlHndl_t Bpm::enterUpdateMode() TARGETING::get_huid(iv_nvdimm)); infoErrl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); ERRORLOG::errlCommit(infoErrl, BPM_COMP_ID); } while(0); @@ -1346,6 +1351,7 @@ errlHndl_t Bpm::exitUpdateMode() HWAS::SRCI_PRIORITY_HIGH); infoErrl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); ERRORLOG::errlCommit(infoErrl, BPM_COMP_ID); } while(0); @@ -1508,6 +1514,7 @@ errlHndl_t Bpm::updateFirmware(BpmFirmwareLidImage i_image) HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); // Change the state of iv_attemptAnotherUpdate to signal // if another update attempt should occur. @@ -1698,6 +1705,7 @@ errlHndl_t Bpm::enterBootstrapLoaderMode() errl->addPartCallout(iv_nvdimm, HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); + nvdimmAddVendorLog(iv_nvdimm, errl); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); break; @@ -1795,6 +1803,7 @@ errlHndl_t Bpm::setupPayload(payload_t & o_payload, TARGETING::get_huid(iv_nvdimm)); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -2141,6 +2150,7 @@ errlHndl_t Bpm::writeViaScapRegister(uint8_t const i_reg, uint8_t const i_data) HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -2236,6 +2246,7 @@ errlHndl_t Bpm::writeViaScapRegister(uint8_t const i_reg, uint8_t const i_data) HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -2308,6 +2319,7 @@ errlHndl_t Bpm::disableWriteProtection() HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -2482,6 +2494,7 @@ errlHndl_t Bpm::writeToMagicRegisters( HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -2578,6 +2591,7 @@ errlHndl_t Bpm::dumpSegment(uint16_t const i_segmentCode, HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -2702,6 +2716,7 @@ errlHndl_t Bpm::dumpSegment(uint16_t const i_segmentCode, HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); setAttemptAnotherUpdate(); break; } @@ -3169,6 +3184,7 @@ errlHndl_t Bpm::getResponse(uint8_t * const o_responseData, HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -3348,6 +3364,7 @@ errlHndl_t Bpm::blockWrite(payload_t i_payload) HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); + nvdimmAddVendorLog(iv_nvdimm, errl); } @@ -3432,6 +3449,7 @@ errlHndl_t Bpm::blockWriteRetry(payload_t i_payload) HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); } @@ -3504,6 +3522,7 @@ errlHndl_t Bpm::waitForCommandStatusBitReset( HWAS::SRCI_PRIORITY_HIGH); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -3547,6 +3566,7 @@ errlHndl_t Bpm::waitForCommandStatusBitReset( TARGETING::get_huid(iv_nvdimm)); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -3611,6 +3631,7 @@ errlHndl_t Bpm::verifyGoodBpmState() status.full); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); } return errl; @@ -3664,6 +3685,7 @@ errlHndl_t Bpm::waitForBusyBit() TARGETING::get_huid(iv_nvdimm)); errl->collectTrace(BPM_COMP_NAME); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } @@ -3972,6 +3994,7 @@ errlHndl_t Bpm::checkFirmwareCrc() 0), TARGETING::get_huid(iv_nvdimm)); nvdimmAddPage4Regs(iv_nvdimm,errl); + nvdimmAddVendorLog(iv_nvdimm, errl); break; } diff --git a/src/usr/isteps/nvdimm/nvdimm.C b/src/usr/isteps/nvdimm/nvdimm.C index 3e9867ebe..2c2629d22 100644 --- a/src/usr/isteps/nvdimm/nvdimm.C +++ b/src/usr/isteps/nvdimm/nvdimm.C @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -136,6 +137,24 @@ static constexpr uint8_t KEY_ABORT_BYTE = 0xFF; // Currently only bits 1:6 need to be checked during init static constexpr uint8_t CSAVE_FAIL_BITS_MASK = 0x7E; +// LOG PAGE INFO +static constexpr size_t VENDOR_LOG_UNIT_SIZE = 256; +static constexpr size_t VENDOR_LOG_BLOCK_SIZE = 32; +static constexpr size_t VENDOR_BLOCK_DATA_BYTES = 32; + +// TYPED_BLOCK_DATA +static constexpr uint8_t VENDOR_DATA_TYPE = 0x04; +static constexpr uint8_t VENDOR_DEFAULT = 0x00; +static constexpr uint8_t FIRMWARE_IMAGE_DATA = 0x02; + +// Commands to OPERATIONAL_UNIT_OPS_CMD +static constexpr uint8_t GET_OPERATIONAL_UNIT = 0x01; +static constexpr uint8_t GENERATE_OPERATIONAL_UNIT_CKSUM = 0x08; + +static constexpr uint8_t MSBIT_SET_MASK = 0x80; +static constexpr uint8_t MSBIT_CLR_MASK = 0x7F; +static constexpr uint8_t OPERATION_SLEEP_SECONDS = 0x1; + #ifndef __HOSTBOOT_RUNTIME // Warning thresholds static constexpr uint8_t THRESHOLD_ES_LIFETIME = 0x07; // 7% @@ -575,6 +594,7 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) // Add Register Traces to error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); nvdimmAddPage4Regs(i_nvdimm,l_err); + nvdimmAddVendorLog(i_nvdimm, l_err); } }while(0); @@ -707,6 +727,7 @@ errlHndl_t nvdimmPollStatus ( Target *i_nvdimm, l_err->collectTrace(NVDIMM_COMP_NAME); nvdimmAddPage4Regs(i_nvdimm,l_err); + nvdimmAddVendorLog(i_nvdimm, l_err); } return l_err; @@ -756,6 +777,7 @@ errlHndl_t nvdimmPollBackupDone(Target* i_nvdimm, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME ); + nvdimmAddVendorLog(i_nvdimm, l_err); // Collect register data for FFDC Traces nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); @@ -824,6 +846,7 @@ errlHndl_t nvdimmPollRestoreDone(Target* i_nvdimm, // Collect register data for FFDC Traces nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); nvdimmAddPage4Regs(i_nvdimm,l_err); + nvdimmAddVendorLog(i_nvdimm, l_err); // Add reg traces to the error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); @@ -879,6 +902,7 @@ errlHndl_t nvdimmPollEraseDone(Target* i_nvdimm, l_err->collectTrace( NVDIMM_COMP_NAME ); nvdimmAddPage4Regs(i_nvdimm,l_err); + nvdimmAddVendorLog(i_nvdimm, l_err); } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollEraseDone() nvdimm[%X]", @@ -1023,6 +1047,7 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) // Read relevant regs for trace data nvdimmTraceRegs(i_nvdimm, l_RegInfo); nvdimmAddPage4Regs(i_nvdimm,l_err); + nvdimmAddVendorLog(i_nvdimm, l_err); // Add reg traces to the error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); @@ -1263,6 +1288,7 @@ errlHndl_t nvdimmRestore(TargetHandleList& i_nvdimmList, uint8_t &i_mpipl) 0x0, ERRORLOG::ErrlEntry::NO_SW_CALLOUT); nvdimmAddPage4Regs(l_nvdimm,l_err); + nvdimmAddVendorLog(l_nvdimm, l_err); break; } @@ -1500,6 +1526,7 @@ errlHndl_t nvdimmOpenPage(Target *i_nvdimm, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(i_nvdimm, l_err); // Failure to open page most likely means problem with // the NV controller. @@ -1883,6 +1910,7 @@ errlHndl_t nvdimm_factory_reset(Target *i_nvdimm) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(i_nvdimm, l_err); // If nvdimm is not ready for access by now, this is // a failing indication on the NV controller @@ -2007,6 +2035,7 @@ errlHndl_t nvdimm_init(Target *i_nvdimm) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME ); + nvdimmAddVendorLog(i_nvdimm, l_err); // Failure to erase could mean internal NV controller error and/or // HW error on nand flash. NVDIMM will lose persistency if failed to @@ -2089,6 +2118,7 @@ errlHndl_t nvdimm_init(Target *i_nvdimm) // Collect register data for FFDC Traces nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); nvdimmAddPage4Regs(i_nvdimm,l_err); + nvdimmAddVendorLog(i_nvdimm, l_err); // Add reg traces to the error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); @@ -2452,6 +2482,7 @@ bool nvdimm_encrypt_unlock(TargetHandleList &i_nvdimmList) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); l_err->addPartCallout( l_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); @@ -2968,6 +2999,7 @@ errlHndl_t nvdimm_setKeyReg(Target* i_nvdimm, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(i_nvdimm, l_err); l_err->addPartCallout( i_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); @@ -3144,6 +3176,7 @@ bool nvdimm_encrypt_enable(TargetHandleList &i_nvdimmList) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); l_err->addPartCallout( l_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); @@ -3284,6 +3317,7 @@ bool nvdimm_crypto_erase(TargetHandleList &i_nvdimmList) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); l_err->addPartCallout( l_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); @@ -3359,6 +3393,7 @@ bool nvdimm_crypto_erase(TargetHandleList &i_nvdimmList) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); l_err->addPartCallout( l_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); @@ -3654,6 +3689,629 @@ errlHndl_t notifyNvdimmProtectionChange(Target* i_target, } +/* + * @brief Get operational unit operation timeout + */ +errlHndl_t getOperOpsTimeout(TARGETING::Target* i_nvdimm, + uint16_t& o_timeout) +{ + errlHndl_t l_err = nullptr; + + do + { + // Get timeout lsb + uint8_t l_lsb = 0; + l_err = nvdimmReadReg(i_nvdimm, + OPERATIONAL_UNIT_OPS_TIMEOUT0, + l_lsb); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getOperOpsTimeout() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_OPS_TIMEOUT0); + break; + } + + // Get timeout msb + uint8_t l_msb = 0; + l_err = nvdimmReadReg(i_nvdimm, + OPERATIONAL_UNIT_OPS_TIMEOUT1, + l_msb); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getOperOpsTimeout() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_OPS_TIMEOUT1); + break; + } + + // Bit 7 of the MSB indicates whether the time should + // be interpreted in seconds or milliseconds + // 0 = millisecond + // 1 = second + if (l_msb < MSBIT_SET_MASK) + { + o_timeout = l_msb; + o_timeout <<= 8; + o_timeout += l_lsb; + o_timeout = o_timeout / MS_PER_SEC; + } + else + { + l_msb = l_msb & MSBIT_CLR_MASK; + o_timeout = l_msb; + o_timeout <<= 8; + o_timeout += l_lsb; + } + + } while(0); + + return l_err; +} + + +/* + * @brief Wait for operational unit operation to complete + */ +errlHndl_t waitOperOpsComplete(TARGETING::Target* i_nvdimm, uint8_t i_cmd) +{ + errlHndl_t l_err = nullptr; + bool l_complete = false; + uint16_t l_timeout = 0; + uint8_t l_status = 0; + + // Get the timeout + l_err = getOperOpsTimeout(i_nvdimm, l_timeout); + + do + { + // Exit if l_timeout invalid + if (l_err) + { + break; + } + + // Delay before reading status + nanosleep( OPERATION_SLEEP_SECONDS, 0 ); + if (OPERATION_SLEEP_SECONDS > l_timeout) + { + l_timeout = 0; + } + else + { + l_timeout = l_timeout - OPERATION_SLEEP_SECONDS; + } + + // Get timeout cmd status 1 + l_err = nvdimmReadReg(i_nvdimm, + NVDIMM_CMD_STATUS1, + l_status); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "waitOperOpsComplete() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), NVDIMM_CMD_STATUS1); + break; + } + + if (l_status >= 0x01) + { + // If bit 1 is set that means the command is in progress + // Wait for it to become 0 + } + else + { + l_complete = true; + break; + } + + } while(l_timeout > 0); + + // Timed out + if (!l_err && (l_complete == false) ) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "waitOperOpsComplete() nvdimm[%X] " + "Timeout waiting for operation 0x%X to complete, " + "NVDIMM_CMD_STATUS1 0x%X", + get_huid(i_nvdimm), i_cmd, l_status); + + // Get the timeout value again + getOperOpsTimeout(i_nvdimm, l_timeout); + + /*@ + *@errortype + *@reasoncode NVDIMM_VENDOR_LOG_TIMEOUT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_WAIT_OPER_OPS_COMPLETE + *@userdata1[0:31] NVDIMM HUID + *@userdata1[32:63] OPERATIONAL_UNIT_OPS_CMD + *@userdata2[0:31] NVDIMM_CMD_STATUS1 + *@userdata2[32:63] OPERATIONAL_UNIT_OPS_TIMEOUT + *@devdesc NVDIMM timeout reading vendor log + *@custdesc NVDIMM logging error + */ + l_err = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_WAIT_OPER_OPS_COMPLETE, + NVDIMM_VENDOR_LOG_TIMEOUT, + TWO_UINT32_TO_UINT64( + get_huid(i_nvdimm), + i_cmd + ), + TWO_UINT32_TO_UINT64( + l_status, + l_timeout + ), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace(NVDIMM_COMP_NAME); + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + } + + return l_err; +} + + +/* + * @brief Get the vendor log unit + */ +errlHndl_t getLogPerUnit(TARGETING::Target* i_nvdimm, + uint16_t i_unitId, + std::vector& o_unitData) +{ + // 3a) write OPERATIONAL_UNIT_ID0 and OPERATIONAL_UNIT_ID1 with unit_id + // 3b) set OPERATIONAL_UNIT_OPS_CMD to GET_OPERATIONAL_UNIT + // 3c) wait for NVDIMM_CMD_STATUS1 to return 0 + // 3d) for (block_id = 0; + // block_id < VENDOR_LOG_UNIT_SIZE/BLOCKSIZE; + // block_id++) + // 3da) Write block_id to BLOCK_ID + // 3db) Read TYPED_BLOCK_DATA_BYTE0 to TYPED_BLOCK_DATA_BYTE31 + // 3dc) Save data to buffer + + errlHndl_t l_err = nullptr; + + do + { + // 3a) + // Write the unit LSB + l_err = nvdimmWriteReg(i_nvdimm, + OPERATIONAL_UNIT_ID0, + i_unitId & 0x00FF); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getLogPerUnit() nvdimm[%X] error writing reg 0x%X to 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_ID0, (i_unitId & 0x00FF)); + break; + } + + // Write the unit MSB + l_err = nvdimmWriteReg(i_nvdimm, + OPERATIONAL_UNIT_ID1, + i_unitId >> 8); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getLogPerUnit() nvdimm[%X] error writing reg 0x%X to 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_ID0, (i_unitId >> 8) ); + break; + } + + // 3b) + // Write the cmd + l_err = nvdimmWriteReg(i_nvdimm, + OPERATIONAL_UNIT_OPS_CMD, + GET_OPERATIONAL_UNIT); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getLogPerUnit() nvdimm[%X] error writing reg 0x%X to 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_OPS_CMD, + GET_OPERATIONAL_UNIT ); + break; + } + + // 3c + l_err = waitOperOpsComplete(i_nvdimm, GET_OPERATIONAL_UNIT); + if (l_err) + { + break; + } + + // 3d + for (uint8_t l_blockId = 0; + l_blockId < (VENDOR_LOG_UNIT_SIZE / VENDOR_LOG_BLOCK_SIZE); + l_blockId++) + { + // 3da + // Write the block id + l_err = nvdimmWriteReg(i_nvdimm, + BLOCK_ID, + l_blockId); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getLogPerUnit() nvdimm[%X] error writing reg 0x%X to 0x%X", + get_huid(i_nvdimm), BLOCK_ID, l_blockId ); + break; + } + + // 3db + // Read all the block data + for (uint16_t l_byteId = TYPED_BLOCK_DATA_BYTE0; + l_byteId < (TYPED_BLOCK_DATA_BYTE0 + VENDOR_BLOCK_DATA_BYTES); + l_byteId++) + { + uint8_t l_data = 0; + l_err = nvdimmReadReg(i_nvdimm, + l_byteId, + l_data); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getLogPerUnit() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), l_byteId); + break; + } + + // 3dc + o_unitData.push_back(l_data); + } // for byteId + + if (l_err) + { + break; + } + } // for blockId + + } while(0); + + return l_err; +} + + +/* + * @brief Calculate CRC + */ +uint16_t crc16(const uint8_t * i_data, int i_size) +{ + // From JEDEC JESD245B.01 document + // https://www.jedec.org/standards-documents/docs/jesd245a + int i, crc; + crc = 0; + while (--i_size >= 0) + { + crc = crc ^ (int)*i_data++ << 8; + for (i = 0; i < 8; ++i) + { + if (crc & 0x8000) + { + crc = crc << 1 ^ 0x1021; + } + else + { + crc = crc << 1; + } + } + } + return (crc & 0xFFFF); +} + + +/* + * @brief Get operational unit crc + */ +errlHndl_t getOperUnitCrc(TARGETING::Target* i_nvdimm, uint16_t& o_crc) +{ + errlHndl_t l_err = nullptr; + + do + { + // Get crc lsb + uint8_t l_lsb = 0; + l_err = nvdimmReadReg(i_nvdimm, + OPERATIONAL_UNIT_CRC0, + l_lsb); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getOperUnitCrc() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_CRC0); + break; + } + + // Get crc msb + uint8_t l_msb = 0; + l_err = nvdimmReadReg(i_nvdimm, + OPERATIONAL_UNIT_CRC1, + l_msb); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "getOperUnitCrc() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_CRC1); + break; + } + + o_crc = l_msb; + o_crc <<= 8; + o_crc += l_lsb; + + } while(0); + + return l_err; +} + + +/* + * @brief Compare host and nvdimm checksum + */ +errlHndl_t compareCksum(TARGETING::Target* i_nvdimm, + std::vector& i_unitData) +{ + // 3e) Compare checksum for unit retrieved + // 3ea) Write GENERATE_OPERATIONAL_UNIT_CKSUM + // to OPERATIONAL_UNIT_OPS_CMD + // 3eb) wait for NVDIMM_CMD_STATUS1 to return 0 + // 3ec) Read OPERATIONAL_UNIT_CRC1(MSB) and OPERATIONAL_UNIT_CRC0(LSB) + // 3ed) Calculate host checksum + // 3ee) return true if 3ec) == 3ed) + + errlHndl_t l_err = nullptr; + + do + { + // 3ea) + // Command the nvdimm to calculate the CRC on the unit + l_err = nvdimmWriteReg(i_nvdimm, + OPERATIONAL_UNIT_OPS_CMD, + GENERATE_OPERATIONAL_UNIT_CKSUM); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "compareCksum() nvdimm[%X] error writing reg 0x%X to 0x%X", + get_huid(i_nvdimm), OPERATIONAL_UNIT_OPS_CMD, + GENERATE_OPERATIONAL_UNIT_CKSUM ); + break; + } + + // 3eb) + // Wait for the command to finish + l_err = waitOperOpsComplete(i_nvdimm, + GENERATE_OPERATIONAL_UNIT_CKSUM); + if (l_err) + { + break; + } + + // 3ec) + // Read the HW CRC MSB + LSB + uint16_t l_nvdimmCrc = 0; + l_err = getOperUnitCrc(i_nvdimm, l_nvdimmCrc); + if (l_err) + { + break; + } + + // 3ed) + // Calculate the host checksum + uint8_t* l_hostData = reinterpret_cast(i_unitData.data()); + uint16_t l_hostCrc = crc16(l_hostData, i_unitData.size()); + + // 3ee) + if (l_hostCrc != l_nvdimmCrc) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "compareCksum() nvdimm[%X] compare cksum failed " + "hostCrc 0x%X nvdimmCrc 0x%X", + get_huid(i_nvdimm), l_hostCrc, l_nvdimmCrc); + /*@ + *@errortype + *@reasoncode NVDIMM_VENDOR_LOG_CKSUM_FAILED + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_COMPARE_CKSUM + *@userdata1 NVDIMM HUID + *@userdata2[0:31] HOST CRC + *@userdata2[32:63] NVDIMM CRC + *@devdesc NVDIMM vendor log checksum failed + *@custdesc NVDIMM logging error + */ + l_err = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_COMPARE_CKSUM, + NVDIMM_VENDOR_LOG_CKSUM_FAILED, + get_huid(i_nvdimm), + TWO_UINT32_TO_UINT64( + l_hostCrc, + l_nvdimmCrc), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace(NVDIMM_COMP_NAME); + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + } + + } while(0); + + return l_err; +} + + +/* + * @brief Add vendor log data to FFDC + * Added to all NVDIMM HW errors + */ +void nvdimmAddVendorLog( TARGETING::Target* i_nvdimm, errlHndl_t& io_err ) +{ + TRACFCOMP( g_trac_nvdimm, ENTER_MRK + "nvdimmAddVendorLog: Target huid 0x%.8X", + get_huid(i_nvdimm)); + + /* + 1) Read VENDOR_LOG_PAGE_SIZE. Multiply the return value with BLOCKSIZE + to get the total page size (LOG_PAGE_SIZE) + 2) Set TYPED_BLOCK_DATA to VENDOR_DATA_TYPE + 3) for (unit_id = 0; + unit_id < LOG_PAGE_LENGTH/VENDOR_LOG_UNIT_SIZE; + unit_id++) + 3a) write OPERATIONAL_UNIT_ID0 and OPERATIONAL_UNIT_ID1 with unit_id + 3b) set OPERATIONAL_UNIT_OPS_CMD to GET_OPERATIONAL_UNIT + 3c) wait for NVDIMM_CMD_STATUS1 to return 0 + 3d) for (block_id = 0; + block_id < VENDOR_LOG_UNIT_SIZE/BLOCKSIZE; + block_id++) + 3da) Write block_id to BLOCK_ID + 3db) Read TYPED_BLOCK_DATA_BYTE0 to TYPED_BLOCK_DATA_BYTE31 + 3dc) Save data to buffer + 3e) Compare checksum for unit retrieved + 3ea) Write GENERATE_OPERATIONAL_UNIT_CKSUM + to OPERATIONAL_UNIT_OPS_CMD + 3eb) wait for NVDIMM_CMD_STATUS1 to return 0 + 3ec) Read OPERATIONAL_UNIT_CRC1(MSB) and OPERATIONAL_UNIT_CRC0(LSB) + 3ed) Calculate host checksum + 3ee) return true if 3ec) == 3ed) + */ + + errlHndl_t l_err = nullptr; + + // Get the vendor log attribute + auto l_vendorLog = i_nvdimm->getAttr(); + + do + { + // If attr is set we are already in the process of + // reading the vendor log, exit + if (l_vendorLog) + { + break; + } + + // Set the vendor log attribute so we don't recursively + // execute the nvdimmAddVendorLog function + l_vendorLog = 0x1; + i_nvdimm->setAttr(l_vendorLog); + + uint8_t l_readData = 0; + std::vector l_fullData; + + // Step 1 + l_err = nvdimmReadReg(i_nvdimm, + VENDOR_LOG_PAGE_SIZE, + l_readData); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "nvdimmAddVendorLog() nvdimm[%X] error reading 0x%X", + get_huid(i_nvdimm), VENDOR_LOG_PAGE_SIZE); + break; + } + + size_t l_logPgeLength = l_readData * VENDOR_LOG_BLOCK_SIZE; + + // Step 2 + // Some weird bug here - switching directly to VENDOR_DATA_TYPE + // would not work. Need to switch to something else first + l_err = nvdimmWriteReg(i_nvdimm, + TYPED_BLOCK_DATA, + FIRMWARE_IMAGE_DATA); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "nvdimmAddVendorLog() nvdimm[%X] error writing 0x%X to 0x%X", + get_huid(i_nvdimm),TYPED_BLOCK_DATA, FIRMWARE_IMAGE_DATA ); + break; + } + + l_err = nvdimmWriteReg(i_nvdimm, + TYPED_BLOCK_DATA, + VENDOR_DATA_TYPE); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "nvdimmAddVendorLog() nvdimm[%X] error writing 0x%X to 0x%X", + get_huid(i_nvdimm),TYPED_BLOCK_DATA, VENDOR_DATA_TYPE ); + break; + } + + // Step 3 + // Loop through all the log units. + for (uint16_t l_unitId = 0; + l_unitId < (l_logPgeLength / VENDOR_LOG_UNIT_SIZE); + l_unitId++) + { + // Step 3a) - 3dc) + // Get one log unit + std::vector l_unitData; + l_err = getLogPerUnit(i_nvdimm, l_unitId, l_unitData); + if (l_err) + { + break; + } + + // Step 3e) - 3ee) + // Check the checksum for the entire log unit + l_err = compareCksum(i_nvdimm, l_unitData); + if (l_err) + { + break; + } + + // Append to full data + l_fullData.insert(l_fullData.end(), + l_unitData.begin(), + l_unitData.end()); + } + + if (l_err) + { + break; + } + + // Add NUL terminator to ascii data + l_fullData.push_back(0x00); + + // Add vendor data to error log as string + const char* l_fullChar = reinterpret_cast(l_fullData.data()); + ERRORLOG::ErrlUserDetailsStringSet l_stringSet; + l_stringSet.add("Vendor Log", l_fullChar); + l_stringSet.addToLog(io_err); + + // Change back to default + l_err = nvdimmWriteReg(i_nvdimm, + TYPED_BLOCK_DATA, + VENDOR_DEFAULT); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK + "nvdimmAddVendorLog() nvdimm[%X] error writing 0x%X to 0x%X", + get_huid(i_nvdimm),TYPED_BLOCK_DATA, VENDOR_DEFAULT ); + break; + } + + } while(0); + + if (l_err) + { + // FFDC error, set as informational + l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); + errlCommit( l_err, NVDIMM_COMP_ID ); + } + + // Clear the vendor log attribute before exiting + l_vendorLog = 0x0; + i_nvdimm->setAttr(l_vendorLog); + + TRACFCOMP( g_trac_nvdimm, EXIT_MRK + "nvdimmAddVendorLog: Target huid 0x%.8X", + get_huid(i_nvdimm)); +} + + /* * @brief Add Page 4 regs to FFDC * Added to all NVDIMM HW errors diff --git a/src/usr/isteps/nvdimm/nvdimm_update.C b/src/usr/isteps/nvdimm/nvdimm_update.C index 3c8426fba..9cda59f8c 100644 --- a/src/usr/isteps/nvdimm/nvdimm_update.C +++ b/src/usr/isteps/nvdimm/nvdimm_update.C @@ -400,6 +400,7 @@ errlHndl_t NvdimmInstalledImage::updateImage(NvdimmLidImage * i_lidImage) l_status.whole, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -574,6 +575,7 @@ errlHndl_t NvdimmInstalledImage::updateImage(NvdimmLidImage * i_lidImage) 0x0000), ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); // maybe some data was altered on the NV controller l_err->addPartCallout( iv_dimm, @@ -948,6 +950,7 @@ errlHndl_t NvdimmInstalledImage::updateImageData(NvdimmLidImage * i_lidImage) region, data_len), ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1018,6 +1021,7 @@ errlHndl_t NvdimmInstalledImage::changeFwUpdateMode(fw_update_mode i_mode) 0x00, 0x00), ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1101,6 +1105,7 @@ errlHndl_t NvdimmInstalledImage::waitFwOpsBlockReceived() ), ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 512 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1180,6 +1185,7 @@ errlHndl_t NvdimmInstalledImage::waitFwOpsComplete() ), ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1547,6 +1553,7 @@ errlHndl_t NvdimmInstalledImage::validateFwHeader() opsCmd.whole, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1597,6 +1604,7 @@ errlHndl_t NvdimmInstalledImage::commitFwRegion() opsCmd.whole, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1648,6 +1656,7 @@ errlHndl_t NvdimmInstalledImage::clearFwDataBlock() opsCmd.whole, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1698,6 +1707,7 @@ errlHndl_t NvdimmInstalledImage::validateFwImage() opsCmd.whole, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(iv_dimm, l_err); l_err->addPartCallout( iv_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -1984,6 +1994,7 @@ bool NvdimmsUpdate::runUpdate(void) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); l_err->collectTrace(NVDIMM_UPD, 256); + nvdimmAddVendorLog(l_nvdimm, l_err); l_err->addPartCallout( l_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); @@ -2203,6 +2214,8 @@ errlHndl_t NvdimmsUpdate::isUpdateNeeded(bool & o_update_needed, curType, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_UPD, 256 ); + nvdimmAddVendorLog(const_cast(l_dimm), + l_err); l_err->addPartCallout( l_dimm, HWAS::NV_CONTROLLER_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH ); diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C index d9ce45e5e..a9f630c74 100644 --- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C +++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C @@ -141,6 +141,7 @@ errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm, bool i_arm_timeout) ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); + nvdimmAddVendorLog(i_nvdimm, l_err); // Failure to arm could mean internal NV controller error or // even error on the battery pack. NVDIMM will lose persistency @@ -453,6 +454,7 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) // Read relevant regs for trace data nvdimmTraceRegs(l_nvdimm, l_RegInfo); nvdimmAddPage4Regs(l_nvdimm,l_err); + nvdimmAddVendorLog(l_nvdimm, l_err); // Add reg traces to the error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); @@ -849,6 +851,7 @@ bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, @@ -888,6 +891,7 @@ bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, @@ -964,6 +968,7 @@ bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) ES_LIFETIME_MINIMUM_REQUIREMENT), ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, @@ -1016,6 +1021,7 @@ bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, @@ -1480,6 +1486,7 @@ bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList) ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvDimm, l_err); // Collect the error errlCommit(l_err, NVDIMM_COMP_ID); -- cgit v1.2.1