diff options
Diffstat (limited to 'src/usr/isteps/nvdimm/nvdimm.C')
-rw-r--r-- | src/usr/isteps/nvdimm/nvdimm.C | 636 |
1 files changed, 448 insertions, 188 deletions
diff --git a/src/usr/isteps/nvdimm/nvdimm.C b/src/usr/isteps/nvdimm/nvdimm.C index db26eb184..3e0d712ff 100644 --- a/src/usr/isteps/nvdimm/nvdimm.C +++ b/src/usr/isteps/nvdimm/nvdimm.C @@ -40,6 +40,8 @@ #include <lib/dimm/ddr4/nvdimm_utils.H> #include <lib/mc/port.H> #include <isteps/nvdimm/nvdimmreasoncodes.H> +#include "errlud_nvdimm.H" +#include "nvdimmErrorLog.H" #include <isteps/nvdimm/nvdimm.H> #include <vpd/spdenums.H> #include <secureboot/trustedbootif.H> @@ -54,6 +56,7 @@ using namespace TARGETING; using namespace DeviceFW; using namespace EEPROM; +using namespace ERRORLOG; trace_desc_t* g_trac_nvdimm = NULL; TRAC_INIT(&g_trac_nvdimm, NVDIMM_COMP_NAME, 2*KILOBYTE); @@ -360,24 +363,24 @@ void nvdimmSetStatusFlag(Target *i_nvdimm, const uint8_t i_status_flag) switch(i_status_flag) { - // Make sure NSTD_VAL_PRSV (content preserved) is unset before setting NSTD_VAL_NOPRSV - // (data not preserved) or NSTD_ERR_NOPRSV (error preserving data) + // Make sure NSTD_VAL_ERROR (content preserved) is unset before setting NSTD_VAL_ERASED + // (data not preserved) or NSTD_VAL_SR_FAILED (error preserving data) case NSTD_ERR: - case NSTD_VAL_NOPRSV: - case NSTD_ERR_NOPRSV: - l_statusFlag &= NSTD_VAL_PRSV_MASK; + case NSTD_VAL_ERASED: + case NSTD_VAL_SR_FAILED: + l_statusFlag &= NSTD_VAL_ERROR_MASK; l_statusFlag |= i_status_flag; break; // If the content preserved(restore sucessfully), make sure - // NSTD_VAL_NOPRSV (not preserved) and NSTD_ERR_NOPRSV (error preserving) + // NSTD_VAL_ERASED (not preserved) and NSTD_VAL_SR_FAILED (error preserving) // are unset before setting this flag. - case NSTD_VAL_PRSV: - l_statusFlag &= (NSTD_VAL_NOPRSV_MASK & NSTD_ERR_NOPRSV_MASK); + case NSTD_VAL_ERROR: + l_statusFlag &= (NSTD_VAL_ERASED_MASK & NSTD_VAL_SR_FAILED_MASK); l_statusFlag |= i_status_flag; break; - case NSTD_ERR_NOBKUP: + case NSTD_VAL_DISARMED: l_statusFlag |= i_status_flag; break; @@ -407,7 +410,8 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmReady() HUID[%X]",get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; - uint8_t l_data = 0x0; + nvdimm_reg_t l_RegInfo; + uint8_t l_data; uint8_t l_nvm_init_time = 0; size_t l_numBytes = 1; @@ -456,6 +460,48 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) if ((l_data != NV_READY) && !l_err) { + + // Collect available status registers for error log + do + { + // Read and save NVDIMM_READY for traces + l_err = nvdimmReadReg(i_nvdimm, NVDIMM_READY, l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.NVDimm_Ready = l_data; + + // Read and save MODULE_HEALTH for traces + l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH, l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.Module_Health = l_data; + + // Read and save MODULE_HEALTH_STATUS0 for traces + l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS0, l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.Module_Health_Status0 = l_data; + + // Read and save MODULE_HEALTH_STATUS1 for traces + l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS1, l_data); + if(l_err) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + break; + } + l_RegInfo.Module_Health_Status1 = l_data; + + }while(0); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmReady() nvdimm[%X] - nvdimm not ready[%d]", get_huid(i_nvdimm), l_data); /*@ @@ -484,7 +530,12 @@ errlHndl_t nvdimmReady(Target *i_nvdimm) // a failing indication on the NV controller l_err->addPartCallout( i_nvdimm, HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + + // Add Register Traces to error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); } }while(0); @@ -616,12 +667,6 @@ errlHndl_t nvdimmPollStatus ( Target *i_nvdimm, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); - - // May have to move the error handling to the caller - // as different op could have different error severity - l_err->addPartCallout( i_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); } return l_err; @@ -646,9 +691,39 @@ errlHndl_t nvdimmPollBackupDone(Target* i_nvdimm, get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); l_err = nvdimmPollStatus ( i_nvdimm, SAVE, o_poll); + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + + /*@ + *@errortype + *@reasoncode NVDIMM_BACKUP_TIMEOUT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_POLL_BACKUP + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@devdesc Encountered timeout while performing NVDIMM Restore operation + *@custdesc NVDIMM timed out + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_POLL_BACKUP, + NVDIMM_BACKUP_TIMEOUT, + NVDIMM_SET_USER_DATA_1(SAVE, TARGETING::get_huid(i_nvdimm)), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + } + TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollBackupDone() nvdimm[%X]", get_huid(i_nvdimm)); @@ -673,15 +748,52 @@ errlHndl_t nvdimmPollRestoreDone(Target* i_nvdimm, get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); l_err = nvdimmPollStatus ( i_nvdimm, RESTORE, o_poll ); + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + + /*@ + *@errortype + *@reasoncode NVDIMM_RESTORE_TIMEOUT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_POLL_RESTORE + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@devdesc Encountered timeout while performing NVDIMM Restore operation + *@custdesc NVDIMM timed out + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_POLL_RESTORE, + NVDIMM_RESTORE_TIMEOUT, + NVDIMM_SET_USER_DATA_1(RESTORE, TARGETING::get_huid(i_nvdimm)), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // May have to move the error handling to the caller + // as different op could have different error severity + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + } + TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollRestoreDone() nvdimm[%X]", get_huid(i_nvdimm)); return l_err; } + /** * @brief This function polls the command status register for erase * completion (does not indicate success or fail) @@ -701,7 +813,31 @@ errlHndl_t nvdimmPollEraseDone(Target* i_nvdimm, errlHndl_t l_err = nullptr; - l_err = nvdimmPollStatus ( i_nvdimm, ERASE, o_poll); + l_err = nvdimmPollStatus( i_nvdimm, ERASE, o_poll); + + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + + /*@ + *@errortype + *@reasoncode NVDIMM_ERASE_TIMEOUT + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_POLL_ERASE + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@devdesc Encountered timeout while performing NVDIMM Restore operation + *@custdesc NVDIMM timed out + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_POLL_ERASE, + NVDIMM_ERASE_TIMEOUT, + NVDIMM_SET_USER_DATA_1(ERASE, TARGETING::get_huid(i_nvdimm)), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollEraseDone() nvdimm[%X]", get_huid(i_nvdimm)); @@ -729,7 +865,11 @@ errlHndl_t nvdimmPollESChargeStatus(Target* i_nvdimm, errlHndl_t l_err = nullptr; - l_err = nvdimmPollStatus ( i_nvdimm, CHARGE, o_poll ); + l_err = nvdimmPollStatus( i_nvdimm, CHARGE, o_poll ); + + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollESChargeDone() nvdimm[%X]", get_huid(i_nvdimm)); @@ -781,7 +921,8 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; - uint8_t l_data; + uint8_t l_data = 0x0; + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); do { @@ -790,7 +931,7 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOBKUP); + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_DISARMED); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmSetESPolicy() nvdimm[%X]" "failed to write ES register!",get_huid(i_nvdimm)); break; @@ -804,13 +945,13 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOBKUP); + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_DISARMED); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmSetESPolicy() nvdimm[%X]" "failed to read ES register!",get_huid(i_nvdimm)); break; } - if ((l_data & ES_SUCCESS) != ES_SUCCESS) + if (((l_data & ES_SUCCESS) != ES_SUCCESS) || ((l_data & ES_POLICY_ERROR) == ES_POLICY_ERROR)) { TRACFCOMP(g_trac_nvdimm, EXIT_MRK"NDVIMM HUID[%X], nvdimmSetESPolicy() " "failed!",get_huid(i_nvdimm)); @@ -837,14 +978,11 @@ errlHndl_t nvdimmSetESPolicy(Target* i_nvdimm) l_err->collectTrace(NVDIMM_COMP_NAME); - // Failure setting the energy source policy could mean error on the - // battery or even the cabling - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_CABLE_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); + // Read relevant regs for trace data + nvdimmTraceRegs(i_nvdimm, l_RegInfo); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); } }while(0); @@ -938,12 +1076,12 @@ errlHndl_t nvdimmValidImage(Target *i_nvdimm, bool &o_imgValid) * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. */ -errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) +errlHndl_t nvdimmRestore(TargetHandleList& i_nvdimmList, uint8_t &i_mpipl) { errlHndl_t l_err = nullptr; - bool l_imgValid; uint8_t l_rstrValid; uint32_t l_poll = 0; + TargetHandleList l_nvdimmList = i_nvdimmList; do { @@ -952,23 +1090,7 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) it != i_nvdimmList.end();) { // Default state during boot is unarmed, therefore not preserved - nvdimmSetStatusFlag(*it, NSTD_ERR_NOBKUP); - - l_err = nvdimmValidImage(*it, l_imgValid); - - // No reason to run if we can't figure out - // if there is an image or not - if (l_err) - { - break; - } - - if (!l_imgValid) - { - nvdimmSetStatusFlag(*it, NSTD_VAL_NOPRSV); - i_nvdimmList.erase(it); - continue; - } + nvdimmSetStatusFlag(*it, NSTD_VAL_DISARMED); TargetHandleList l_mcaList; getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA); @@ -987,13 +1109,6 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore() HUID[%X] i_mpipl[%u] failed to de-assert resetn!", get_huid(*it), i_mpipl); - - nvdimmSetStatusFlag(*it, NSTD_ERR_NOPRSV); - //@TODO RTC 199645 - add HW callout on dimm target - // If we failed to de-assert reset_n, the dimm is pretty much useless. - // Let's not restore if that happens - // The callout will be added inside the HWP - // Leaving this comment here as a reminder, will remove later break; } @@ -1009,7 +1124,7 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) get_huid(*it), i_mpipl); l_err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); - l_err->collectTrace(NVDIMM_COMP_NAME, 256); + l_err->collectTrace( NVDIMM_COMP_NAME ); ERRORLOG::errlCommit(l_err, NVDIMM_COMP_ID); } @@ -1022,12 +1137,6 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore() HUID[%X] self_refresh_entry failed!", get_huid(*it)); - - nvdimmSetStatusFlag(*it, NSTD_ERR_NOPRSV); - //@TODO RTC 199645 - add HW callout on dimm target - // Without SRE the data could be not reliably restored - // The callout will be added inside the HWP - // Leaving this comment here as a reminder, will remove later break; } it++; @@ -1050,7 +1159,6 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_FUNC_CMD, RESTORE_IMAGE); if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X], error initiating restore!!", get_huid(l_nvdimm)); break; @@ -1071,10 +1179,8 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) l_err = nvdimmPollRestoreDone(l_nvdimm, l_poll); if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X], error restoring!", get_huid(l_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } } @@ -1084,22 +1190,22 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) break; } - // Make sure the restore is valid + // Check for restore errors for (const auto & l_nvdimm : i_nvdimmList) { l_err = nvdimmGetRestoreValid(l_nvdimm, l_rstrValid); if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore Target[%X] error validating restore status!", get_huid(l_nvdimm)); break; } - if ((l_rstrValid & RSTR_SUCCESS) != RSTR_SUCCESS){ + if ((l_rstrValid & RSTR_ERROR) == RSTR_ERROR) + { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] restoreValid[%d], restore failed!", - get_huid(l_nvdimm), l_rstrValid); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] restore failed due to errors", + get_huid(l_nvdimm)); /*@ *@errortype *@reasoncode NVDIMM_RESTORE_FAILED @@ -1119,28 +1225,19 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) get_huid(l_nvdimm), 0x0, ERRORLOG::ErrlEntry::NO_SW_CALLOUT); - - l_err->collectTrace(NVDIMM_COMP_NAME); - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); - - // Invalid restore could be due to dram not in self-refresh - // or controller issue. Data should not be trusted at this point - l_err->addPartCallout( l_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); break; } } if (l_err) { + TRACFCOMP(g_trac_nvdimm, "restore encountered an error"); break; } // Exit self-refresh for (const auto & l_nvdimm : i_nvdimmList) { - TargetHandleList l_mcaList; getParentAffinityTargets(l_mcaList, l_nvdimm, CLASS_UNIT, TYPE_MCA); assert(l_mcaList.size(), "nvdimmRestore() failed to find parent MCA."); @@ -1155,21 +1252,25 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmRestore() HUID[%X] post_restore_transition failed!", get_huid(l_nvdimm)); - - // Commit the error from the HWP - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); break; } else { // Restore success! - nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_PRSV); + // Remove dimm from list for error handling + i_nvdimmList.erase(i_nvdimmList.begin()); } } + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() HUID[%X] encounrterd an error during restore"); + break; + } + if (i_mpipl) { - for (const auto & l_nvdimm : i_nvdimmList) + for (const auto & l_nvdimm : l_nvdimmList) { TargetHandleList l_mcaList; errlHndl_t err = nullptr; @@ -1188,7 +1289,7 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) get_huid(l_nvdimm), i_mpipl); err->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL); - err->collectTrace(NVDIMM_COMP_NAME, 256); + err->collectTrace( NVDIMM_COMP_NAME ); ERRORLOG::errlCommit(err, NVDIMM_COMP_ID); } } @@ -1203,68 +1304,42 @@ errlHndl_t nvdimmRestore(TargetHandleList i_nvdimmList, uint8_t &i_mpipl) #endif /** - * @brief This function checks the erase status register to make sure - * the last erase completed witout error + * @brief This function checks the status and success of an erase * * @param[in] i_nvdimm - nvdimm target with NV controller * * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. */ -errlHndl_t nvdimmCheckEraseSuccess(Target *i_nvdimm) +errlHndl_t nvdimmEraseCheck(Target *i_nvdimm) { - TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckEraseSuccess() : nvdimm[%X]", - get_huid(i_nvdimm)); - - uint8_t l_data = 0; errlHndl_t l_err = nullptr; + nvdimm_reg_t l_RegInfo; - l_err = nvdimmReadReg(i_nvdimm, ERASE_STATUS, l_data); + // Erase happens one module at a time. No need to set any offset on the counter + uint32_t l_poll = 0; + l_err = nvdimmPollEraseDone(i_nvdimm, l_poll); + // Add part callout, currently all erase calls have same callout + // Dump traces to the error log if error exists if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckEraseSuccess() nvdimm[%X]" - "failed to read erase status reg!",get_huid(i_nvdimm)); - } - else if ((l_data & ERASE_SUCCESS) != ERASE_SUCCESS) - { + // For both Erase timeout and Erase fail + // Callout nvdimm on high, gard and deconfig + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckEraseSuccess() nvdimm[%X]" - "failed to erase!",get_huid(i_nvdimm)); - /*@ - *@errortype - *@reasoncode NVDIMM_ERASE_FAILED - *@severity ERRORLOG_SEV_PREDICTIVE - *@moduleid NVDIMM_CHECK_ERASE - *@userdata1[0:31] Related ops (0xff = NA) - *@userdata1[32:63] Target Huid - *@userdata2 <UNUSED> - *@devdesc Encountered error erasing previously stored data image - * on NVDIMM. Likely due to timeout and/or controller error - *@custdesc NVDIMM error erasing data image - */ - l_err = new ERRORLOG::ErrlEntry( - ERRORLOG::ERRL_SEV_PREDICTIVE, - NVDIMM_CHECK_ERASE, - NVDIMM_ERASE_FAILED, - NVDIMM_SET_USER_DATA_1(ERASE, get_huid(i_nvdimm)), - 0x0, - ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); - l_err->collectTrace(NVDIMM_COMP_NAME); - errlCommit( l_err, NVDIMM_COMP_ID ); + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); - // Failure to erase could mean internal NV controller error and/or - // HW error on nand flash. NVDIMM will lose persistency if failed to - // erase nand flash - l_err->addPartCallout( i_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); } - TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckEraseSuccess(): nvdimm[%X] ret[%X]", - get_huid(i_nvdimm), l_data); - return l_err; } @@ -1293,13 +1368,8 @@ errlHndl_t nvdimmEraseNF(Target *i_nvdimm) break; } - // Erase happens one module at a time. No need to set any offset on the counter - uint32_t l_poll = 0; - l_err = nvdimmPollEraseDone(i_nvdimm, l_poll); - if (!l_err) - { - l_err = nvdimmCheckEraseSuccess(i_nvdimm); - } + // Poll for success and check status + l_err = nvdimmEraseCheck(i_nvdimm); }while(0); @@ -1525,7 +1595,7 @@ errlHndl_t nvdimmEpowSetup(TargetHandleList &i_nvdimmList) TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmEpowSetup() HUID[%X] failed to setup epow!", get_huid(*it)); - nvdimmSetStatusFlag(*it, NSTD_ERR_NOPRSV); + nvdimmSetStatusFlag(*it, NSTD_VAL_SR_FAILED); break; } it++; @@ -1547,32 +1617,46 @@ errlHndl_t nvdimmEpowSetup(TargetHandleList &i_nvdimmList) * @param[in] i_nvdimmList - list of nvdimm targets * */ -void nvdimm_restore(TargetHandleList &i_nvdimmList) +errlHndl_t nvdimm_restore(TargetHandleList &i_nvdimmList) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimm_restore()"); + errlHndl_t l_err = nullptr; - Target* l_sys = nullptr; - targetService().getTopLevelTarget( l_sys ); + bool l_valid = false; + bool l_exit = false; + TARGETING::Target* l_sys = nullptr; + TARGETING::targetService().getTopLevelTarget( l_sys ); assert(l_sys, "nvdimm_restore: no TopLevelTarget"); uint8_t l_mpipl = l_sys->getAttr<ATTR_IS_MPIPL_HB>(); + nvdimm_reg_t l_RegInfo = nvdimm_reg_t(); + TargetHandleList l_nvdimmList = i_nvdimmList; + uint8_t l_rstrValid; do { - // Set the energy policy to device-managed - // Don't think this is needed for the supercaps to start charging - // but do it anyway to get the charging going for (const auto & l_nvdimm : i_nvdimmList) { - l_err = nvdimmSetESPolicy(l_nvdimm); + // Check for a valid image + l_err = nvdimmValidImage( l_nvdimm, l_valid ); if (l_err) { - // Failing this is an indication of power pack issue. - // This will prevent future backup, but let's continue - // since we can still restore the data if there is any - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() - Failing nvdimmSetESPolicy()"); - errlCommit( l_err, NVDIMM_COMP_ID ); + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] restore failed to read the image", get_huid(l_nvdimm)); + errlCommit(l_err, NVDIMM_COMP_ID); } + + if (!l_valid) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] restore failed due to invalid image", get_huid(l_nvdimm)); + // Set ATTR NV STATUS FLAG to Erased + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERASED); + break; + } + + } + + if (!l_valid) + { + break; } if (l_mpipl) @@ -1586,7 +1670,7 @@ void nvdimm_restore(TargetHandleList &i_nvdimmList) if (l_err) { - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOPRSV); + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERASED); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() nvdimm[%X], error backing up the DRAM!", get_huid(l_nvdimm)); errlCommit(l_err, NVDIMM_COMP_ID); @@ -1596,31 +1680,77 @@ void nvdimm_restore(TargetHandleList &i_nvdimmList) } // Start the restore - l_err = nvdimmRestore(i_nvdimmList, l_mpipl); + l_err = nvdimmRestore(l_nvdimmList, l_mpipl); + // Check if restore completed successfully if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_restore() - Failing nvdimmRestore()"); - errlCommit( l_err, NVDIMM_COMP_ID ); + const auto l_nvdimm = l_nvdimmList.front(); + + TRACFCOMP(g_trac_nvdimm, "nvdimm_restore() - Failing nvdimmRestore()"); + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_SR_FAILED); + + // Invalid restore could be due to dram not in self-refresh + // or controller issue. Data should not be trusted at this point + l_err->addPartCallout( l_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( l_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); break; } - // Make sure the energy source is fully charged before erasing the images - // Doing this on all the nvdimms since the ones w/o image will need - // to be fully charged before arming the trigger - uint32_t l_poll = 0; + // Check health status registers and exit if required for (const auto & l_nvdimm : i_nvdimmList) { - l_err = nvdimmPollESChargeStatus(l_nvdimm, l_poll); + l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_RESTORE, l_exit ); - if (l_err){ - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP); - errlCommit( l_err, NVDIMM_COMP_ID ); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore() nvdimm[%X] failed during health status check", get_huid(l_nvdimm)); + if (l_exit) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + } + else + { + // Redundant check with external err bugged + errlCommit( l_err, NVDIMM_COMP_ID ); + return l_err; + } } + + // Make sure the restore is valid + l_err = nvdimmGetRestoreValid(l_nvdimm, l_rstrValid); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmRestore Target[%X] error validating restore status!", + get_huid(l_nvdimm)); + break; + } + + if ((l_rstrValid & RSTR_SUCCESS) == RSTR_SUCCESS) + { + // Restore success! + nvdimmSetStatusFlag(l_nvdimm, NSTD_VAL_ERROR); + } + } }while(0); + // Return err not being handled, temp commit: + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + } + // At the end, pre-load CCS with commands for EPOW. This will stage the CCS // with the require commands to trigger the save on NVDIMMs. The actual // triggering will be done by OCC when EPOW is detected. @@ -1633,6 +1763,7 @@ void nvdimm_restore(TargetHandleList &i_nvdimmList) } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimm_restore()"); + return l_err; } /** @@ -1733,12 +1864,16 @@ errlHndl_t nvdimm_factory_reset(Target *i_nvdimm) * @param[in] i_nvdimm - nvdimm target * */ -void nvdimm_init(Target *i_nvdimm) +errlHndl_t nvdimm_init(Target *i_nvdimm) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimm_init() nvdimm[%X]", get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; + bool l_continue = true; + uint8_t l_data = 0; + nvdimm_reg_t l_RegInfo; + uint32_t l_poll = 0; do { @@ -1759,6 +1894,15 @@ void nvdimm_init(Target *i_nvdimm) } } + // Set ATTR_NV_STATUS_FLAG to default disarmed state + l_err = notifyNvdimmProtectionChange(i_nvdimm, NVDIMM_DISARMED); + if (l_err) + { + nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR); + errlCommit(l_err, NVDIMM_COMP_ID); + } + + // Check if the nvdimm ready status l_err = nvdimmReady(i_nvdimm); if (l_err) @@ -1766,7 +1910,6 @@ void nvdimm_init(Target *i_nvdimm) nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], controller not ready", get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } @@ -1777,46 +1920,163 @@ void nvdimm_init(Target *i_nvdimm) nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR); TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], error retrieving timeout values", get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } - //Check save progress - uint32_t l_poll = 0; - l_err = nvdimmPollBackupDone(i_nvdimm, l_poll); + // Check for Erase in progress and its status + l_err = nvdimmEraseCheck(i_nvdimm); + if (l_err) + { + break; + } + // Check NO_RESET_N bit for power loss without save + l_err = nvdimmReadReg ( i_nvdimm, CSAVE_FAIL_INFO1, l_data); if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOPRSV); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], error backing up the DRAM!", - get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); break; } + else if ((l_data & NO_RESET_N) == NO_RESET_N) + { + // Set ATTR_NV_STATUS_FLAG to restored, as data may persist + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmInit() nvdimm[%X]" + "failed to save due to power loss!",get_huid(i_nvdimm)); + /*@ + *@errortype + *@reasoncode NVDIMM_POWER_SAVE_FAILURE + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_CHECK_RESETN + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@userdata2 <UNUSED> + *@devdesc Encountered error erasing previously stored data image + * on NVDIMM. Likely due to timeout and/or controller error + *@custdesc NVDIMM error erasing data image + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_CHECK_RESETN, + NVDIMM_POWER_SAVE_FAILURE, + NVDIMM_SET_USER_DATA_1(l_data, get_huid(i_nvdimm)), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Failure to erase could mean internal NV controller error and/or + // HW error on nand flash. NVDIMM will lose persistency if failed to + // erase nand flash + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_LOW); - // Unlock encryption if enabled - TargetHandleList l_nvdimmTargetList; - l_nvdimmTargetList.push_back(i_nvdimm); - NVDIMM::nvdimm_encrypt_unlock(l_nvdimmTargetList); + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); - // Disarm the ddr_resetn here in case it came in armed. When the nvdimm is - // armed the reset_n is masked off from the host, meaning the drams won't - // be able to get reset properly later, causing training to fail. - l_err = nvdimmChangeArmState(i_nvdimm, DISARM_TRIGGER); + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + errlCommit(l_err, NVDIMM_COMP_ID); + } + else + { + // Check save progress + l_err = nvdimmPollBackupDone(i_nvdimm, l_poll); + if (l_err) + { + // May have to move the error handling to the caller + // as different op could have different error severity + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_int() nvdimm[%X], error backing up the DRAM!", + get_huid(i_nvdimm)); + break; + } + } + + // Check CSAVE_ERROR Register + l_err = nvdimmReadReg( i_nvdimm, CSAVE_FAIL_INFO0, l_data ); if (l_err) { - nvdimmSetStatusFlag(i_nvdimm, NSTD_ERR_NOPRSV); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_init() nvdimm[%X], error disarming the nvdimm!", - get_huid(i_nvdimm)); - errlCommit(l_err, NVDIMM_COMP_ID); + break; + } + else if (l_data != ZERO) + { + /*@ + *@errortype + *@reasoncode NVDIMM_CSAVE_ERROR + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_CHECK_CSAVE + *@userdata1[0:31] Related ops (0xff = NA) + *@userdata1[32:63] Target Huid + *@userdata2 <UNUSED> + *@devdesc Encountered error saving during catastrophic save + * on NVDIMM. Check error register trace for details + *@custdesc NVDIMM error during Catastrophic Save + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_CHECK_CSAVE, + NVDIMM_CSAVE_ERROR, + NVDIMM_SET_USER_DATA_1(l_data, get_huid(i_nvdimm)), + 0x0, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Collect register data for FFDC Traces + nvdimmTraceRegs ( i_nvdimm, l_RegInfo ); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + + // Check if the image is still valid + if ( l_RegInfo.CSave_Info != VALID_IMAGE ) + { + // Callout and gard dimm if image is not valid + l_err->addPartCallout( i_nvdimm, + HWAS::NV_CONTROLLER_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Fatal); + } + else + { + // Set ATTR_NV_STATUS_FLAG to Restored as data might persist + nvdimmSetStatusFlag(i_nvdimm, NSTD_VAL_ERROR); + errlCommit(l_err, NVDIMM_COMP_ID); + } + break; + } + + // Check Health Status Registers + l_err = nvdimmHealthStatusCheck(i_nvdimm, HEALTH_SAVE, l_continue); + if(!l_continue) + { break; } + // Unlock encryption if enabled + TargetHandleList l_nvdimmTargetList; + l_nvdimmTargetList.push_back(i_nvdimm); + NVDIMM::nvdimm_encrypt_unlock(l_nvdimmTargetList); + }while(0); TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimm_init() nvdimm[%X]", get_huid(i_nvdimm)); + + // Return err not being handled, temp commit: + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + } + + + return l_err; } |