/* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ /* $Source: src/usr/isteps/nvdimm/runtime/nvdimm_rt.C $ */ /* */ /* OpenPOWER HostBoot Project */ /* */ /* Contributors Listed Below - COPYRIGHT 2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ /* You may obtain a copy of the License at */ /* */ /* http://www.apache.org/licenses/LICENSE-2.0 */ /* */ /* Unless required by applicable law or agreed to in writing, software */ /* distributed under the License is distributed on an "AS IS" BASIS, */ /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ /* implied. See the License for the specific language governing */ /* permissions and limitations under the License. */ /* */ /* IBM_PROLOG_END_TAG */ /** * @file nvdimm_rt.C * * @brief NVDIMM functions only needed for runtime. These functions include * but are not limited to arming/disarming the NVDIMM along with methods * to poll the arming and check the status of the arming. Checking the * error state of the NVDIMM, getting a random number with the darn * instruction and checking the ES or NVM health status. */ /// BPM - Backup Power Module #include #include #include #include #include #include #include #include #include #include #include #include #include "../errlud_nvdimm.H" #include "../nvdimmErrorLog.H" #include // implements some of these #include "../nvdimm.H" // for g_trac_nvdimm //#define TRACUCOMP(args...) TRACFCOMP(args) #define TRACUCOMP(args...) 
using namespace TARGETING; using namespace ERRORLOG; namespace NVDIMM { static constexpr uint64_t DARN_ERROR_CODE = 0xFFFFFFFFFFFFFFFFull; static constexpr uint32_t MAX_DARN_ERRORS = 10; static constexpr uint8_t FW_OPS_UPDATE = 0x04; /** * @brief This function polls the command status register for arm completion * (does not indicate success or fail) * * @param[in] i_nvdimm - nvdimm target with NV controller * * @param[out] o_poll - total polled time in ms * * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. */ errlHndl_t nvdimmPollArmDone(Target* i_nvdimm, uint32_t &o_poll) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmPollArmDone() nvdimm[%X]", get_huid(i_nvdimm) ); errlHndl_t l_err = nullptr; l_err = nvdimmPollStatus ( i_nvdimm, ARM, o_poll); TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollArmDone() nvdimm[%X]", get_huid(i_nvdimm)); return l_err; } /** * @brief This function checks the arm status register to make sure * the trigger has been armed to ddr_reset_n * * @param[in] i_nvdimm - nvdimm target with NV controller * @param[in] i_arm_timeout - nvdimm local timeout status * * @return errlHndl_t - Null if successful, otherwise a pointer to * the error log. 
*/ errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm, bool i_arm_timeout) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckArmSuccess() nvdimm[%X]", get_huid(i_nvdimm)); errlHndl_t l_err = nullptr; uint8_t l_data = 0; l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]" "failed to read arm status reg!",get_huid(i_nvdimm)); } else if (((l_data & ARM_ERROR) == ARM_ERROR) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED) || i_arm_timeout) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]" "failed to arm!",get_huid(i_nvdimm)); /*@ *@errortype *@reasoncode NVDIMM_ARM_FAILED *@severity ERRORLOG_SEV_PREDICTIVE *@moduleid NVDIMM_SET_ARM *@userdata1[0:31] Related ops (0xff = NA) *@userdata1[32:63] Target Huid *@userdata2 *@devdesc Encountered error arming the catastrophic save * trigger on NVDIMM. Make sure an energy source * is connected to the NVDIMM and the ES policy * is set properly *@custdesc NVDIMM encountered error arming save trigger */ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, NVDIMM_SET_ARM, NVDIMM_ARM_FAILED, TWO_UINT32_TO_UINT64(ARM, get_huid(i_nvdimm)), 0x0, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); nvdimmAddVendorLog(i_nvdimm, l_err); // Failure to arm could mean internal NV controller error or // even error on the battery pack. 
NVDIMM will lose persistency // if failed to arm trigger l_err->addHwCallout( i_nvdimm, HWAS::SRCI_PRIORITY_HIGH, HWAS::NO_DECONFIG, HWAS::GARD_Fatal); } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckArmSuccess() nvdimm[%X] ret[%X]", get_huid(i_nvdimm), l_data); return l_err; } bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) { bool o_arm_successful = true; bool l_continue = true; bool l_arm_timeout = false; uint8_t l_data; uint8_t l_ready; uint8_t l_fwupdate; auto l_RegInfo = nvdimm_reg_t(); uint64_t l_writeData; uint32_t l_writeAddress; size_t l_writeSize = sizeof(l_writeData); TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() %d", i_nvdimmTargetList.size()); errlHndl_t l_err = nullptr; errlHndl_t l_err_t = nullptr; // Prerequisite Arm Checks for (auto const l_nvdimm : i_nvdimmTargetList) { do { // Read out the Module Health status register l_err = nvdimmReadReg(l_nvdimm, MODULE_HEALTH_STATUS0, l_data); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed to read Module Health Status", get_huid(l_nvdimm)); errlCommit( l_err, NVDIMM_COMP_ID ); l_continue = false; break; } // Read out the NVDimm Ready register l_err = nvdimmReadReg(l_nvdimm, NVDIMM_READY, l_ready); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed to read NVDimm Ready register", get_huid(l_nvdimm)); errlCommit( l_err, NVDIMM_COMP_ID ); l_continue = false; break; } // Read out the FW OPs Status register l_err = nvdimmReadReg(l_nvdimm, FIRMWARE_OPS_STATUS, l_fwupdate); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed to read Firmware OPs Status register", get_huid(l_nvdimm)); errlCommit( l_err, NVDIMM_COMP_ID ); l_continue = false; } }while(0); // Check ARM pre-requisites if ((!l_continue) || (l_data & NVM_LIFETIME_ERROR) || (l_ready != NV_READY) || (l_fwupdate & FW_OPS_UPDATE)) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed NVDimm Arm prechecks", get_huid(l_nvdimm)); // Disarming all dimms due to 
error nvdimmDisarm(i_nvdimmTargetList); l_err = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); if (l_err) { errlCommit( l_err, NVDIMM_COMP_ID ); } return false; } } // Mask MBACALFIR EventN to separate ARM handling for (TargetHandleList::iterator it = i_nvdimmTargetList.begin(); it != i_nvdimmTargetList.end();) { TargetHandleList l_mcaList; getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA); assert(l_mcaList.size(), "nvdimmArm() failed to find parent MCA."); l_writeAddress = MBACALFIR_OR_MASK_REG; l_writeData = MBACALFIR_EVENTN_OR_BIT; l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize, DEVICE_SCOM_ADDRESS(l_writeAddress)); if(l_err) { TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed", l_writeAddress); errlCommit( l_err, NVDIMM_COMP_ID ); } it++; } for (auto const l_nvdimm : i_nvdimmTargetList) { l_arm_timeout = false; // skip if the nvdimm is already armed ATTR_NVDIMM_ARMED_type l_armed_state = {}; l_armed_state = l_nvdimm->getAttr(); if (l_armed_state.armed) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] called when already armed", get_huid(l_nvdimm)); continue; } // Set ES Policy, contains all of its status checks l_err = nvdimmSetESPolicy(l_nvdimm); if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set ES Policy", get_huid(l_nvdimm)); o_arm_successful = false; nvdimmDisarm(i_nvdimmTargetList); l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); if (l_err_t) { errlCommit( l_err_t, NVDIMM_COMP_ID ); } // Committing the error as we don't want this to interrupt // the boot. 
This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); // Callout the nvdimm on high and gard l_err->addHwCallout( l_nvdimm, HWAS::SRCI_PRIORITY_HIGH, HWAS::NO_DECONFIG, HWAS::GARD_Fatal); errlCommit( l_err, NVDIMM_COMP_ID ); break; } // Clear all CMD error status registers l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_MGT_CMD0, ARM_CLEAR_ALL); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - error clearing all status registers", get_huid(l_nvdimm)); break; } l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER); // If we run into any error here we will just // commit the error log and move on. Let the // system continue to boot and let the user // salvage the data if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm)); l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); if (l_err_t) { errlCommit( l_err_t, NVDIMM_COMP_ID ); } // Committing the error as we don't want this to interrupt // the boot. This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit( l_err, NVDIMM_COMP_ID ); o_arm_successful = false; continue; } // Arm happens one module at a time. No need to set any offset on the counter uint32_t l_poll = 0; l_err = nvdimmPollArmDone(l_nvdimm, l_poll); if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm)); l_arm_timeout = true; l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); if (l_err_t) { errlCommit( l_err_t, NVDIMM_COMP_ID ); } // Committing the error as we don't want this to interrupt // the boot. 
This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit( l_err, NVDIMM_COMP_ID ); o_arm_successful = false; } // Check health status registers and exit if required l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_arm_timeout ); // Check for health status failure if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm)); // The arm timeout variable is used here as the continue variable for the // health status check. This was done to include the timeout for use in the check // If true either the arm timed out with a health status fail or the // health status check failed with another disarm and exit condition if (l_arm_timeout) { errlCommit( l_err, NVDIMM_COMP_ID ); // Disarming all dimms due to error nvdimmDisarm(i_nvdimmTargetList); o_arm_successful = false; break; } else { errlCommit( l_err, NVDIMM_COMP_ID ); continue; } } l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout); if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm", get_huid(l_nvdimm)); // Disarming all dimms due to error nvdimmDisarm(i_nvdimmTargetList); l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); if (l_err_t) { errlCommit( l_err_t, NVDIMM_COMP_ID ); } // Committing the error as we don't want this to interrupt // the boot. 
This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); // Dump Traces for error logs nvdimmTraceRegs( l_nvdimm, l_RegInfo ); nvdimmAddPage4Regs(l_nvdimm,l_err); // Add reg traces to the error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); errlCommit(l_err, NVDIMM_COMP_ID); o_arm_successful = false; break; } // After arming the trigger, erase the image to prevent the possible // stale image getting the restored on the next boot in case of failed // save. l_err = nvdimmEraseNF(l_nvdimm); if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to erase post arm", get_huid(l_nvdimm)); // Disarming all dimms due to error nvdimmDisarm(i_nvdimmTargetList); l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); if (l_err_t) { errlCommit( l_err_t, NVDIMM_COMP_ID ); } // Committing the error as we don't want this to interrupt // the boot. This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit( l_err, NVDIMM_COMP_ID ); o_arm_successful = false; // If the erase failed let's disarm the trigger l_err = nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X], error disarming the nvdimm!", get_huid(l_nvdimm)); l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); } break; } // Arm successful, update armed status l_err = NVDIMM::notifyNvdimmProtectionChange(l_nvdimm, NVDIMM::NVDIMM_ARMED); if (l_err) { l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); } // Enable Persistency and Warning Threshold notifications l_err = nvdimmWriteReg(l_nvdimm, SET_EVENT_NOTIFICATION_CMD, ENABLE_NOTIFICATIONS); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] 
setting persistency notification", TARGETING::get_huid(l_nvdimm)); break; } // Check notification status and errors l_err = nvdimmReadReg(l_nvdimm, SET_EVENT_NOTIFICATION_STATUS, l_data); if (l_err) { break; } else if (((l_data & SET_EVENT_NOTIFICATION_ERROR) == SET_EVENT_NOTIFICATION_ERROR) || ((l_data & NOTIFICATIONS_ENABLED) != NOTIFICATIONS_ENABLED)) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set event notification", get_huid(l_nvdimm)); // Set NVDIMM Status flag to partial working, as error detected but data might persist notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_RISKY_HW_ERROR); /*@ *@errortype *@reasoncode NVDIMM_SET_EVENT_NOTIFICATION_ERROR *@severity ERRORLOG_SEV_PREDICTIVE *@moduleid NVDIMM_SET_EVENT_NOTIFICATION *@userdata1[0:31] Target Huid *@userdata2 *@devdesc NVDIMM threw an error or failed to set event * notifications during arming *@custdesc NVDIMM failed to enable event notificaitons */ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, NVDIMM_SET_EVENT_NOTIFICATION, NVDIMM_SET_EVENT_NOTIFICATION_ERROR, TARGETING::get_huid(l_nvdimm), 0x0, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace( NVDIMM_COMP_NAME ); // Callout the dimm l_err->addHwCallout( l_nvdimm, HWAS::SRCI_PRIORITY_LOW, HWAS::NO_DECONFIG, HWAS::GARD_NULL); // Read relevant regs for trace data nvdimmTraceRegs(l_nvdimm, l_RegInfo); nvdimmAddPage4Regs(l_nvdimm,l_err); nvdimmAddVendorLog(l_nvdimm, l_err); // Add reg traces to the error log NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); errlCommit( l_err, NVDIMM_COMP_ID ); break; } // Re-check health status registers l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_POST_ARM, l_continue ); // Check for health status failure if (l_err) { TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed final health status check", get_huid(l_nvdimm)); errlCommit( l_err, NVDIMM_COMP_ID ); o_arm_successful = false; break; } } // Check for uncommited i2c fail error logs if (l_err) { 
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() failed an i2c read/write"); errlCommit( l_err, NVDIMM_COMP_ID ); nvdimmDisarm(i_nvdimmTargetList); return false; } // Unmask firs if the arm completed successfully if (o_arm_successful) { // Unmask MBACALFIR EventN and set to recoverable for (TargetHandleList::iterator it = i_nvdimmTargetList.begin(); it != i_nvdimmTargetList.end();) { TargetHandleList l_mcaList; getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA); assert(l_mcaList.size(), "nvdimmArm() failed to find parent MCA."); // Set MBACALFIR_ACTION0 to recoverable l_writeAddress = MBACALFIR_ACTION0_REG; l_writeData = 0; l_err = deviceRead(l_mcaList[0], &l_writeData, l_writeSize, DEVICE_SCOM_ADDRESS(l_writeAddress)); if(l_err) { TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed", l_writeAddress); errlCommit( l_err, NVDIMM_COMP_ID ); } l_writeData &= MBACALFIR_EVENTN_AND_BIT; l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize, DEVICE_SCOM_ADDRESS(l_writeAddress)); if(l_err) { TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed", l_writeAddress); errlCommit( l_err, NVDIMM_COMP_ID ); } // Set MBACALFIR_ACTION1 to recoverable l_writeAddress = MBACALFIR_ACTION1_REG; l_writeData = 0; l_err = deviceRead(l_mcaList[0], &l_writeData, l_writeSize, DEVICE_SCOM_ADDRESS(l_writeAddress)); if(l_err) { TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed", l_writeAddress); errlCommit( l_err, NVDIMM_COMP_ID ); } l_writeData |= MBACALFIR_EVENTN_OR_BIT; l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize, DEVICE_SCOM_ADDRESS(l_writeAddress)); if(l_err) { TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed", l_writeAddress); errlCommit( l_err, NVDIMM_COMP_ID ); } // Unmask MBACALFIR[8] l_writeAddress = MBACALFIR_AND_MASK_REG; l_writeData = MBACALFIR_UNMASK_BIT; l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize, DEVICE_SCOM_ADDRESS(l_writeAddress)); if(l_err) { TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed", l_writeAddress); 
errlCommit( l_err, NVDIMM_COMP_ID ); } it++; } } TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArm() returning %d", o_arm_successful); return o_arm_successful; } bool nvdimmDisarm(TargetHandleList &i_nvdimmTargetList) { bool o_disarm_successful = true; TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmDisarm() %d", i_nvdimmTargetList.size()); errlHndl_t l_err = nullptr; for (auto const l_nvdimm : i_nvdimmTargetList) { // skip if the nvdimm is already disarmed ATTR_NVDIMM_ARMED_type l_armed_state = {}; l_armed_state = l_nvdimm->getAttr(); if (!l_armed_state.armed) { TRACFCOMP(g_trac_nvdimm, "nvdimmDisarm() nvdimm[%X] called when already disarmed", get_huid(l_nvdimm)); continue; } l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER); // If we run into any error here we will just // commit the error log and move on. Let the // system continue to boot and let the user // salvage the data if (l_err) { // Committing the error as we don't want this to interrupt // the boot. This will notify the user that action is needed // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit( l_err, NVDIMM_COMP_ID ); o_disarm_successful = false; continue; } // Disarm successful, update armed status l_err = NVDIMM::notifyNvdimmProtectionChange(l_nvdimm, NVDIMM::NVDIMM_DISARMED); if (l_err) { l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); } } TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmDisarm() returning %d", o_disarm_successful); return o_disarm_successful; } /** * @brief Check nvdimm error state * * @param[in] i_nvdimm - nvdimm target * * @return bool - true if nvdimm is in any error state, false otherwise */ bool nvdimmInErrorState(Target *i_nvdimm) { TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmInErrorState() HUID[%X]",get_huid(i_nvdimm)); uint8_t l_statusFlag = i_nvdimm->getAttr(); bool l_ret = true; // Just checking bit 1 for now, need to investigate these // Should 
be checking NVDIMM_ARMED instead if ((l_statusFlag & NSTD_VAL_ERASED) == 0) { l_ret = false; } // Also check the encryption error status Target* l_sys = nullptr; targetService().getTopLevelTarget( l_sys ); assert(l_sys, "nvdimmInErrorState: no TopLevelTarget"); if (l_sys->getAttr()) { ATTR_NVDIMM_ARMED_type l_armed_state = {}; l_armed_state = i_nvdimm->getAttr(); if (l_armed_state.encryption_error_detected) { l_ret = true; } } TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmInErrorState() HUID[%X]",get_huid(i_nvdimm)); return l_ret; } // This could be made a generic utility errlHndl_t nvdimm_getDarnNumber(size_t i_genSize, uint8_t* o_genData) { assert(i_genSize % sizeof(uint64_t) == 0,"nvdimm_getDarnNumber() bad i_genSize"); errlHndl_t l_err = nullptr; uint64_t* l_darnData = reinterpret_cast(o_genData); for (uint32_t l_loop = 0; l_loop < (i_genSize / sizeof(uint64_t)); l_loop++) { // Darn could return an error code uint32_t l_darnErrors = 0; while (l_darnErrors < MAX_DARN_ERRORS) { // Get a 64-bit random number with the darn instruction l_darnData[l_loop] = getDarn(); if ( l_darnData[l_loop] != DARN_ERROR_CODE ) { break; } else { l_darnErrors++; } } if (l_darnErrors == MAX_DARN_ERRORS) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_getDarnNumber() reached MAX_DARN_ERRORS"); /*@ *@errortype *@reasoncode NVDIMM_ENCRYPTION_MAX_DARN_ERRORS *@severity ERRORLOG_SEV_PREDICTIVE *@moduleid NVDIMM_GET_DARN_NUMBER *@userdata1 MAX_DARN_ERRORS *@devdesc Error using darn instruction *@custdesc NVDIMM encryption error */ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, NVDIMM_GET_DARN_NUMBER, NVDIMM_ENCRYPTION_MAX_DARN_ERRORS, MAX_DARN_ERRORS, ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); break; } } return l_err; } errlHndl_t nvdimm_getRandom(uint8_t* o_genData) { errlHndl_t l_err = nullptr; uint8_t l_xtraData[ENC_KEY_SIZE] = {0}; do { // Get a random number with the darn instruction l_err = nvdimm_getDarnNumber(ENC_KEY_SIZE, o_genData); if 
(l_err) { break; } // Validate and update the random number // Retry if more randomness required do { //Get replacement data l_err = nvdimm_getDarnNumber(ENC_KEY_SIZE, l_xtraData); if (l_err) { break; } }while (nvdimm_keyifyRandomNumber(o_genData, l_xtraData)); }while (0); return l_err; } /* * @brief Check the ES (enery source)/backup power module(BPM) health status of * the individual NVDIMMs supplied in list * * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of * * @return false if one or more NVDIMMs fail ES health check, else true */ bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) { TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatus(): " "Target list size(%d)", i_nvdimmTargetList.size()); // The minimum ES lifetime value const uint8_t ES_LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97% // The ES health check status flags for the different states of an // ES health check const uint8_t ES_HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0 const uint8_t ES_HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1 const uint8_t ES_HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2 // Handle to catch any errors errlHndl_t l_err(nullptr); // The ES health check status from an ES health check call uint8_t l_esHealthCheck(0); // Status of the accumulation of all calls related to the ES health check. // If any one call is bad/fails, then this will be false, else it stays true bool l_didEsHealthCheckPass(true); // Iterate thru the NVDIMMs checking the ES health status of each one. // Going with the assumption that the caller waited the allotted time, // roughly 20 to 30 minutes, after the start of an IPL. // Success case: // * ES health check initiated at start of the IPL, caller waited the // allotted time (20 to 30 mins) before doing a health check, health // check returned success and the lifetime meets the minimum threshold // for a new BPM. 
// Error cases are: // * ES health check is in progress, will assume BPM is hung // * ES health check failed // * ES health check succeeded but lifetime does not meet a // certain threshold // * If none of the above apply (success case and other error cases), // then assume the ES health check was never initiated at the start // of the IPL // For each of these error cases do a predictive callout for (auto const l_nvdimm : i_nvdimmTargetList) { // Retrieve the Health Check status from the BPM TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " "Reading NVDIMM(0x%.8X) ES health check data, " "register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), ES_CMD_STATUS0); l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_esHealthCheck); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "NVDIMM(0x%X) failed to read the ES health check " "data, register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), ES_CMD_STATUS0); l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didEsHealthCheckPass = false; // Proceed to next NVDIMM, better luck next time continue; } // Trace out the returned data for inspection TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " "NVDIMM(0x%X) returned value(0x%.2X) from the ES health " "check data, register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), l_esHealthCheck, ES_CMD_STATUS0); if (l_esHealthCheck & ES_HEALTH_CHECK_IN_PROGRESS_FLAG) { TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Assuming caller waited the allotted time before " "doing an ES health check on NVDIMM(0x%.8X), the BPM " "is hung doing the ES health check.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE * @moduleid NVDIMM_ES_HEALTH_CHECK * @reasoncode NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE * @userdata1 HUID of NVDIMM target * @userdata2 ES health check status * @devdesc 
Assuming caller waited the allotted time before * doing an ES health check, then the BPM is hung doing * the ES health check. * @custdesc NVDIMM ES health check failed. */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, NVDIMM_ES_HEALTH_CHECK, NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE, get_huid(l_nvdimm), l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); nvdimmAddPage4Regs(l_nvdimm,l_err); // Collect the error errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didEsHealthCheckPass = false; } else if (l_esHealthCheck & ES_HEALTH_CHECK_FAILED_FLAG) { TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Assuming caller waited the allotted time before " "doing an ES health check on NVDIMM(0x%.8X), the BPM " "reported a failure.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE * @moduleid NVDIMM_ES_HEALTH_CHECK * @reasoncode NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE * @userdata1 HUID of NVDIMM target * @userdata2 ES health check status * @devdesc Assuming caller waited the allotted time before * doing an ES health check, the BPM reported a failure * while doing an ES health check. * @custdesc NVDIMM ES health check failed. 
*/ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, NVDIMM_ES_HEALTH_CHECK, NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE, get_huid(l_nvdimm), l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); nvdimmAddPage4Regs(l_nvdimm,l_err); // Collect the error errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didEsHealthCheckPass = false; } else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) { TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " "Reading NVDIMM(0x%.8X) ES lifetime data, " "register ES_LIFETIME(0x%.2X)", get_huid(l_nvdimm), ES_LIFETIME); // The lifetime percentage uint8_t l_lifetimePercentage(0); // Retrieve the Lifetime Percentage from the BPM l_err = nvdimmReadReg(l_nvdimm, ES_LIFETIME, l_lifetimePercentage); if (l_err) { TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "NVDIMM(0x%.8X) failed to read the " "ES_LIFETIME(0x%.2X) data", get_huid(l_nvdimm), ES_LIFETIME ); l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didEsHealthCheckPass = false; } else if (l_lifetimePercentage < ES_LIFETIME_MINIMUM_REQUIREMENT) { TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "ES health check on NVDIMM(0x%.8X) succeeded but " "the BPM's lifetime(%d) does not meet the minimum " "requirement(%d) needed to qualify as a new BPM.", get_huid(l_nvdimm), l_lifetimePercentage, ES_LIFETIME_MINIMUM_REQUIREMENT ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE * @moduleid NVDIMM_ES_HEALTH_CHECK * @reasoncode NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET * @userdata1[00:31] HUID of NVDIMM target * @userdata1[32:63] ES health check status * @userdata2[00:31] Retrieved lifetime percentage * @userdata2[32:63] lifetime minimum requirement * 
@devdesc ES health check succeeded but the BPM's * lifetime does not meet the minimum * requirement needed to qualify as a * new BPM. * @custdesc NVDIMM ES health check failed */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, NVDIMM_ES_HEALTH_CHECK, NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET, TWO_UINT32_TO_UINT64( get_huid(l_nvdimm), l_esHealthCheck), TWO_UINT32_TO_UINT64( l_lifetimePercentage, ES_LIFETIME_MINIMUM_REQUIREMENT), ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); nvdimmAddPage4Regs(l_nvdimm,l_err); // Collect the error errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didEsHealthCheckPass = false; } // end else if (l_lifetimePercentage ... else { TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Success: ES health check on NVDIMM(0x%.8X) " "succeeded and the BPM's lifetime(%d) meet's the " "minimum requirement(%d) needed to qualify as " "a new BPM.", get_huid(l_nvdimm), l_lifetimePercentage, ES_LIFETIME_MINIMUM_REQUIREMENT ); } } // end else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) else // Assume the ES health check was never initiated at // the start of the IPL. { TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "The ES health check on NVDIMM(0x%.8X) shows no status " "(in progress, fail or succeed) so assuming it was " "never initiated at the start of the IPL.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE * @moduleid NVDIMM_ES_HEALTH_CHECK * @reasoncode NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED * @userdata1 HUID of NVDIMM target * @userdata2 ES health check status * @devdesc The ES health check shows no status (in progress, * fail or succeed) so assuming it was never initiated * at the start of the IPL. * @custdesc NVDIMM ES health check failed. 
*/ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, NVDIMM_ES_HEALTH_CHECK, NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED, get_huid(l_nvdimm), l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); nvdimmAddVendorLog(l_nvdimm, l_err); // Add a BPM callout l_err->addPartCallout( l_nvdimm, HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); nvdimmAddPage4Regs(l_nvdimm,l_err); // Collect the error errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didEsHealthCheckPass = false; } } // end for (auto const l_nvdimm : i_nvdimmTargetList) // Should not have any uncommitted errors assert(l_err == NULL, "nvDimmEsCheckHealthStatus() - unexpected " "uncommitted error found" ); TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatus(): " "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false"); return l_didEsHealthCheckPass; } // end nvDimmEsCheckHealthStatus /** * @brief A wrapper around the call to nvDimmEsCheckHealthStatus * * @see nvDimmEsCheckHealthStatus for more details * * @return false if one or more NVDIMMs fail an ES health check, else true */ bool nvDimmEsCheckHealthStatusOnSystem() { TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatusOnSystem()"); // Get the list of NVDIMM Targets from the system TargetHandleList l_nvDimmTargetList; nvdimm_getNvdimmList(l_nvDimmTargetList); // Return status of doing a check health status bool l_didEsHealthCheckPass = nvDimmEsCheckHealthStatus(l_nvDimmTargetList); TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatusOnSystem(): " "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false" ); return l_didEsHealthCheckPass; } // end nvDimmCheckHealthStatusOnSystem /* * @brief Check the bad flash block percentage against a given maximum allowed. * * @details This returns a tristate - 1 pass, 2 different fails * If true is returned, then the check passed and * o_badFlashBlockPercentage will contain what the retrieved * flash block percentage is. 
*          If false is returned and the o_badFlashBlockPercentage is zero, then
 *          the check failed because of a register read fail
 *          If false is returned and the o_badFlashBlockPercentage is not zero,
 *          then the check failed because the retrieved bad flash block
 *          percentage exceeds the given maximum allowed
 *
 * @param[in] i_nvDimm - The NVDIMM to check
 * @param[in] i_maxPercentageAllowed - The maximum percentage of bad flash
 *                                     block allowed
 * @param[out] o_badFlashBlockPercentage - The retrieved bad flash block
 *                                         percentage from i_nvDimm, if no
 *                                         register read error.
 *
 * @return false if check failed or register read failed, else true
 */
bool nvDimmCheckBadFlashBlockPercentage(TargetHandle_t i_nvDimm,
                                        const uint8_t i_maxPercentageAllowed,
                                        uint8_t &o_badFlashBlockPercentage)
{
    // Cache the HUID of the NVDIMM
    uint32_t l_nvDimmHuid = get_huid( i_nvDimm );

    TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckBadFlashBlockPercentage(): "
              "NVDIMM(0x%.4X), max bad flash blocks allowed(%d)",
              l_nvDimmHuid,
              i_maxPercentageAllowed);

    // The status of the check on the bad block percentage
    bool l_didBadFlashBlockPercentageCheckPass(true);

    // The retrieved flash block percentage from register, initialize to zero
    o_badFlashBlockPercentage = 0;

    // Handle to catch any errors
    errlHndl_t l_err(nullptr);

    // Retrieve the percentage of bad blocks and validate
    TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
              "Reading NVDIMM(0x%.8X) percentage of bad blocks from "
              "register FLASH_BAD_BLK_PCT(0x%.4X)",
              l_nvDimmHuid, FLASH_BAD_BLK_PCT);

    // The register value is read straight into the caller's output parameter
    l_err = nvdimmReadReg(i_nvDimm,
                          FLASH_BAD_BLK_PCT,
                          o_badFlashBlockPercentage);

    if (l_err)
    {
        TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): "
                  "FAIL: NVDIMM(0x%.8X) failed to read the percentage of "
                  "bad blocks from register FLASH_BAD_BLK_PCT(0x%.4X), "
                  "marking as a fail",
                  l_nvDimmHuid, FLASH_BAD_BLK_PCT);

        // The read error is committed here rather than returned; the caller
        // only sees the boolean result and the zeroed percentage
        l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
        l_err->collectTrace(NVDIMM_COMP_NAME);
        errlCommit(l_err, NVDIMM_COMP_ID);

        // Set up the fail state, so caller can determine that the fail was
        // due to a register read error
        // The percentage is forced back to zero in case the failed read
        // left something in the output parameter
        l_didBadFlashBlockPercentageCheckPass = false;
        o_badFlashBlockPercentage = 0;
    }
    else
    {
        // Trace out the returned data for inspection
        TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
                  "NVDIMM(0x%.8X) returned value (%d) from the "
                  "percentage of bad blocks, register "
                  "FLASH_BAD_BLK_PCT(0x%.4X)",
                  l_nvDimmHuid,
                  o_badFlashBlockPercentage,
                  FLASH_BAD_BLK_PCT);

        // Check to see if the bad flash block percentage
        // exceeds maximum allowed.
        // A value equal to the maximum still passes (strict greater-than)
        if (o_badFlashBlockPercentage > i_maxPercentageAllowed)
        {
            TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): "
                      "FAIL: For NVDIMM (0x%.8X), the percentage of bad "
                      "flash blocks (%d), read from register "
                      "FLASH_BAD_BLK_PCT(0x%.4X), exceeds the maximum "
                      "percentage of bad flash blocks allowed (%d), marking "
                      "this as a fail",
                      l_nvDimmHuid,
                      o_badFlashBlockPercentage,
                      FLASH_BAD_BLK_PCT,
                      i_maxPercentageAllowed);

            // Set up the fail state, so caller can determine that the fail was
            // due to percentage exceeding the max percentage allowed.
            // Note: Leave the value in o_badFlashBlockPercentage so caller
            //       can inspect, if they wish
            l_didBadFlashBlockPercentageCheckPass = false;
        }
        else
        {
            TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
                      "SUCCESS: For NVDIMM (0x%.8X), the percentage of bad "
                      "flash blocks (%d) is less than or meets the maximum "
                      "percentage of bad flash blocks allowed (%d), "
                      "marking this as a pass",
                      l_nvDimmHuid,
                      o_badFlashBlockPercentage,
                      i_maxPercentageAllowed);

            // Set up the pass state
            // Note: Leave the value in o_badFlashBlockPercentage so caller
            //       can inspect, if they wish
            l_didBadFlashBlockPercentageCheckPass = true;
        } // end if (o_badFlashBlockPercentage > i_maxPercentageAllowed)
    } // end if (l_err) ... else

    TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckBadFlashBlockPercentage(): "
              "Returning %s",
              l_didBadFlashBlockPercentageCheckPass == true ?
"true" : "false" ); return l_didBadFlashBlockPercentageCheckPass; } /* * @brief Check the flash error count against a given maximum allowed. * * @details This returns a tristate - 1 pass, 2 different fails * If true is returned, then the check passed and * o_readFlashErrorCount will contain what the retrieved * flash error count is. * If false is returned and the o_readFlashErrorCount is zero, then * the check failed because of a register read fail * If false is returned and the o_readFlashErrorCount is not zero, * then the check failed because the retrieved flash error * count exceeds the given maximum allowed * * @param[in] i_nvDimm - The NVDIMM to check * @param[in] i_maxFlashErrorsAllowed - The maximum number of flash errors * allowed * @param[out] o_readFlashErrorCount - The retrieved bad flash error * count from i_nvDimm, if no * register read error. * * @return false if check failed or register read failed, else true */ bool nvDimmCheckFlashErrorCount(TargetHandle_t i_nvDimm, const uint32_t i_maxFlashErrorsAllowed, uint32_t &o_readFlashErrorCount) { // Cache the HUID of the NVDIMM uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckFlashErrorCount(): " "NVDIMM(0x%.4X), max flash errors allowed(%d)", l_nvDimmHuid, i_maxFlashErrorsAllowed); // The status of the check on the flash error count bool l_didFlashErrorCountCheckPass(true); // The retrieved flash error count from register, initialize to zero o_readFlashErrorCount = 0; // Handle to catch any errors errlHndl_t l_err(nullptr); // The retrieved flash error count from a register uint8_t l_readFlashErrorCountByte(0); // Read the flash error count registers starting from MSB to LSB for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; l_flashErrorRegister >= FLASH_ERROR_COUNT0; --l_flashErrorRegister) { // Reset this for every iteration, may be redundant l_readFlashErrorCountByte = 0; TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " "Reading 
NVDIMM(0x%.8X) flash error count from " "register FLASH_ERROR_COUNT(0x%.4X)", l_nvDimmHuid, l_flashErrorRegister); l_err = nvdimmReadReg(i_nvDimm, static_cast(l_flashErrorRegister), l_readFlashErrorCountByte); if (l_err) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " "FAIL: NVDIMM(0x%.8X) failed to read flash error " "count from register FLASH_ERROR_COUNT(0x%.4X) " "marking as a fail", l_nvDimmHuid, l_flashErrorRegister); l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); l_err->collectTrace(NVDIMM_COMP_NAME); errlCommit(l_err, NVDIMM_COMP_ID); // Set up the fail state, so caller can determine that the fail was // due to a register read error l_didFlashErrorCountCheckPass = false; o_readFlashErrorCount = 0; break; } // If we get here, then the read was successful // Append the read flash error count byte to the LSB of the // aggregated flash error count bytes. o_readFlashErrorCount = (o_readFlashErrorCount << 8) | l_readFlashErrorCountByte; TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " "NVDIMM(0x%.8X) returned value (0x%.2X) from the " "partial flash error count, register " "FLASH_ERROR_COUNT(0x%.4X)", l_nvDimmHuid, l_readFlashErrorCountByte, l_flashErrorRegister); } // end for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; ... 
// If o_readFlashErrorCount is not zero, then register read was successful if (o_readFlashErrorCount) { TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " "NVDIMM(0x%.8X) flash error count = %d ", l_nvDimmHuid, o_readFlashErrorCount); // Check the validity of the flash error count if (o_readFlashErrorCount > i_maxFlashErrorsAllowed) { TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " "FAIL: For NVDIMM (0x%.8X), the flash error count (%d), " "read from registers FLASH_ERROR_COUNT0(0x%.4X), " "FLASH_ERROR_COUNT1(0x%.4X) and FLASH_ERROR_COUNT2(0x%.4X), " "exceeds the maximum number of flash " "errors allowed (%d), marking this as a fail", l_nvDimmHuid, o_readFlashErrorCount, FLASH_ERROR_COUNT0, FLASH_ERROR_COUNT1, FLASH_ERROR_COUNT2, i_maxFlashErrorsAllowed); // Set up the fail state, so caller can determine that the fail was // due to error count exceeding the max errors allowed. // Note: Leave the value in o_readFlashErrorCount so caller // can inspect, if they wish l_didFlashErrorCountCheckPass = false; } else { TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " "SUCCESS: For NVDIMM(0x%.8X), the flash error counts " "(%d) is less than or meets the maximum number of " "errors allowed (%d), marking this as a pass", l_nvDimmHuid, o_readFlashErrorCount, i_maxFlashErrorsAllowed); // Set up the pass state // Note: Leave the value in o_readFlashErrorCount so caller // can inspect, if they wish l_didFlashErrorCountCheckPass = true; } } // end if (o_readFlashErrorCount) TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckFlashErrorCount(): " "Returning %s", l_didFlashErrorCountCheckPass == true ? "true" : "false" ); return l_didFlashErrorCountCheckPass; } /* * @brief Check the NVM (non-volatile memory)/flash health of the individual * NVDIMMs supplied in list. 
* * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of flash * * @return false if one or more NVDIMMs fail NVM health check, else true */ bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList) { TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatus(): " "Target list size(%d)", i_nvDimmTargetList.size()); // The following maximums are the same values used by SMART's // manufacturing and recommended that we use. // The maximum percentage of bad flash blocks // Fail if over 19% of bad flash blocks is encountered const uint8_t MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED = 19; // The maximum number of flash memory errors allowed // Fail if over 300 flash memory errors is encountered const uint32_t MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED = 300; // Status of the accumulation of all calls related to the NVM health check. // If any one call is bad/fails, then this will be false, else it stays true bool l_didNvmHealthCheckPass(true); // Handle to catch any errors errlHndl_t l_err(nullptr); // The retrieved flash block percentage from register uint8_t l_badFlashBlockPercentage(0); // The retrieved flash error count from register uint32_t l_flashErrorCount(0); // The status of the checks on the percentage of bad blocks and // flash error count // Default to true bool l_badFlashBlockPercentageCheckPassed(true); bool l_flashErrorCountCheckPassed(true); // Iterate thru the supplied NVDIMMs checking the health of the NVM for (auto const l_nvDimm : i_nvDimmTargetList) { // Cache the HUID of the NVDIMM uint32_t l_nvDimmHuid = get_huid( l_nvDimm ); // Reset these for every NVDIMM that is checked l_badFlashBlockPercentage = 0; l_flashErrorCount = 0; l_badFlashBlockPercentageCheckPassed = true; l_flashErrorCountCheckPassed = true; // Check the validity of bad flash block percentage if (!nvDimmCheckBadFlashBlockPercentage( l_nvDimm, MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED, l_badFlashBlockPercentage)) { // Set this to false 
to indicate that the overall check on the // NVDIMMs had at least one failure l_didNvmHealthCheckPass = false; // If no data in the variable l_badFlashBlockPercentage, then // this is a read register fail. Move onto the next NVDIMM // this is a dud if (!l_badFlashBlockPercentage) { continue; } // Set the check to false, to facilitate error reporting l_badFlashBlockPercentageCheckPassed = false; } // Check the validity of the flash error count if (!nvDimmCheckFlashErrorCount( l_nvDimm, MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED, l_flashErrorCount)) { // Set this to false to indicate that the overall check on the // NVDIMMs had at least one failure l_didNvmHealthCheckPass = false; // If no data in the variable l_flashErrorCount, then // this is a read register fail. Move onto the next NVDIMM // this is a dud if (!l_flashErrorCount) { continue; } // Set the check to false, to facilitate error reporting l_flashErrorCountCheckPassed = false; } /// Now we assess the health of the flash based on data gathered above if ( !l_badFlashBlockPercentageCheckPassed || !l_flashErrorCountCheckPassed ) { // First set the NVDIMM HUID to the first 32 bits of user data 1 uint64_t l_badFlashBlockPercentageUserData1 = TWO_UINT32_TO_UINT64(l_nvDimmHuid, 0); // If an issue with the bad flash block percentage, then append // data to user data 1 if (!l_badFlashBlockPercentageCheckPassed && l_badFlashBlockPercentage) { // Setting the HUID here is redundant but easier than trying to // do some clever code that will set the HUID for user data 1 // when this path is not taken, but the next check on the flash // error count is taken l_badFlashBlockPercentageUserData1 = TWO_UINT32_TO_UINT64(l_nvDimmHuid, TWO_UINT16_TO_UINT32( l_badFlashBlockPercentage, MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED)); } // If an issue with the flash error count, then set user // data 2 to contain the flash error count value uint64_t l_flashErrorCountUserData2(0); if (!l_flashErrorCountCheckPassed && 
l_flashErrorCount) { l_flashErrorCountUserData2 = TWO_UINT32_TO_UINT64(l_flashErrorCount, MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED); } /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE * @moduleid NVDIMM_NVM_HEALTH_CHECK * @reasoncode NVDIMM_NVM_HEALTH_CHECK_FAILED * @userdata1[0:31] HUID of NVDIMM target * @userdata1[32:47] The retrieved bad flash block percentage, * if error with, else 0 * @userdata1[48:63] The maximum percentage of bad flash blocks * allowed, if bad flash block percentage * exceeds this maximum, else 0 * @userdata2[0:31] The retrieved flash error count, * if error with, else 0 * @userdata2[32:63] The maximum number of flash errors * allowed, if flash error exceeds this * maximum, else 0 * @devdesc Either the NVDIMM NVM bad flash block * percentage exceeded the maximum percentage * allowed or the NVDIMM NVM number of flash * error exceeds the maximum count allowed * or both. * @custdesc NVDIMM NVM health check failed. */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, NVDIMM_NVM_HEALTH_CHECK, NVDIMM_NVM_HEALTH_CHECK_FAILED, l_badFlashBlockPercentageUserData1, l_flashErrorCountUserData2, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); nvdimmAddVendorLog(l_nvDimm, l_err); // Add a DIMM callout l_err->addHwCallout( l_nvDimm, HWAS::SRCI_PRIORITY_HIGH, HWAS::NO_DECONFIG, HWAS::GARD_NULL ); // Collect the error errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss l_didNvmHealthCheckPass = false; } else { // This NVDIMM passed the NVM health check TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmNvmCheckHealthStatus(): " "Success: NVDIMM (0x%.8X) passed the NVM health check.", l_nvDimmHuid); } // end if ( !l_badFlashBlockPercentageCheckPassed .. 
else } // end for (auto const l_nvdimm : i_nvdimmTargetList) // Should not have any uncommitted errors assert(l_err == NULL, "nvDimmNvmCheckHealthStatus() - unexpected " "uncommitted error found"); TRACFCOMP(g_trac_nvdimm,EXIT_MRK"nvDimmNvmCheckHealthStatus(): Returning %s", l_didNvmHealthCheckPass == true ? "true" : "false" ); return l_didNvmHealthCheckPass; } // end nvDimmNvmCheckHealthStatus /** * @brief A wrapper around the call to nvDimmNvmCheckHealthStatus * * @see nvDimmNvmCheckHealthStatus for more details * * @return false if one or more NVDIMMs fail an NVM health check, else true */ bool nvDimmNvmCheckHealthStatusOnSystem() { TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatusOnSystem()"); // Get the list of NVDIMM Targets from the system TargetHandleList l_nvDimmTargetList; nvdimm_getNvdimmList(l_nvDimmTargetList); // Return status of doing a check health status bool l_didNvmHealthCheckPass = nvDimmNvmCheckHealthStatus(l_nvDimmTargetList); TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmNvmCheckHealthStatusOnSystem(): " "Returning %s", l_didNvmHealthCheckPass == true ? "true" : "false" ); return l_didNvmHealthCheckPass; } // end nvDimmCheckHealthStatusOnSystem } // end NVDIMM namespace