diff options
Diffstat (limited to 'src/usr/isteps/nvdimm/runtime/nvdimm_rt.C')
-rw-r--r-- | src/usr/isteps/nvdimm/runtime/nvdimm_rt.C | 1222 |
1 files changed, 904 insertions, 318 deletions
diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C index 267fab07c..e8ad1d9e9 100644 --- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C +++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C @@ -25,446 +25,1032 @@ /** * @file nvdimm_rt.C * - * @brief NVDIMM functions only needed for runtime + * @brief NVDIMM functions only needed for runtime. These functions include + * but are not limited to arming/disarming the NVDIMM along with methods + * to poll the arming and check the status of the arming. Checking the + * error state of the NVDIMM, getting a random number with the darn + * instruction and checking the ES or NVM health status. */ + +/// BPM - Backup Power Module + #include <trace/interface.H> #include <errl/errlentry.H> #include <errl/errlmanager.H> +#include <errl/errludstring.H> #include <util/runtime/rt_fwreq_helper.H> #include <targeting/common/attributes.H> #include <targeting/common/commontargeting.H> #include <targeting/common/util.H> #include <targeting/common/utilFilter.H> -#include <usr/runtime/rt_targeting.H> +#include <targeting/runtime/rt_targeting.H> #include <runtime/interface.h> +#include <arch/ppc.H> #include <isteps/nvdimm/nvdimmreasoncodes.H> +#include "../errlud_nvdimm.H" +#include "../nvdimmErrorLog.H" #include <isteps/nvdimm/nvdimm.H> // implements some of these #include "../nvdimm.H" // for g_trac_nvdimm +#include <sys/time.h> //#define TRACUCOMP(args...) TRACFCOMP(args) #define TRACUCOMP(args...) +using namespace TARGETING; +using namespace ERRORLOG; + namespace NVDIMM { +static constexpr uint64_t DARN_ERROR_CODE = 0xFFFFFFFFFFFFFFFFull; +static constexpr uint32_t MAX_DARN_ERRORS = 10; + /** -* @brief Notify PHYP of NVDIMM OCC protection status -*/ -errlHndl_t notifyNvdimmProtectionChange(TARGETING::Target* i_target, - const nvdimm_protection_t i_state) + * @brief Check nvdimm error state + * + * @param[in] i_nvdimm - nvdimm target + * + * @return bool - true if nvdimm is in any error state, false otherwise + */ +bool nvdimmInErrorState(Target *i_nvdimm) { - errlHndl_t l_err = nullptr; + TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmInErrorState() HUID[%X]",get_huid(i_nvdimm)); - // default to send a not protected status - uint64_t l_nvdimm_protection_state = - hostInterfaces::HBRT_FW_NVDIMM_NOT_PROTECTED; + uint8_t l_statusFlag = i_nvdimm->getAttr<ATTR_NV_STATUS_FLAG>(); + bool l_ret = true; - TRACFCOMP( g_trac_nvdimm, ENTER_MRK - "notifyNvdimmProtectionChange: Target huid 0x%.8X, state %d", - get_huid(i_target), i_state); - do + // Just checking bit 1 for now, need to investigate these + // Should be checking NVDIMM_ARMED instead + if ((l_statusFlag & NSTD_VAL_ERASED) == 0) { - TARGETING::TargetHandleList l_nvdimmTargetList = - TARGETING::getProcNVDIMMs(i_target); + l_ret = false; + } - // Only send command if the processor has an NVDIMM under it - if (l_nvdimmTargetList.empty()) + // Also check the encryption error status + Target* l_sys = nullptr; + targetService().getTopLevelTarget( l_sys ); + assert(l_sys, "nvdimmInErrorState: no TopLevelTarget"); + if (l_sys->getAttr<ATTR_NVDIMM_ENCRYPTION_ENABLE>()) + { + ATTR_NVDIMM_ARMED_type l_armed_state = {}; + l_armed_state = i_nvdimm->getAttr<ATTR_NVDIMM_ARMED>(); + if (l_armed_state.encryption_error_detected) { - TRACFCOMP( g_trac_nvdimm, - "notifyNvdimmProtectionChange: No NVDIMM found under processor 0x%.8X", - get_huid(i_target)); - break; + l_ret = true; } + } + + TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmInErrorState() HUID[%X]",get_huid(i_nvdimm)); + return l_ret; +} + - TARGETING::ATTR_NVDIMM_ARMED_type l_nvdimm_armed_state = - i_target->getAttr<TARGETING::ATTR_NVDIMM_ARMED>(); +// This could be made a generic utility +errlHndl_t nvdimm_getDarnNumber(size_t i_genSize, uint8_t* o_genData) +{ + assert(i_genSize % sizeof(uint64_t) == 0,"nvdimm_getDarnNumber() bad i_genSize"); - // Only notify protected state if NVDIMM controllers are - // armed and no error was or is detected - if (i_state == NVDIMM::PROTECTED) + errlHndl_t l_err = nullptr; + uint64_t* l_darnData = reinterpret_cast<uint64_t*>(o_genData); + + for (uint32_t l_loop = 0; l_loop < (i_genSize / sizeof(uint64_t)); l_loop++) + { + // Darn could return an error code + uint32_t l_darnErrors = 0; + + while (l_darnErrors < MAX_DARN_ERRORS) { - // Exit without notifying phyp if in error state - if (l_nvdimm_armed_state.error_detected) + // Get a 64-bit random number with the darn instruction + l_darnData[l_loop] = getDarn(); + + if ( l_darnData[l_loop] != DARN_ERROR_CODE ) { - // State can't go to protected after error is detected break; } - // check if we need to rearm the NVDIMM(s) - else if (!l_nvdimm_armed_state.armed) - { - bool nvdimms_armed = - NVDIMM::nvdimmArm(l_nvdimmTargetList); - if (nvdimms_armed) - { - // NVDIMMs are now armed and ready for backup - l_nvdimm_armed_state.armed = 1; - i_target->setAttr<TARGETING::ATTR_NVDIMM_ARMED>(l_nvdimm_armed_state); - - l_nvdimm_protection_state = hostInterfaces::HBRT_FW_NVDIMM_PROTECTED; - } - else - { - // If nvdimm arming failed, - // do NOT post that the dimms are now protected. - - // Remember this error, only try arming once - if (!l_nvdimm_armed_state.error_detected) - { - l_nvdimm_armed_state.error_detected = 1; - i_target->setAttr<TARGETING::ATTR_NVDIMM_ARMED>(l_nvdimm_armed_state); - } - - // Exit without notifying phyp of any protection change - break; - } - } else { - // NVDIMM already armed and no error found - l_nvdimm_protection_state = hostInterfaces::HBRT_FW_NVDIMM_PROTECTED; + l_darnErrors++; } } - else if (i_state == NVDIMM::UNPROTECTED_BECAUSE_ERROR) + + if (l_darnErrors == MAX_DARN_ERRORS) { - // Remember that this NV controller has an error so - // we don't rearm this until next IPL - if (!l_nvdimm_armed_state.error_detected) - { - l_nvdimm_armed_state.error_detected = 1; - i_target->setAttr<TARGETING::ATTR_NVDIMM_ARMED>(l_nvdimm_armed_state); - } - // still notify phyp that NVDIMM is Not Protected + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_getDarnNumber() reached MAX_DARN_ERRORS"); + /*@ + *@errortype + *@reasoncode NVDIMM_ENCRYPTION_MAX_DARN_ERRORS + *@severity ERRORLOG_SEV_PREDICTIVE + *@moduleid NVDIMM_GET_DARN_NUMBER + *@userdata1 MAX_DARN_ERRORS + *@devdesc Error using darn instruction + *@custdesc NVDIMM encryption error + */ + l_err = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, + NVDIMM_GET_DARN_NUMBER, + NVDIMM_ENCRYPTION_MAX_DARN_ERRORS, + MAX_DARN_ERRORS, + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace(NVDIMM_COMP_NAME); + break; } + } + return l_err; +} - // Get the Proc Chip Id - RT_TARG::rtChipId_t l_chipId = 0; - l_err = RT_TARG::getRtTarget(i_target, l_chipId); - if(l_err) +errlHndl_t nvdimm_getRandom(uint8_t* o_genData) +{ + errlHndl_t l_err = nullptr; + uint8_t l_xtraData[ENC_KEY_SIZE] = {0}; + + do + { + // Get a random number with the darn instruction + l_err = nvdimm_getDarnNumber(ENC_KEY_SIZE, o_genData); + if (l_err) { - TRACFCOMP( g_trac_nvdimm, - ERR_MRK"notifyNvdimmProtectionChange: getRtTarget ERROR" ); break; } - // send the notification msg - if ((nullptr == g_hostInterfaces) || - (nullptr == g_hostInterfaces->firmware_request)) + // Validate and update the random number + // Retry if more randomness required + do { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"notifyNvdimmProtectionChange: " - "Hypervisor firmware_request interface not linked"); + //Get replacement data + l_err = nvdimm_getDarnNumber(ENC_KEY_SIZE, l_xtraData); + if (l_err) + { + break; + } + + }while (nvdimm_keyifyRandomNumber(o_genData, l_xtraData)); + + }while (0); + + return l_err; +} + +/* + * @brief Check the ES (enery source)/backup power module(BPM) health status of + * the individual NVDIMMs supplied in list + * + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of + * + * @return false if one or more NVDIMMs fail ES health check, else true + */ +bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) +{ + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatus(): " + "Target list size(%d)", i_nvdimmTargetList.size()); + + // The minimum ES lifetime value + const uint8_t ES_LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97% + + // The ES health check status flags for the different states of an + // ES health check + const uint8_t ES_HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0 + const uint8_t ES_HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1 + const uint8_t ES_HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2 - // need to safely convert struct type into uint32_t - union { - TARGETING::ATTR_NVDIMM_ARMED_type tNvdimmArmed; - uint32_t nvdimmArmed_int; - } armed_state_union; - armed_state_union.tNvdimmArmed = l_nvdimm_armed_state; + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // The ES health check status from an ES health check call + uint8_t l_esHealthCheck(0); + + // Status of the accumulation of all calls related to the ES health check. + // If any one call is bad/fails, then this will be false, else it stays true + bool l_didEsHealthCheckPass(true); + + // Iterate thru the NVDIMMs checking the ES health status of each one. + // Going with the assumption that the caller waited the allotted time, + // roughly 20 to 30 minutes, after the start of an IPL. + // Success case: + // * ES health check initiated at start of the IPL, caller waited the + // allotted time (20 to 30 mins) before doing a health check, health + // check returned success and the lifetime meets the minimum threshold + // for a new BPM. + // Error cases are: + // * ES health check is in progress, will assume BPM is hung + // * ES health check failed + // * ES health check succeeded but lifetime does not meet a + // certain threshold + // * If none of the above apply (success case and other error cases), + // then assume the ES health check was never initiated at the start + // of the IPL + // For each of these error cases do a predictive callout + for (auto const l_nvdimm : i_nvdimmTargetList) + { + // Retrieve the Health Check status from the BPM + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "Reading NVDIMM(0x%.8X) ES health check data, " + "register ES_CMD_STATUS0(0x%.2X)", + get_huid(l_nvdimm), ES_CMD_STATUS0); + + l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_esHealthCheck); + + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%X) failed to read the ES health check " + "data, register ES_CMD_STATUS0(0x%.2X)", + get_huid(l_nvdimm), ES_CMD_STATUS0); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didEsHealthCheckPass = false; + + // Proceed to next NVDIMM, better luck next time + continue; + } + + // Trace out the returned data for inspection + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%X) returned value(0x%.2X) from the ES health " + "check data, register ES_CMD_STATUS0(0x%.2X)", + get_huid(l_nvdimm), l_esHealthCheck, ES_CMD_STATUS0); + + if (l_esHealthCheck & ES_HEALTH_CHECK_IN_PROGRESS_FLAG) + { + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "Assuming caller waited the allotted time before " + "doing an ES health check on NVDIMM(0x%.8X), the BPM " + "is hung doing the ES health check.", + get_huid(l_nvdimm) ); /*@ * @errortype - * @severity ERRL_SEV_PREDICTIVE - * @moduleid NOTIFY_NVDIMM_PROTECTION_CHG - * @reasoncode NVDIMM_NULL_FIRMWARE_REQUEST_PTR - * @userdata1 HUID of processor target - * @userdata2[0:31] Requested protection state - * @userdata2[32:63] Current armed state - * @devdesc Unable to inform PHYP of NVDIMM protection - * @custdesc Internal firmware error + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE + * @userdata1 HUID of NVDIMM target + * @userdata2 ES health check status + * @devdesc Assuming caller waited the allotted time before + * doing an ES health check, then the BPM is hung doing + * the ES health check. + * @custdesc NVDIMM ES health check failed. */ - l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, - NOTIFY_NVDIMM_PROTECTION_CHG, - NVDIMM_NULL_FIRMWARE_REQUEST_PTR, - get_huid(i_target), - TWO_UINT32_TO_UINT64( - l_nvdimm_protection_state, - armed_state_union.nvdimmArmed_int) - ); - - l_err->addProcedureCallout(HWAS::EPUB_PRC_PHYP_CODE, + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE, + get_huid(l_nvdimm), + l_esHealthCheck, + ErrlEntry::NO_SW_CALLOUT ); + l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); + + // Add a BPM callout + l_err->addPartCallout( l_nvdimm, + HWAS::BPM_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + nvdimmAddPage4Regs(l_nvdimm,l_err); + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didEsHealthCheckPass = false; + } + else if (l_esHealthCheck & ES_HEALTH_CHECK_FAILED_FLAG) + { + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "Assuming caller waited the allotted time before " + "doing an ES health check on NVDIMM(0x%.8X), the BPM " + "reported a failure.", + get_huid(l_nvdimm) ); + + /*@ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE + * @userdata1 HUID of NVDIMM target + * @userdata2 ES health check status + * @devdesc Assuming caller waited the allotted time before + * doing an ES health check, the BPM reported a failure + * while doing an ES health check. + * @custdesc NVDIMM ES health check failed. + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE, + get_huid(l_nvdimm), + l_esHealthCheck, + ErrlEntry::NO_SW_CALLOUT ); + l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); + + // Add a BPM callout + l_err->addPartCallout( l_nvdimm, + HWAS::BPM_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + nvdimmAddPage4Regs(l_nvdimm,l_err); + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didEsHealthCheckPass = false; + } + else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "Reading NVDIMM(0x%.8X) ES lifetime data, " + "register ES_LIFETIME(0x%.2X)", + get_huid(l_nvdimm), ES_LIFETIME); + + // The lifetime percentage + uint8_t l_lifetimePercentage(0); + + // Retrieve the Lifetime Percentage from the BPM + l_err = nvdimmReadReg(l_nvdimm, ES_LIFETIME, l_lifetimePercentage); + + if (l_err) + { + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%.8X) failed to read the " + "ES_LIFETIME(0x%.2X) data", + get_huid(l_nvdimm), + ES_LIFETIME ); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didEsHealthCheckPass = false; + } + else if (l_lifetimePercentage < ES_LIFETIME_MINIMUM_REQUIREMENT) + { + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "ES health check on NVDIMM(0x%.8X) succeeded but " + "the BPM's lifetime(%d) does not meet the minimum " + "requirement(%d) needed to qualify as a new BPM.", + get_huid(l_nvdimm), + l_lifetimePercentage, + ES_LIFETIME_MINIMUM_REQUIREMENT ); + + /*@ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET + * @userdata1[00:31] HUID of NVDIMM target + * @userdata1[32:63] ES health check status + * @userdata2[00:31] Retrieved lifetime percentage + * @userdata2[32:63] lifetime minimum requirement + * @devdesc ES health check succeeded but the BPM's + * lifetime does not meet the minimum + * requirement needed to qualify as a + * new BPM. + * @custdesc NVDIMM ES health check failed + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET, + TWO_UINT32_TO_UINT64( + get_huid(l_nvdimm), + l_esHealthCheck), + TWO_UINT32_TO_UINT64( + l_lifetimePercentage, + ES_LIFETIME_MINIMUM_REQUIREMENT), + ErrlEntry::NO_SW_CALLOUT ); + l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); + + // Add a BPM callout + l_err->addPartCallout( l_nvdimm, + HWAS::BPM_PART_TYPE, HWAS::SRCI_PRIORITY_HIGH); + nvdimmAddPage4Regs(l_nvdimm,l_err); + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didEsHealthCheckPass = false; + } // end else if (l_lifetimePercentage ... + else + { + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "Success: ES health check on NVDIMM(0x%.8X) " + "succeeded and the BPM's lifetime(%d) meet's the " + "minimum requirement(%d) needed to qualify as " + "a new BPM.", + get_huid(l_nvdimm), + l_lifetimePercentage, + ES_LIFETIME_MINIMUM_REQUIREMENT ); + } + } // end else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) + else // Assume the ES health check was never initiated at + // the start of the IPL. + { + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "The ES health check on NVDIMM(0x%.8X) shows no status " + "(in progress, fail or succeed) so assuming it was " + "never initiated at the start of the IPL.", + get_huid(l_nvdimm) ); + + /*@ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED + * @userdata1 HUID of NVDIMM target + * @userdata2 ES health check status + * @devdesc The ES health check shows no status (in progress, + * fail or succeed) so assuming it was never initiated + * at the start of the IPL. + * @custdesc NVDIMM ES health check failed. + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED, + get_huid(l_nvdimm), + l_esHealthCheck, + ErrlEntry::NO_SW_CALLOUT ); + l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvdimm, l_err); - break; + // Add a BPM callout + l_err->addPartCallout( l_nvdimm, + HWAS::BPM_PART_TYPE, + HWAS::SRCI_PRIORITY_HIGH); + nvdimmAddPage4Regs(l_nvdimm,l_err); + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didEsHealthCheckPass = false; } + } // end for (auto const l_nvdimm : i_nvdimmTargetList) - TRACFCOMP( g_trac_nvdimm, - "notifyNvdimmProtectionChange: 0x%.8X processor NVDIMMS are " - "%s protected (current armed_state: 0x%02X)", - get_huid(i_target), - (l_nvdimm_protection_state == hostInterfaces::HBRT_FW_NVDIMM_PROTECTED)?"now":"NOT", - l_nvdimm_armed_state ); - - // Create the firmware_request request struct to send data - hostInterfaces::hbrt_fw_msg l_req_fw_msg; - memset(&l_req_fw_msg, 0, sizeof(l_req_fw_msg)); // clear it all - - // actual msg size (one type of hbrt_fw_msg) - uint64_t l_req_fw_msg_size = hostInterfaces::HBRT_FW_MSG_BASE_SIZE + - sizeof(l_req_fw_msg.nvdimm_protection_state); - - // Populate the firmware_request request struct with given data - l_req_fw_msg.io_type = - hostInterfaces::HBRT_FW_MSG_TYPE_NVDIMM_PROTECTION; - l_req_fw_msg.nvdimm_protection_state.i_procId = l_chipId; - l_req_fw_msg.nvdimm_protection_state.i_state = - l_nvdimm_protection_state; - - // Create the firmware_request response struct to receive data - hostInterfaces::hbrt_fw_msg l_resp_fw_msg; - uint64_t l_resp_fw_msg_size = sizeof(l_resp_fw_msg); - memset(&l_resp_fw_msg, 0, l_resp_fw_msg_size); - - // Make the firmware_request call - l_err = firmware_request_helper(l_req_fw_msg_size, - &l_req_fw_msg, - &l_resp_fw_msg_size, - &l_resp_fw_msg); - - } while (0); - - TRACFCOMP( g_trac_nvdimm, - EXIT_MRK "notifyNvdimmProtectionChange(%.8X, %d) - ERRL %.8X:%.4X", - get_huid(i_target), i_state, - ERRL_GETEID_SAFE(l_err), ERRL_GETRC_SAFE(l_err) ); + // Should not have any uncommitted errors + assert(l_err == NULL, "nvDimmEsCheckHealthStatus() - unexpected " + "uncommitted error found" ); - return l_err; -} + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatus(): " + "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false"); + + return l_didEsHealthCheckPass; +} // end nvDimmEsCheckHealthStatus /** - * @brief This function polls the command status register for arm completion - * (does not indicate success or fail) + * @brief A wrapper around the call to nvDimmEsCheckHealthStatus * - * @param[in] i_nvdimm - nvdimm target with NV controller + * @see nvDimmEsCheckHealthStatus for more details * - * @param[out] o_poll - total polled time in ms - * - * @return errlHndl_t - Null if successful, otherwise a pointer to - * the error log. + * @return false if one or more NVDIMMs fail an ES health check, else true */ -errlHndl_t nvdimmPollArmDone(TARGETING::Target* i_nvdimm, - uint32_t &o_poll) +bool nvDimmEsCheckHealthStatusOnSystem() { - TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmPollArmDone() nvdimm[%X]", TARGETING::get_huid(i_nvdimm) ); + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatusOnSystem()"); - errlHndl_t l_err = nullptr; + // Get the list of NVDIMM Targets from the system + TargetHandleList l_nvDimmTargetList; + nvdimm_getNvdimmList(l_nvDimmTargetList); - l_err = nvdimmPollStatus ( i_nvdimm, ARM, o_poll); + // Return status of doing a check health status + bool l_didEsHealthCheckPass = nvDimmEsCheckHealthStatus(l_nvDimmTargetList); - TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollArmDone() nvdimm[%X]", - TARGETING::get_huid(i_nvdimm)); + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatusOnSystem(): " + "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false" ); - return l_err; -} + return l_didEsHealthCheckPass; +} // end nvDimmCheckHealthStatusOnSystem -/** - * @brief This function checks the arm status register to make sure - * the trigger has been armed to ddr_reset_n +/* + * @brief Check the bad flash block percentage against a given maximum allowed. * - * @param[in] i_nvdimm - nvdimm target with NV controller + * @details This returns a tristate - 1 pass, 2 different fails + * If true is returned, then the check passed and + * o_badFlashBlockPercentage will contain what the retrieved + * flash block percentage is. + * If false is returned and the o_badFlashBlockPercentage is zero, then + * the check failed because of a register read fail + * If false is returned and the o_badFlashBlockPercentage is not zero, + * then the check failed because the retrieved bad flash block + * percentage exceeds the given maximum allowed * - * @return errlHndl_t - Null if successful, otherwise a pointer to - * the error log. + * @param[in] i_nvDimm - The NVDIMM to check + * @param[in] i_maxPercentageAllowed - The maximum percentage of bad flash + * block allowed + * @param[out] o_badFlashBlockPercentage - The retrieved bad flash block + * percentage from i_nvDimm, if no + * register read error. + * + * @return false if check failed or register read failed, else true */ -errlHndl_t nvdimmCheckArmSuccess(TARGETING::Target *i_nvdimm) +bool nvDimmCheckBadFlashBlockPercentage(TargetHandle_t i_nvDimm, + const uint8_t i_maxPercentageAllowed, + uint8_t &o_badFlashBlockPercentage) { - TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckArmSuccess() nvdimm[%X]", - TARGETING::get_huid(i_nvdimm)); + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); - errlHndl_t l_err = nullptr; - uint8_t l_data = 0; + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "NVDIMM(0x%.4X), max bad flash blocks allowed(%d)", + l_nvDimmHuid, + i_maxPercentageAllowed); + + // The status of the check on the bad block percentage + bool l_didBadFlashBlockPercentageCheckPass(true); + + // The retrieved flash block percentage from register, initialize to zero + o_badFlashBlockPercentage = 0; + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // Retrieve the percentage of bad blocks and validate + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "Reading NVDIMM(0x%.8X) percentage of bad blocks from " + "register FLASH_BAD_BLK_PCT(0x%.4X)", + l_nvDimmHuid, FLASH_BAD_BLK_PCT); - l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data); + l_err = nvdimmReadReg(i_nvDimm, + FLASH_BAD_BLK_PCT, + o_badFlashBlockPercentage); if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]" - "failed to read arm status reg!",TARGETING::get_huid(i_nvdimm)); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "FAIL: NVDIMM(0x%.8X) failed to read the percentage of " + "bad blocks from register FLASH_BAD_BLK_PCT(0x%.4X), " + "marking as a fail", + l_nvDimmHuid, FLASH_BAD_BLK_PCT); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Set up the fail state, so caller can determine that the fail was + // due to a register read error + l_didBadFlashBlockPercentageCheckPass = false; + o_badFlashBlockPercentage = 0; } - else if ((l_data & ARM_SUCCESS) != ARM_SUCCESS) + else { + // Trace out the returned data for inspection + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "NVDIMM(0x%.8X) returned value (%d) from the " + "percentage of bad blocks, register " + "FLASH_BAD_BLK_PCT(0x%.4X)", + l_nvDimmHuid, + o_badFlashBlockPercentage, + FLASH_BAD_BLK_PCT); - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]" - "failed to arm!",TARGETING::get_huid(i_nvdimm)); - /*@ - *@errortype - *@reasoncode NVDIMM_ARM_FAILED - *@severity ERRORLOG_SEV_PREDICTIVE - *@moduleid NVDIMM_SET_ARM - *@userdata1[0:31] Related ops (0xff = NA) - *@userdata1[32:63] Target Huid - *@userdata2 <UNUSED> - *@devdesc Encountered error arming the catastrophic save - * trigger on NVDIMM. Make sure an energy source - * is connected to the NVDIMM and the ES policy - * is set properly - *@custdesc NVDIMM encountered error arming save trigger - */ - l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, - NVDIMM_SET_ARM, - NVDIMM_ARM_FAILED, - TWO_UINT32_TO_UINT64(ARM, TARGETING::get_huid(i_nvdimm)), - 0x0, - ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); - - l_err->collectTrace(NVDIMM_COMP_NAME, 256 ); - - // Failure to arm could mean internal NV controller error or - // even error on the battery pack. NVDIMM will lose persistency - // if failed to arm trigger - l_err->addPartCallout( i_nvdimm, - HWAS::NV_CONTROLLER_PART_TYPE, - HWAS::SRCI_PRIORITY_HIGH); - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_PART_TYPE, - HWAS::SRCI_PRIORITY_MED); - l_err->addPartCallout( i_nvdimm, - HWAS::BPM_CABLE_PART_TYPE, - HWAS::SRCI_PRIORITY_MED); - } + // Check to see if the bad flash block percentage + // exceeds maximum allowed. + if (o_badFlashBlockPercentage > i_maxPercentageAllowed) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "FAIL: For NVDIMM (0x%.8X), the percentage of bad " + "flash blocks (%d), read from register " + "FLASH_BAD_BLK_PCT(0x%.4X), exceeds the maximum " + "percentage of bad flash blocks allowed (%d), marking " + "this as a fail", + l_nvDimmHuid, + o_badFlashBlockPercentage, + FLASH_BAD_BLK_PCT, + i_maxPercentageAllowed); - TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckArmSuccess() nvdimm[%X] ret[%X]", - TARGETING::get_huid(i_nvdimm), l_data); + // Set up the fail state, so caller can determine that the fail was + // due to percentage exceeding the max percentage allowed. + // Note: Leave the value in o_badFlashBlockPercentage so caller + // can inspect, if they wish + l_didBadFlashBlockPercentageCheckPass = false; + } + else + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "SUCCESS: For NVDIMM (0x%.8X), the percentage of bad " + "flash blocks (%d) is less than or meets the maximum " + "percentage of bad flash blocks allowed (%d), " + "marking this as a pass", + l_nvDimmHuid, + o_badFlashBlockPercentage, + i_maxPercentageAllowed); - return l_err; + // Set up the pass state + // Note: Leave the value in o_badFlashBlockPercentage so caller + // can inspect, if they wish + l_didBadFlashBlockPercentageCheckPass = true; + } // end if (l_badFlashBlockPercentage > i_maxPercentageAllowed) + } // end if (l_err) ... else + + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "Returning %s", + l_didBadFlashBlockPercentageCheckPass == true ? "true" : "false" ); + + return l_didBadFlashBlockPercentageCheckPass; } -bool nvdimmArm(TARGETING::TargetHandleList &i_nvdimmTargetList) +/* + * @brief Check the flash error count against a given maximum allowed. + * + * @details This returns a tristate - 1 pass, 2 different fails + * If true is returned, then the check passed and + * o_readFlashErrorCount will contain what the retrieved + * flash error count is. + * If false is returned and the o_readFlashErrorCount is zero, then + * the check failed because of a register read fail + * If false is returned and the o_readFlashErrorCount is not zero, + * then the check failed because the retrieved flash error + * count exceeds the given maximum allowed + * + * @param[in] i_nvDimm - The NVDIMM to check + * @param[in] i_maxFlashErrorsAllowed - The maximum number of flash errors + * allowed + * @param[out] o_readFlashErrorCount - The retrieved bad flash error + * count from i_nvDimm, if no + * register read error. + * + * @return false if check failed or register read failed, else true + */ +bool nvDimmCheckFlashErrorCount(TargetHandle_t i_nvDimm, + const uint32_t i_maxFlashErrorsAllowed, + uint32_t &o_readFlashErrorCount) { - bool o_arm_successful = true; + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); - TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() %d", - i_nvdimmTargetList.size()); + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.4X), max flash errors allowed(%d)", + l_nvDimmHuid, + i_maxFlashErrorsAllowed); - errlHndl_t l_err = nullptr; + // The status of the check on the flash error count + bool l_didFlashErrorCountCheckPass(true); - for (auto const l_nvdimm : i_nvdimmTargetList) + // The retrieved flash error count from register, initialize to zero + o_readFlashErrorCount = 0; + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // The retrieved flash error count from a register + uint8_t l_readFlashErrorCountByte(0); + + // Read the flash error count registers starting from MSB to LSB + for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; + l_flashErrorRegister >= FLASH_ERROR_COUNT0; + --l_flashErrorRegister) { - // skip if the nvdimm is in error state - if (NVDIMM::nvdimmInErrorState(l_nvdimm)) - { - // error state means arming not successful - o_arm_successful = false; - continue; - } + // Reset this for every iteration, may be redundant + l_readFlashErrorCountByte = 0; + + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "Reading NVDIMM(0x%.8X) flash error count from " + "register FLASH_ERROR_COUNT(0x%.4X)", + l_nvDimmHuid, l_flashErrorRegister); + + l_err = nvdimmReadReg(i_nvDimm, + static_cast<i2cReg >(l_flashErrorRegister), + l_readFlashErrorCountByte); - l_err = nvdimmSetESPolicy(l_nvdimm); if (l_err) { - o_arm_successful = false; - nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP); + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " + "FAIL: NVDIMM(0x%.8X) failed to read flash error " + "count from register FLASH_ERROR_COUNT(0x%.4X) " + "marking as a fail", + l_nvDimmHuid, l_flashErrorRegister); - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME, 1024); - errlCommit( l_err, NVDIMM_COMP_ID ); - continue; + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Set up the fail state, so caller can determine that the fail was + // due to a register read error + l_didFlashErrorCountCheckPass = false; + o_readFlashErrorCount = 0; + + break; } - l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER); - // If we run into any error here we will just - // commit the error log and move on. Let the - // system continue to boot and let the user - // salvage the data - if (l_err) + // If we get here, then the read was successful + // Append the read flash error count byte to the LSB of the + // aggregated flash error count bytes. + o_readFlashErrorCount = (o_readFlashErrorCount << 8) | + l_readFlashErrorCountByte; + + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.8X) returned value (0x%.2X) from the " + "partial flash error count, register " + "FLASH_ERROR_COUNT(0x%.4X)", + l_nvDimmHuid, + l_readFlashErrorCountByte, + l_flashErrorRegister); + + } // end for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; ... + + // If o_readFlashErrorCount is not zero, then register read was successful + if (o_readFlashErrorCount) + { + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.8X) flash error count = %d ", + l_nvDimmHuid, o_readFlashErrorCount); + + // Check the validity of the flash error count + if (o_readFlashErrorCount > i_maxFlashErrorsAllowed) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME, 1024); - errlCommit( l_err, NVDIMM_COMP_ID ); - o_arm_successful = false; - continue; + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " + "FAIL: For NVDIMM (0x%.8X), the flash error count (%d), " + "read from registers FLASH_ERROR_COUNT0(0x%.4X), " + "FLASH_ERROR_COUNT1(0x%.4X) and FLASH_ERROR_COUNT2(0x%.4X), " + "exceeds the maximum number of flash " + "errors allowed (%d), marking this as a fail", + l_nvDimmHuid, + o_readFlashErrorCount, + FLASH_ERROR_COUNT0, + FLASH_ERROR_COUNT1, + FLASH_ERROR_COUNT2, + i_maxFlashErrorsAllowed); + + // Set up the fail state, so caller can determine that the fail was + // due to error count exceeding the max errors allowed. + // Note: Leave the value in o_readFlashErrorCount so caller + // can inspect, if they wish + l_didFlashErrorCountCheckPass = false; } + else + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "SUCCESS: For NVDIMM(0x%.8X), the flash error counts " + "(%d) is less than or meets the maximum number of " + "errors allowed (%d), marking this as a pass", + l_nvDimmHuid, + o_readFlashErrorCount, + i_maxFlashErrorsAllowed); - // Arm happens one module at a time. No need to set any offset on the counter - uint32_t l_poll = 0; - l_err = nvdimmPollArmDone(l_nvdimm, l_poll); - if (l_err) + // Set up the pass state + // Note: Leave the value in o_readFlashErrorCount so caller + // can inspect, if they wish + l_didFlashErrorCountCheckPass = true; + } + } // end if (o_readFlashErrorCount) + + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckFlashErrorCount(): " + "Returning %s", + l_didFlashErrorCountCheckPass == true ? "true" : "false" ); + + return l_didFlashErrorCountCheckPass; +} + +/* + * @brief Check the NVM (non-volatile memory)/flash health of the individual + * NVDIMMs supplied in list. + * + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of flash + * + * @return false if one or more NVDIMMs fail NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList) +{ + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatus(): " + "Target list size(%d)", i_nvDimmTargetList.size()); + + // The following maximums are the same values used by SMART's + // manufacturing and recommended that we use. + // The maximum percentage of bad flash blocks + // Fail if over 19% of bad flash blocks is encountered + const uint8_t MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED = 19; + // The maximum number of flash memory errors allowed + // Fail if over 300 flash memory errors is encountered + const uint32_t MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED = 300; + + // Status of the accumulation of all calls related to the NVM health check. + // If any one call is bad/fails, then this will be false, else it stays true + bool l_didNvmHealthCheckPass(true); + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // The retrieved flash block percentage from register + uint8_t l_badFlashBlockPercentage(0); + // The retrieved flash error count from register + uint32_t l_flashErrorCount(0); + + // The status of the checks on the percentage of bad blocks and + // flash error count + // Default to true + bool l_badFlashBlockPercentageCheckPassed(true); + bool l_flashErrorCountCheckPassed(true); + + // Iterate thru the supplied NVDIMMs checking the health of the NVM + for (auto const l_nvDimm : i_nvDimmTargetList) + { + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( l_nvDimm ); + + // Reset these for every NVDIMM that is checked + l_badFlashBlockPercentage = 0; + l_flashErrorCount = 0; + l_badFlashBlockPercentageCheckPassed = true; + l_flashErrorCountCheckPassed = true; + + // Check the validity of bad flash block percentage + if (!nvDimmCheckBadFlashBlockPercentage( + l_nvDimm, + MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED, + l_badFlashBlockPercentage)) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME, 1024); - errlCommit( l_err, NVDIMM_COMP_ID ); - o_arm_successful = false; - continue; + // Set this to false to indicate that the overall check on the + // NVDIMMs had at least one failure + l_didNvmHealthCheckPass = false; + + // If no data in the variable l_badFlashBlockPercentage, then + // this is a read register fail. Move onto the next NVDIMM + // this is a dud + if (!l_badFlashBlockPercentage) + { + continue; + } + + // Set the check to false, to facilitate error reporting + l_badFlashBlockPercentageCheckPassed = false; } - l_err = nvdimmCheckArmSuccess(l_nvdimm); - if (l_err) + // Check the validity of the flash error count + if (!nvDimmCheckFlashErrorCount( + l_nvDimm, + MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED, + l_flashErrorCount)) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME, 1024); - errlCommit( l_err, NVDIMM_COMP_ID ); - o_arm_successful = false; - continue; + // Set this to false to indicate that the overall check on the + // NVDIMMs had at least one failure + l_didNvmHealthCheckPass = false; + + // If no data in the variable l_flashErrorCount, then + // this is a read register fail. Move onto the next NVDIMM + // this is a dud + if (!l_flashErrorCount) + { + continue; + } + + // Set the check to false, to facilitate error reporting + l_flashErrorCountCheckPassed = false; } - // After arming the trigger, erase the image to prevent the possible - // stale image getting the restored on the next boot in case of failed - // save. - l_err = nvdimmEraseNF(l_nvdimm); - if (l_err) + /// Now we assess the health of the flash based on data gathered above + if ( !l_badFlashBlockPercentageCheckPassed || + !l_flashErrorCountCheckPassed ) { - NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP); - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME, 1024); - errlCommit( l_err, NVDIMM_COMP_ID ); - o_arm_successful = false; + // First set the NVDIMM HUID to the first 32 bits of user data 1 + uint64_t l_badFlashBlockPercentageUserData1 = + TWO_UINT32_TO_UINT64(l_nvDimmHuid, 0); - // If the erase failed let's disarm the trigger - l_err = nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER); - if (l_err) + // If an issue with the bad flash block percentage, then append + // data to user data 1 + if (!l_badFlashBlockPercentageCheckPassed && + l_badFlashBlockPercentage) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X], error disarming the nvdimm!", - TARGETING::get_huid(l_nvdimm)); - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME, 1024); - errlCommit(l_err, NVDIMM_COMP_ID); + // Setting the HUID here is redundant but easier than trying to + // do some clever code that will set the HUID for user data 1 + // when this path is not taken, but the next check on the flash + // error count is taken + l_badFlashBlockPercentageUserData1 = + TWO_UINT32_TO_UINT64(l_nvDimmHuid, + TWO_UINT16_TO_UINT32( + l_badFlashBlockPercentage, + MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED)); } - continue; + // If an issue with the flash error count, then set user + // data 2 to contain the flash error count value + uint64_t l_flashErrorCountUserData2(0); + if (!l_flashErrorCountCheckPassed && + l_flashErrorCount) + { + l_flashErrorCountUserData2 = + TWO_UINT32_TO_UINT64(l_flashErrorCount, + MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED); + } + + /*@ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_NVM_HEALTH_CHECK + * @reasoncode NVDIMM_NVM_HEALTH_CHECK_FAILED + * @userdata1[0:31] HUID of NVDIMM target + * @userdata1[32:47] The retrieved bad flash block percentage, + * if error with, else 0 + * @userdata1[48:63] The maximum percentage of bad flash blocks + * allowed, if bad flash block percentage + * exceeds this maximum, else 0 + * @userdata2[0:31] The retrieved flash error count, + * if error with, else 0 + * @userdata2[32:63] The maximum number of flash errors + * allowed, if flash error exceeds this + * maximum, else 0 + * @devdesc Either the NVDIMM NVM bad flash block + * percentage exceeded the maximum percentage + * allowed or the NVDIMM NVM number of flash + * error exceeds the maximum count allowed + * or both. + * @custdesc NVDIMM NVM health check failed. + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_NVM_HEALTH_CHECK, + NVDIMM_NVM_HEALTH_CHECK_FAILED, + l_badFlashBlockPercentageUserData1, + l_flashErrorCountUserData2, + ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace(NVDIMM_COMP_NAME); + nvdimmAddVendorLog(l_nvDimm, l_err); + + // Add a DIMM callout + l_err->addHwCallout( l_nvDimm, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL ); + + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didNvmHealthCheckPass = false; } - } + else + { + // This NVDIMM passed the NVM health check + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmNvmCheckHealthStatus(): " + "Success: NVDIMM (0x%.8X) passed the NVM health check.", + l_nvDimmHuid); + } // end if ( !l_badFlashBlockPercentageCheckPassed .. else + } // end for (auto const l_nvdimm : i_nvdimmTargetList) - TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArm() returning %d", - o_arm_successful); - return o_arm_successful; -} + // Should not have any uncommitted errors + assert(l_err == NULL, "nvDimmNvmCheckHealthStatus() - unexpected " + "uncommitted error found"); + + TRACFCOMP(g_trac_nvdimm,EXIT_MRK"nvDimmNvmCheckHealthStatus(): Returning %s", + l_didNvmHealthCheckPass == true ? "true" : "false" ); + + return l_didNvmHealthCheckPass; +} // end nvDimmNvmCheckHealthStatus /** - * @brief Check nvdimm error state + * @brief A wrapper around the call to nvDimmNvmCheckHealthStatus * - * @param[in] i_nvdimm - nvdimm target + * @see nvDimmNvmCheckHealthStatus for more details * - * @return bool - true if nvdimm is in any error state, false otherwise + * @return false if one or more NVDIMMs fail an NVM health check, else true */ -bool nvdimmInErrorState(TARGETING::Target *i_nvdimm) +bool nvDimmNvmCheckHealthStatusOnSystem() { - TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmInErrorState() HUID[%X]",TARGETING::get_huid(i_nvdimm)); + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatusOnSystem()"); - uint8_t l_statusFlag = i_nvdimm->getAttr<TARGETING::ATTR_NV_STATUS_FLAG>(); - bool l_ret = true; + // Get the list of NVDIMM Targets from the system + TargetHandleList l_nvDimmTargetList; + nvdimm_getNvdimmList(l_nvDimmTargetList); - if ((l_statusFlag & NSTD_ERR) == 0) - l_ret = false; + // Return status of doing a check health status + bool l_didNvmHealthCheckPass = nvDimmNvmCheckHealthStatus(l_nvDimmTargetList); - TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmInErrorState() HUID[%X]",TARGETING::get_huid(i_nvdimm)); - return l_ret; + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmNvmCheckHealthStatusOnSystem(): " + "Returning %s", l_didNvmHealthCheckPass == true ? "true" : "false" ); + + return l_didNvmHealthCheckPass; +} // end nvDimmCheckHealthStatusOnSystem + + +/** + * @brief Send NV_STATUS to host + */ +void nvdimmSendNvStatus() +{ + // Send NV_STATUS for all nvdimms + TargetHandleList l_nvdimmTargetList; + nvdimm_getNvdimmList(l_nvdimmTargetList); + for (const auto & l_nvdimm : l_nvdimmTargetList) + { + errlHndl_t l_err = nullptr; + l_err = notifyNvdimmProtectionChange(l_nvdimm,SEND_NV_STATUS); + if (l_err) + { + errlCommit(l_err, NVDIMM_COMP_ID); + } + } } + +struct registerNvdimmRt +{ + registerNvdimmRt() + { + // Register function to call at end of RT init + postInitCalls_t * rt_post = getPostInitCalls(); + rt_post->callSendNvStatus = &nvdimmSendNvStatus; + } +}; + +registerNvdimmRt g_registerNvdimmRt; + } // end NVDIMM namespace |