summaryrefslogtreecommitdiffstats
path: root/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
diff options
context:
space:
mode:
Diffstat (limited to 'src/usr/isteps/nvdimm/runtime/nvdimm_rt.C')
-rw-r--r--src/usr/isteps/nvdimm/runtime/nvdimm_rt.C1222
1 files changed, 904 insertions, 318 deletions
diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
index 267fab07c..e8ad1d9e9 100644
--- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
+++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
@@ -25,446 +25,1032 @@
/**
* @file nvdimm_rt.C
*
- * @brief NVDIMM functions only needed for runtime
+ * @brief NVDIMM functions only needed for runtime. These functions include,
+ * but are not limited to, arming/disarming the NVDIMM along with methods
+ * to poll the arming and check the status of the arming, checking the
+ * error state of the NVDIMM, getting a random number with the darn
+ * instruction, and checking the ES or NVM health status.
*/
+
+/// BPM - Backup Power Module
+
#include <trace/interface.H>
#include <errl/errlentry.H>
#include <errl/errlmanager.H>
+#include <errl/errludstring.H>
#include <util/runtime/rt_fwreq_helper.H>
#include <targeting/common/attributes.H>
#include <targeting/common/commontargeting.H>
#include <targeting/common/util.H>
#include <targeting/common/utilFilter.H>
-#include <usr/runtime/rt_targeting.H>
+#include <targeting/runtime/rt_targeting.H>
#include <runtime/interface.h>
+#include <arch/ppc.H>
#include <isteps/nvdimm/nvdimmreasoncodes.H>
+#include "../errlud_nvdimm.H"
+#include "../nvdimmErrorLog.H"
#include <isteps/nvdimm/nvdimm.H> // implements some of these
#include "../nvdimm.H" // for g_trac_nvdimm
+#include <sys/time.h>
//#define TRACUCOMP(args...) TRACFCOMP(args)
#define TRACUCOMP(args...)
+using namespace TARGETING;
+using namespace ERRORLOG;
+
namespace NVDIMM
{
+static constexpr uint64_t DARN_ERROR_CODE = 0xFFFFFFFFFFFFFFFFull;
+static constexpr uint32_t MAX_DARN_ERRORS = 10;
+
/**
-* @brief Notify PHYP of NVDIMM OCC protection status
-*/
-errlHndl_t notifyNvdimmProtectionChange(TARGETING::Target* i_target,
- const nvdimm_protection_t i_state)
+ * @brief Check nvdimm error state
+ *
+ * @param[in] i_nvdimm - nvdimm target
+ *
+ * @return bool - true if nvdimm is in any error state, false otherwise
+ */
+bool nvdimmInErrorState(Target *i_nvdimm)
{
- errlHndl_t l_err = nullptr;
+ TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmInErrorState() HUID[%X]",get_huid(i_nvdimm));
- // default to send a not protected status
- uint64_t l_nvdimm_protection_state =
- hostInterfaces::HBRT_FW_NVDIMM_NOT_PROTECTED;
+ uint8_t l_statusFlag = i_nvdimm->getAttr<ATTR_NV_STATUS_FLAG>();
+ bool l_ret = true;
- TRACFCOMP( g_trac_nvdimm, ENTER_MRK
- "notifyNvdimmProtectionChange: Target huid 0x%.8X, state %d",
- get_huid(i_target), i_state);
- do
+ // Just checking bit 1 for now, need to investigate these
+ // Should be checking NVDIMM_ARMED instead
+ if ((l_statusFlag & NSTD_VAL_ERASED) == 0)
{
- TARGETING::TargetHandleList l_nvdimmTargetList =
- TARGETING::getProcNVDIMMs(i_target);
+ l_ret = false;
+ }
- // Only send command if the processor has an NVDIMM under it
- if (l_nvdimmTargetList.empty())
+ // Also check the encryption error status
+ Target* l_sys = nullptr;
+ targetService().getTopLevelTarget( l_sys );
+ assert(l_sys, "nvdimmInErrorState: no TopLevelTarget");
+ if (l_sys->getAttr<ATTR_NVDIMM_ENCRYPTION_ENABLE>())
+ {
+ ATTR_NVDIMM_ARMED_type l_armed_state = {};
+ l_armed_state = i_nvdimm->getAttr<ATTR_NVDIMM_ARMED>();
+ if (l_armed_state.encryption_error_detected)
{
- TRACFCOMP( g_trac_nvdimm,
- "notifyNvdimmProtectionChange: No NVDIMM found under processor 0x%.8X",
- get_huid(i_target));
- break;
+ l_ret = true;
}
+ }
+
+ TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmInErrorState() HUID[%X]",get_huid(i_nvdimm));
+ return l_ret;
+}
+
- TARGETING::ATTR_NVDIMM_ARMED_type l_nvdimm_armed_state =
- i_target->getAttr<TARGETING::ATTR_NVDIMM_ARMED>();
+// This could be made a generic utility
+errlHndl_t nvdimm_getDarnNumber(size_t i_genSize, uint8_t* o_genData)
+{
+ assert(i_genSize % sizeof(uint64_t) == 0,"nvdimm_getDarnNumber() bad i_genSize");
- // Only notify protected state if NVDIMM controllers are
- // armed and no error was or is detected
- if (i_state == NVDIMM::PROTECTED)
+ errlHndl_t l_err = nullptr;
+ uint64_t* l_darnData = reinterpret_cast<uint64_t*>(o_genData);
+
+ for (uint32_t l_loop = 0; l_loop < (i_genSize / sizeof(uint64_t)); l_loop++)
+ {
+ // Darn could return an error code
+ uint32_t l_darnErrors = 0;
+
+ while (l_darnErrors < MAX_DARN_ERRORS)
{
- // Exit without notifying phyp if in error state
- if (l_nvdimm_armed_state.error_detected)
+ // Get a 64-bit random number with the darn instruction
+ l_darnData[l_loop] = getDarn();
+
+ if ( l_darnData[l_loop] != DARN_ERROR_CODE )
{
- // State can't go to protected after error is detected
break;
}
- // check if we need to rearm the NVDIMM(s)
- else if (!l_nvdimm_armed_state.armed)
- {
- bool nvdimms_armed =
- NVDIMM::nvdimmArm(l_nvdimmTargetList);
- if (nvdimms_armed)
- {
- // NVDIMMs are now armed and ready for backup
- l_nvdimm_armed_state.armed = 1;
- i_target->setAttr<TARGETING::ATTR_NVDIMM_ARMED>(l_nvdimm_armed_state);
-
- l_nvdimm_protection_state = hostInterfaces::HBRT_FW_NVDIMM_PROTECTED;
- }
- else
- {
- // If nvdimm arming failed,
- // do NOT post that the dimms are now protected.
-
- // Remember this error, only try arming once
- if (!l_nvdimm_armed_state.error_detected)
- {
- l_nvdimm_armed_state.error_detected = 1;
- i_target->setAttr<TARGETING::ATTR_NVDIMM_ARMED>(l_nvdimm_armed_state);
- }
-
- // Exit without notifying phyp of any protection change
- break;
- }
- }
else
{
- // NVDIMM already armed and no error found
- l_nvdimm_protection_state = hostInterfaces::HBRT_FW_NVDIMM_PROTECTED;
+ l_darnErrors++;
}
}
- else if (i_state == NVDIMM::UNPROTECTED_BECAUSE_ERROR)
+
+ if (l_darnErrors == MAX_DARN_ERRORS)
{
- // Remember that this NV controller has an error so
- // we don't rearm this until next IPL
- if (!l_nvdimm_armed_state.error_detected)
- {
- l_nvdimm_armed_state.error_detected = 1;
- i_target->setAttr<TARGETING::ATTR_NVDIMM_ARMED>(l_nvdimm_armed_state);
- }
- // still notify phyp that NVDIMM is Not Protected
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimm_getDarnNumber() reached MAX_DARN_ERRORS");
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_ENCRYPTION_MAX_DARN_ERRORS
+ *@severity ERRORLOG_SEV_PREDICTIVE
+ *@moduleid NVDIMM_GET_DARN_NUMBER
+ *@userdata1 MAX_DARN_ERRORS
+ *@devdesc Error using darn instruction
+ *@custdesc NVDIMM encryption error
+ */
+ l_err = new ERRORLOG::ErrlEntry(
+ ERRORLOG::ERRL_SEV_PREDICTIVE,
+ NVDIMM_GET_DARN_NUMBER,
+ NVDIMM_ENCRYPTION_MAX_DARN_ERRORS,
+ MAX_DARN_ERRORS,
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ break;
}
+ }
+ return l_err;
+}
- // Get the Proc Chip Id
- RT_TARG::rtChipId_t l_chipId = 0;
- l_err = RT_TARG::getRtTarget(i_target, l_chipId);
- if(l_err)
+errlHndl_t nvdimm_getRandom(uint8_t* o_genData)
+{
+ errlHndl_t l_err = nullptr;
+ uint8_t l_xtraData[ENC_KEY_SIZE] = {0};
+
+ do
+ {
+ // Get a random number with the darn instruction
+ l_err = nvdimm_getDarnNumber(ENC_KEY_SIZE, o_genData);
+ if (l_err)
{
- TRACFCOMP( g_trac_nvdimm,
- ERR_MRK"notifyNvdimmProtectionChange: getRtTarget ERROR" );
break;
}
- // send the notification msg
- if ((nullptr == g_hostInterfaces) ||
- (nullptr == g_hostInterfaces->firmware_request))
+ // Validate and update the random number
+ // Retry if more randomness required
+ do
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"notifyNvdimmProtectionChange: "
- "Hypervisor firmware_request interface not linked");
+ //Get replacement data
+ l_err = nvdimm_getDarnNumber(ENC_KEY_SIZE, l_xtraData);
+ if (l_err)
+ {
+ break;
+ }
+
+ }while (nvdimm_keyifyRandomNumber(o_genData, l_xtraData));
+
+ }while (0);
+
+ return l_err;
+}
+
+/*
+ * @brief Check the ES (energy source)/backup power module (BPM) health status of
+ * the individual NVDIMMs supplied in list
+ *
+ * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of
+ *
+ * @return false if one or more NVDIMMs fail ES health check, else true
+ */
+bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList)
+{
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatus(): "
+ "Target list size(%d)", i_nvdimmTargetList.size());
+
+ // The minimum ES lifetime value
+ const uint8_t ES_LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97%
+
+ // The ES health check status flags for the different states of an
+ // ES health check
+ const uint8_t ES_HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0
+ const uint8_t ES_HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1
+ const uint8_t ES_HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2
- // need to safely convert struct type into uint32_t
- union {
- TARGETING::ATTR_NVDIMM_ARMED_type tNvdimmArmed;
- uint32_t nvdimmArmed_int;
- } armed_state_union;
- armed_state_union.tNvdimmArmed = l_nvdimm_armed_state;
+ // Handle to catch any errors
+ errlHndl_t l_err(nullptr);
+
+ // The ES health check status from an ES health check call
+ uint8_t l_esHealthCheck(0);
+
+ // Status of the accumulation of all calls related to the ES health check.
+ // If any one call is bad/fails, then this will be false, else it stays true
+ bool l_didEsHealthCheckPass(true);
+
+ // Iterate thru the NVDIMMs checking the ES health status of each one.
+ // Going with the assumption that the caller waited the allotted time,
+ // roughly 20 to 30 minutes, after the start of an IPL.
+ // Success case:
+ // * ES health check initiated at start of the IPL, caller waited the
+ // allotted time (20 to 30 mins) before doing a health check, health
+ // check returned success and the lifetime meets the minimum threshold
+ // for a new BPM.
+ // Error cases are:
+ // * ES health check is in progress, will assume BPM is hung
+ // * ES health check failed
+ // * ES health check succeeded but lifetime does not meet a
+ // certain threshold
+ // * If none of the above apply (success case and other error cases),
+ // then assume the ES health check was never initiated at the start
+ // of the IPL
+ // For each of these error cases do a predictive callout
+ for (auto const l_nvdimm : i_nvdimmTargetList)
+ {
+ // Retrieve the Health Check status from the BPM
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): "
+ "Reading NVDIMM(0x%.8X) ES health check data, "
+ "register ES_CMD_STATUS0(0x%.2X)",
+ get_huid(l_nvdimm), ES_CMD_STATUS0);
+
+ l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_esHealthCheck);
+
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "NVDIMM(0x%X) failed to read the ES health check "
+ "data, register ES_CMD_STATUS0(0x%.2X)",
+ get_huid(l_nvdimm), ES_CMD_STATUS0);
+
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Let the caller know something went amiss
+ l_didEsHealthCheckPass = false;
+
+ // Proceed to next NVDIMM, better luck next time
+ continue;
+ }
+
+ // Trace out the returned data for inspection
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): "
+ "NVDIMM(0x%X) returned value(0x%.2X) from the ES health "
+ "check data, register ES_CMD_STATUS0(0x%.2X)",
+ get_huid(l_nvdimm), l_esHealthCheck, ES_CMD_STATUS0);
+
+ if (l_esHealthCheck & ES_HEALTH_CHECK_IN_PROGRESS_FLAG)
+ {
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "Assuming caller waited the allotted time before "
+ "doing an ES health check on NVDIMM(0x%.8X), the BPM "
+ "is hung doing the ES health check.",
+ get_huid(l_nvdimm) );
/*@
* @errortype
- * @severity ERRL_SEV_PREDICTIVE
- * @moduleid NOTIFY_NVDIMM_PROTECTION_CHG
- * @reasoncode NVDIMM_NULL_FIRMWARE_REQUEST_PTR
- * @userdata1 HUID of processor target
- * @userdata2[0:31] Requested protection state
- * @userdata2[32:63] Current armed state
- * @devdesc Unable to inform PHYP of NVDIMM protection
- * @custdesc Internal firmware error
+ * @severity ERRL_SEV_PREDICTIVE
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE
+ * @userdata1 HUID of NVDIMM target
+ * @userdata2 ES health check status
+ * @devdesc Assuming caller waited the allotted time before
+ * doing an ES health check, then the BPM is hung doing
+ * the ES health check.
+ * @custdesc NVDIMM ES health check failed.
*/
- l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
- NOTIFY_NVDIMM_PROTECTION_CHG,
- NVDIMM_NULL_FIRMWARE_REQUEST_PTR,
- get_huid(i_target),
- TWO_UINT32_TO_UINT64(
- l_nvdimm_protection_state,
- armed_state_union.nvdimmArmed_int)
- );
-
- l_err->addProcedureCallout(HWAS::EPUB_PRC_PHYP_CODE,
+ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE,
+ get_huid(l_nvdimm),
+ l_esHealthCheck,
+ ErrlEntry::NO_SW_CALLOUT );
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ nvdimmAddVendorLog(l_nvdimm, l_err);
+
+ // Add a BPM callout
+ l_err->addPartCallout( l_nvdimm,
+ HWAS::BPM_PART_TYPE,
+ HWAS::SRCI_PRIORITY_HIGH);
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+ // Collect the error
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Let the caller know something went amiss
+ l_didEsHealthCheckPass = false;
+ }
+ else if (l_esHealthCheck & ES_HEALTH_CHECK_FAILED_FLAG)
+ {
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "Assuming caller waited the allotted time before "
+ "doing an ES health check on NVDIMM(0x%.8X), the BPM "
+ "reported a failure.",
+ get_huid(l_nvdimm) );
+
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_PREDICTIVE
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE
+ * @userdata1 HUID of NVDIMM target
+ * @userdata2 ES health check status
+ * @devdesc Assuming caller waited the allotted time before
+ * doing an ES health check, the BPM reported a failure
+ * while doing an ES health check.
+ * @custdesc NVDIMM ES health check failed.
+ */
+ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE,
+ get_huid(l_nvdimm),
+ l_esHealthCheck,
+ ErrlEntry::NO_SW_CALLOUT );
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ nvdimmAddVendorLog(l_nvdimm, l_err);
+
+ // Add a BPM callout
+ l_err->addPartCallout( l_nvdimm,
+ HWAS::BPM_PART_TYPE,
+ HWAS::SRCI_PRIORITY_HIGH);
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+ // Collect the error
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Let the caller know something went amiss
+ l_didEsHealthCheckPass = false;
+ }
+ else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG)
+ {
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): "
+ "Reading NVDIMM(0x%.8X) ES lifetime data, "
+ "register ES_LIFETIME(0x%.2X)",
+ get_huid(l_nvdimm), ES_LIFETIME);
+
+ // The lifetime percentage
+ uint8_t l_lifetimePercentage(0);
+
+ // Retrieve the Lifetime Percentage from the BPM
+ l_err = nvdimmReadReg(l_nvdimm, ES_LIFETIME, l_lifetimePercentage);
+
+ if (l_err)
+ {
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "NVDIMM(0x%.8X) failed to read the "
+ "ES_LIFETIME(0x%.2X) data",
+ get_huid(l_nvdimm),
+ ES_LIFETIME );
+
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Let the caller know something went amiss
+ l_didEsHealthCheckPass = false;
+ }
+ else if (l_lifetimePercentage < ES_LIFETIME_MINIMUM_REQUIREMENT)
+ {
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "ES health check on NVDIMM(0x%.8X) succeeded but "
+ "the BPM's lifetime(%d) does not meet the minimum "
+ "requirement(%d) needed to qualify as a new BPM.",
+ get_huid(l_nvdimm),
+ l_lifetimePercentage,
+ ES_LIFETIME_MINIMUM_REQUIREMENT );
+
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_PREDICTIVE
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET
+ * @userdata1[00:31] HUID of NVDIMM target
+ * @userdata1[32:63] ES health check status
+ * @userdata2[00:31] Retrieved lifetime percentage
+ * @userdata2[32:63] lifetime minimum requirement
+ * @devdesc ES health check succeeded but the BPM's
+ * lifetime does not meet the minimum
+ * requirement needed to qualify as a
+ * new BPM.
+ * @custdesc NVDIMM ES health check failed
+ */
+ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET,
+ TWO_UINT32_TO_UINT64(
+ get_huid(l_nvdimm),
+ l_esHealthCheck),
+ TWO_UINT32_TO_UINT64(
+ l_lifetimePercentage,
+ ES_LIFETIME_MINIMUM_REQUIREMENT),
+ ErrlEntry::NO_SW_CALLOUT );
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ nvdimmAddVendorLog(l_nvdimm, l_err);
+
+ // Add a BPM callout
+ l_err->addPartCallout( l_nvdimm,
+ HWAS::BPM_PART_TYPE,
HWAS::SRCI_PRIORITY_HIGH);
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+ // Collect the error
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Let the caller know something went amiss
+ l_didEsHealthCheckPass = false;
+ } // end else if (l_lifetimePercentage ...
+ else
+ {
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "Success: ES health check on NVDIMM(0x%.8X) "
+ "succeeded and the BPM's lifetime(%d) meet's the "
+ "minimum requirement(%d) needed to qualify as "
+ "a new BPM.",
+ get_huid(l_nvdimm),
+ l_lifetimePercentage,
+ ES_LIFETIME_MINIMUM_REQUIREMENT );
+ }
+ } // end else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG)
+ else // Assume the ES health check was never initiated at
+ // the start of the IPL.
+ {
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "The ES health check on NVDIMM(0x%.8X) shows no status "
+ "(in progress, fail or succeed) so assuming it was "
+ "never initiated at the start of the IPL.",
+ get_huid(l_nvdimm) );
+
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_PREDICTIVE
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED
+ * @userdata1 HUID of NVDIMM target
+ * @userdata2 ES health check status
+ * @devdesc The ES health check shows no status (in progress,
+ * fail or succeed) so assuming it was never initiated
+ * at the start of the IPL.
+ * @custdesc NVDIMM ES health check failed.
+ */
+ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED,
+ get_huid(l_nvdimm),
+ l_esHealthCheck,
+ ErrlEntry::NO_SW_CALLOUT );
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ nvdimmAddVendorLog(l_nvdimm, l_err);
- break;
+ // Add a BPM callout
+ l_err->addPartCallout( l_nvdimm,
+ HWAS::BPM_PART_TYPE,
+ HWAS::SRCI_PRIORITY_HIGH);
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+ // Collect the error
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Let the caller know something went amiss
+ l_didEsHealthCheckPass = false;
}
+ } // end for (auto const l_nvdimm : i_nvdimmTargetList)
- TRACFCOMP( g_trac_nvdimm,
- "notifyNvdimmProtectionChange: 0x%.8X processor NVDIMMS are "
- "%s protected (current armed_state: 0x%02X)",
- get_huid(i_target),
- (l_nvdimm_protection_state == hostInterfaces::HBRT_FW_NVDIMM_PROTECTED)?"now":"NOT",
- l_nvdimm_armed_state );
-
- // Create the firmware_request request struct to send data
- hostInterfaces::hbrt_fw_msg l_req_fw_msg;
- memset(&l_req_fw_msg, 0, sizeof(l_req_fw_msg)); // clear it all
-
- // actual msg size (one type of hbrt_fw_msg)
- uint64_t l_req_fw_msg_size = hostInterfaces::HBRT_FW_MSG_BASE_SIZE +
- sizeof(l_req_fw_msg.nvdimm_protection_state);
-
- // Populate the firmware_request request struct with given data
- l_req_fw_msg.io_type =
- hostInterfaces::HBRT_FW_MSG_TYPE_NVDIMM_PROTECTION;
- l_req_fw_msg.nvdimm_protection_state.i_procId = l_chipId;
- l_req_fw_msg.nvdimm_protection_state.i_state =
- l_nvdimm_protection_state;
-
- // Create the firmware_request response struct to receive data
- hostInterfaces::hbrt_fw_msg l_resp_fw_msg;
- uint64_t l_resp_fw_msg_size = sizeof(l_resp_fw_msg);
- memset(&l_resp_fw_msg, 0, l_resp_fw_msg_size);
-
- // Make the firmware_request call
- l_err = firmware_request_helper(l_req_fw_msg_size,
- &l_req_fw_msg,
- &l_resp_fw_msg_size,
- &l_resp_fw_msg);
-
- } while (0);
-
- TRACFCOMP( g_trac_nvdimm,
- EXIT_MRK "notifyNvdimmProtectionChange(%.8X, %d) - ERRL %.8X:%.4X",
- get_huid(i_target), i_state,
- ERRL_GETEID_SAFE(l_err), ERRL_GETRC_SAFE(l_err) );
+ // Should not have any uncommitted errors
+ assert(l_err == NULL, "nvDimmEsCheckHealthStatus() - unexpected "
+ "uncommitted error found" );
- return l_err;
-}
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatus(): "
+ "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false");
+
+ return l_didEsHealthCheckPass;
+} // end nvDimmEsCheckHealthStatus
/**
- * @brief This function polls the command status register for arm completion
- * (does not indicate success or fail)
+ * @brief A wrapper around the call to nvDimmEsCheckHealthStatus
*
- * @param[in] i_nvdimm - nvdimm target with NV controller
+ * @see nvDimmEsCheckHealthStatus for more details
*
- * @param[out] o_poll - total polled time in ms
- *
- * @return errlHndl_t - Null if successful, otherwise a pointer to
- * the error log.
+ * @return false if one or more NVDIMMs fail an ES health check, else true
*/
-errlHndl_t nvdimmPollArmDone(TARGETING::Target* i_nvdimm,
- uint32_t &o_poll)
+bool nvDimmEsCheckHealthStatusOnSystem()
{
- TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmPollArmDone() nvdimm[%X]", TARGETING::get_huid(i_nvdimm) );
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatusOnSystem()");
- errlHndl_t l_err = nullptr;
+ // Get the list of NVDIMM Targets from the system
+ TargetHandleList l_nvDimmTargetList;
+ nvdimm_getNvdimmList(l_nvDimmTargetList);
- l_err = nvdimmPollStatus ( i_nvdimm, ARM, o_poll);
+ // Return status of doing a check health status
+ bool l_didEsHealthCheckPass = nvDimmEsCheckHealthStatus(l_nvDimmTargetList);
- TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollArmDone() nvdimm[%X]",
- TARGETING::get_huid(i_nvdimm));
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatusOnSystem(): "
+ "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false" );
- return l_err;
-}
+ return l_didEsHealthCheckPass;
+} // end nvDimmCheckHealthStatusOnSystem
-/**
- * @brief This function checks the arm status register to make sure
- * the trigger has been armed to ddr_reset_n
+/*
+ * @brief Check the bad flash block percentage against a given maximum allowed.
*
- * @param[in] i_nvdimm - nvdimm target with NV controller
+ * @details This returns a tristate - 1 pass, 2 different fails
+ * If true is returned, then the check passed and
+ * o_badFlashBlockPercentage will contain what the retrieved
+ * flash block percentage is.
+ * If false is returned and the o_badFlashBlockPercentage is zero, then
+ * the check failed because of a register read fail
+ * If false is returned and the o_badFlashBlockPercentage is not zero,
+ * then the check failed because the retrieved bad flash block
+ * percentage exceeds the given maximum allowed
*
- * @return errlHndl_t - Null if successful, otherwise a pointer to
- * the error log.
+ * @param[in] i_nvDimm - The NVDIMM to check
+ * @param[in] i_maxPercentageAllowed - The maximum percentage of bad flash
+ * block allowed
+ * @param[out] o_badFlashBlockPercentage - The retrieved bad flash block
+ * percentage from i_nvDimm, if no
+ * register read error.
+ *
+ * @return false if check failed or register read failed, else true
*/
-errlHndl_t nvdimmCheckArmSuccess(TARGETING::Target *i_nvdimm)
+bool nvDimmCheckBadFlashBlockPercentage(TargetHandle_t i_nvDimm,
+ const uint8_t i_maxPercentageAllowed,
+ uint8_t &o_badFlashBlockPercentage)
{
- TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckArmSuccess() nvdimm[%X]",
- TARGETING::get_huid(i_nvdimm));
+ // Cache the HUID of the NVDIMM
+ uint32_t l_nvDimmHuid = get_huid( i_nvDimm );
- errlHndl_t l_err = nullptr;
- uint8_t l_data = 0;
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "NVDIMM(0x%.4X), max bad flash blocks allowed(%d)",
+ l_nvDimmHuid,
+ i_maxPercentageAllowed);
+
+ // The status of the check on the bad block percentage
+ bool l_didBadFlashBlockPercentageCheckPass(true);
+
+ // The retrieved flash block percentage from register, initialize to zero
+ o_badFlashBlockPercentage = 0;
+
+ // Handle to catch any errors
+ errlHndl_t l_err(nullptr);
+
+ // Retrieve the percentage of bad blocks and validate
+ TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "Reading NVDIMM(0x%.8X) percentage of bad blocks from "
+ "register FLASH_BAD_BLK_PCT(0x%.4X)",
+ l_nvDimmHuid, FLASH_BAD_BLK_PCT);
- l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data);
+ l_err = nvdimmReadReg(i_nvDimm,
+ FLASH_BAD_BLK_PCT,
+ o_badFlashBlockPercentage);
if (l_err)
{
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]"
- "failed to read arm status reg!",TARGETING::get_huid(i_nvdimm));
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "FAIL: NVDIMM(0x%.8X) failed to read the percentage of "
+ "bad blocks from register FLASH_BAD_BLK_PCT(0x%.4X), "
+ "marking as a fail",
+ l_nvDimmHuid, FLASH_BAD_BLK_PCT);
+
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Set up the fail state, so caller can determine that the fail was
+ // due to a register read error
+ l_didBadFlashBlockPercentageCheckPass = false;
+ o_badFlashBlockPercentage = 0;
}
- else if ((l_data & ARM_SUCCESS) != ARM_SUCCESS)
+ else
{
+ // Trace out the returned data for inspection
+ TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "NVDIMM(0x%.8X) returned value (%d) from the "
+ "percentage of bad blocks, register "
+ "FLASH_BAD_BLK_PCT(0x%.4X)",
+ l_nvDimmHuid,
+ o_badFlashBlockPercentage,
+ FLASH_BAD_BLK_PCT);
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]"
- "failed to arm!",TARGETING::get_huid(i_nvdimm));
- /*@
- *@errortype
- *@reasoncode NVDIMM_ARM_FAILED
- *@severity ERRORLOG_SEV_PREDICTIVE
- *@moduleid NVDIMM_SET_ARM
- *@userdata1[0:31] Related ops (0xff = NA)
- *@userdata1[32:63] Target Huid
- *@userdata2 <UNUSED>
- *@devdesc Encountered error arming the catastrophic save
- * trigger on NVDIMM. Make sure an energy source
- * is connected to the NVDIMM and the ES policy
- * is set properly
- *@custdesc NVDIMM encountered error arming save trigger
- */
- l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
- NVDIMM_SET_ARM,
- NVDIMM_ARM_FAILED,
- TWO_UINT32_TO_UINT64(ARM, TARGETING::get_huid(i_nvdimm)),
- 0x0,
- ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
-
- l_err->collectTrace(NVDIMM_COMP_NAME, 256 );
-
- // Failure to arm could mean internal NV controller error or
- // even error on the battery pack. NVDIMM will lose persistency
- // if failed to arm trigger
- l_err->addPartCallout( i_nvdimm,
- HWAS::NV_CONTROLLER_PART_TYPE,
- HWAS::SRCI_PRIORITY_HIGH);
- l_err->addPartCallout( i_nvdimm,
- HWAS::BPM_PART_TYPE,
- HWAS::SRCI_PRIORITY_MED);
- l_err->addPartCallout( i_nvdimm,
- HWAS::BPM_CABLE_PART_TYPE,
- HWAS::SRCI_PRIORITY_MED);
- }
+ // Check to see if the bad flash block percentage
+ // exceeds maximum allowed.
+ if (o_badFlashBlockPercentage > i_maxPercentageAllowed)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "FAIL: For NVDIMM (0x%.8X), the percentage of bad "
+ "flash blocks (%d), read from register "
+ "FLASH_BAD_BLK_PCT(0x%.4X), exceeds the maximum "
+ "percentage of bad flash blocks allowed (%d), marking "
+ "this as a fail",
+ l_nvDimmHuid,
+ o_badFlashBlockPercentage,
+ FLASH_BAD_BLK_PCT,
+ i_maxPercentageAllowed);
- TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckArmSuccess() nvdimm[%X] ret[%X]",
- TARGETING::get_huid(i_nvdimm), l_data);
+ // Set up the fail state, so caller can determine that the fail was
+ // due to percentage exceeding the max percentage allowed.
+ // Note: Leave the value in o_badFlashBlockPercentage so caller
+ // can inspect, if they wish
+ l_didBadFlashBlockPercentageCheckPass = false;
+ }
+ else
+ {
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "SUCCESS: For NVDIMM (0x%.8X), the percentage of bad "
+ "flash blocks (%d) is less than or meets the maximum "
+ "percentage of bad flash blocks allowed (%d), "
+ "marking this as a pass",
+ l_nvDimmHuid,
+ o_badFlashBlockPercentage,
+ i_maxPercentageAllowed);
- return l_err;
+ // Set up the pass state
+ // Note: Leave the value in o_badFlashBlockPercentage so caller
+ // can inspect, if they wish
+ l_didBadFlashBlockPercentageCheckPass = true;
+ } // end if (l_badFlashBlockPercentage > i_maxPercentageAllowed)
+ } // end if (l_err) ... else
+
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+ "Returning %s",
+ l_didBadFlashBlockPercentageCheckPass == true ? "true" : "false" );
+
+ return l_didBadFlashBlockPercentageCheckPass;
}
-bool nvdimmArm(TARGETING::TargetHandleList &i_nvdimmTargetList)
+/*
+ * @brief Check the flash error count against a given maximum allowed.
+ *
+ * @details This returns a tristate - 1 pass, 2 different fails
+ * If true is returned, then the check passed and
+ * o_readFlashErrorCount will contain what the retrieved
+ * flash error count is.
+ * If false is returned and the o_readFlashErrorCount is zero, then
+ * the check failed because of a register read fail
+ * If false is returned and the o_readFlashErrorCount is not zero,
+ * then the check failed because the retrieved flash error
+ * count exceeds the given maximum allowed
+ *
+ * @param[in] i_nvDimm - The NVDIMM to check
+ * @param[in] i_maxFlashErrorsAllowed - The maximum number of flash errors
+ * allowed
+ * @param[out] o_readFlashErrorCount - The retrieved flash error
+ * count from i_nvDimm, if no
+ * register read error.
+ *
+ * @return false if check failed or register read failed, else true
+ */
+bool nvDimmCheckFlashErrorCount(TargetHandle_t i_nvDimm,
+ const uint32_t i_maxFlashErrorsAllowed,
+ uint32_t &o_readFlashErrorCount)
{
- bool o_arm_successful = true;
+ // Cache the HUID of the NVDIMM
+ uint32_t l_nvDimmHuid = get_huid( i_nvDimm );
- TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() %d",
- i_nvdimmTargetList.size());
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckFlashErrorCount(): "
+ "NVDIMM(0x%.4X), max flash errors allowed(%d)",
+ l_nvDimmHuid,
+ i_maxFlashErrorsAllowed);
- errlHndl_t l_err = nullptr;
+ // The status of the check on the flash error count
+ bool l_didFlashErrorCountCheckPass(true);
- for (auto const l_nvdimm : i_nvdimmTargetList)
+ // The retrieved flash error count from register, initialize to zero
+ o_readFlashErrorCount = 0;
+
+ // Handle to catch any errors
+ errlHndl_t l_err(nullptr);
+
+ // The retrieved flash error count from a register
+ uint8_t l_readFlashErrorCountByte(0);
+
+ // Read the flash error count registers starting from MSB to LSB
+ for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2;
+ l_flashErrorRegister >= FLASH_ERROR_COUNT0;
+ --l_flashErrorRegister)
{
- // skip if the nvdimm is in error state
- if (NVDIMM::nvdimmInErrorState(l_nvdimm))
- {
- // error state means arming not successful
- o_arm_successful = false;
- continue;
- }
+ // Reset this for every iteration, may be redundant
+ l_readFlashErrorCountByte = 0;
+
+ TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+ "Reading NVDIMM(0x%.8X) flash error count from "
+ "register FLASH_ERROR_COUNT(0x%.4X)",
+ l_nvDimmHuid, l_flashErrorRegister);
+
+ l_err = nvdimmReadReg(i_nvDimm,
+ static_cast<i2cReg >(l_flashErrorRegister),
+ l_readFlashErrorCountByte);
- l_err = nvdimmSetESPolicy(l_nvdimm);
if (l_err)
{
- o_arm_successful = false;
- nvdimmSetStatusFlag(l_nvdimm, NSTD_ERR_NOBKUP);
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): "
+ "FAIL: NVDIMM(0x%.8X) failed to read flash error "
+ "count from register FLASH_ERROR_COUNT(0x%.4X) "
+ "marking as a fail",
+ l_nvDimmHuid, l_flashErrorRegister);
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME, 1024);
- errlCommit( l_err, NVDIMM_COMP_ID );
- continue;
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit(l_err, NVDIMM_COMP_ID);
+
+ // Set up the fail state, so caller can determine that the fail was
+ // due to a register read error
+ l_didFlashErrorCountCheckPass = false;
+ o_readFlashErrorCount = 0;
+
+ break;
}
- l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER);
- // If we run into any error here we will just
- // commit the error log and move on. Let the
- // system continue to boot and let the user
- // salvage the data
- if (l_err)
+ // If we get here, then the read was successful
+ // Append the read flash error count byte to the LSB of the
+ // aggregated flash error count bytes.
+ o_readFlashErrorCount = (o_readFlashErrorCount << 8) |
+ l_readFlashErrorCountByte;
+
+ TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+ "NVDIMM(0x%.8X) returned value (0x%.2X) from the "
+ "partial flash error count, register "
+ "FLASH_ERROR_COUNT(0x%.4X)",
+ l_nvDimmHuid,
+ l_readFlashErrorCountByte,
+ l_flashErrorRegister);
+
+ } // end for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; ...
+
+ // If o_readFlashErrorCount is not zero, then register read was successful
+ if (o_readFlashErrorCount)
+ {
+ TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+ "NVDIMM(0x%.8X) flash error count = %d ",
+ l_nvDimmHuid, o_readFlashErrorCount);
+
+ // Check the validity of the flash error count
+ if (o_readFlashErrorCount > i_maxFlashErrorsAllowed)
{
- NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP);
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME, 1024);
- errlCommit( l_err, NVDIMM_COMP_ID );
- o_arm_successful = false;
- continue;
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): "
+ "FAIL: For NVDIMM (0x%.8X), the flash error count (%d), "
+ "read from registers FLASH_ERROR_COUNT0(0x%.4X), "
+ "FLASH_ERROR_COUNT1(0x%.4X) and FLASH_ERROR_COUNT2(0x%.4X), "
+ "exceeds the maximum number of flash "
+ "errors allowed (%d), marking this as a fail",
+ l_nvDimmHuid,
+ o_readFlashErrorCount,
+ FLASH_ERROR_COUNT0,
+ FLASH_ERROR_COUNT1,
+ FLASH_ERROR_COUNT2,
+ i_maxFlashErrorsAllowed);
+
+ // Set up the fail state, so caller can determine that the fail was
+ // due to error count exceeding the max errors allowed.
+ // Note: Leave the value in o_readFlashErrorCount so caller
+ // can inspect, if they wish
+ l_didFlashErrorCountCheckPass = false;
}
+ else
+ {
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+ "SUCCESS: For NVDIMM(0x%.8X), the flash error counts "
+ "(%d) is less than or meets the maximum number of "
+ "errors allowed (%d), marking this as a pass",
+ l_nvDimmHuid,
+ o_readFlashErrorCount,
+ i_maxFlashErrorsAllowed);
- // Arm happens one module at a time. No need to set any offset on the counter
- uint32_t l_poll = 0;
- l_err = nvdimmPollArmDone(l_nvdimm, l_poll);
- if (l_err)
+ // Set up the pass state
+ // Note: Leave the value in o_readFlashErrorCount so caller
+ // can inspect, if they wish
+ l_didFlashErrorCountCheckPass = true;
+ }
+ } // end if (o_readFlashErrorCount)
+
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckFlashErrorCount(): "
+ "Returning %s",
+ l_didFlashErrorCountCheckPass == true ? "true" : "false" );
+
+ return l_didFlashErrorCountCheckPass;
+}
+
+/**
+ * @brief Check the NVM (non-volatile memory)/flash health of the individual
+ *        NVDIMMs supplied in list.
+ *
+ * @param[in] i_nvDimmTargetList - list of NVDIMMs to check the health of flash
+ *
+ * @return false if one or more NVDIMMs fail NVM health check, else true
+ */
+bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList)
+{
+    TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatus(): "
+              "Target list size(%d)", i_nvDimmTargetList.size());
+
+    // The following maximums are the same values used by SMART's
+    // manufacturing and are the values recommended that we use.
+    // The maximum percentage of bad flash blocks
+    // Fail if more than 19% bad flash blocks are encountered
+    const uint8_t MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED = 19;
+    // The maximum number of flash memory errors allowed
+    // Fail if more than 300 flash memory errors are encountered
+    const uint32_t MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED = 300;
+
+    // Accumulated status of all calls related to the NVM health check.
+    // If any one call is bad/fails, then this will be false, else it stays true
+    bool l_didNvmHealthCheckPass(true);
+
+    // Handle to catch any errors
+    errlHndl_t l_err(nullptr);
+
+    // The retrieved bad flash block percentage from register
+    uint8_t l_badFlashBlockPercentage(0);
+    // The retrieved flash error count from register
+    uint32_t l_flashErrorCount(0);
+
+    // The status of the checks on the percentage of bad blocks and
+    // flash error count
+    // Default to true
+    bool l_badFlashBlockPercentageCheckPassed(true);
+    bool l_flashErrorCountCheckPassed(true);
+
+    // Iterate through the supplied NVDIMMs checking the health of the NVM
+    for (auto const l_nvDimm : i_nvDimmTargetList)
+    {
+        // Cache the HUID of the NVDIMM
+        uint32_t l_nvDimmHuid = get_huid( l_nvDimm );
+
+        // Reset these for every NVDIMM that is checked
+        l_badFlashBlockPercentage = 0;
+        l_flashErrorCount = 0;
+        l_badFlashBlockPercentageCheckPassed = true;
+        l_flashErrorCountCheckPassed = true;
+
+        // Check the validity of bad flash block percentage
+        if (!nvDimmCheckBadFlashBlockPercentage(
+                                 l_nvDimm,
+                                 MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED,
+                                 l_badFlashBlockPercentage))
        {
-            NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP);
-            // Committing the error as we don't want this to interrupt
-            // the boot. This will notify the user that action is needed
-            // on this module
-            l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
-            l_err->collectTrace(NVDIMM_COMP_NAME, 1024);
-            errlCommit( l_err, NVDIMM_COMP_ID );
-            o_arm_successful = false;
-            continue;
+            // Set this to false to indicate that the overall check on the
+            // NVDIMMs had at least one failure
+            l_didNvmHealthCheckPass = false;
+
+            // A zero l_badFlashBlockPercentage on a failed check is taken
+            // to mean the register read failed; nothing more can be
+            // assessed for this NVDIMM, so move on to the next one
+            if (!l_badFlashBlockPercentage)
+            {
+                continue;
+            }
+
+            // Set the check to false, to facilitate error reporting
+            l_badFlashBlockPercentageCheckPassed = false;
        }
-        l_err = nvdimmCheckArmSuccess(l_nvdimm);
-        if (l_err)
+        // Check the validity of the flash error count
+        if (!nvDimmCheckFlashErrorCount(
+                                 l_nvDimm,
+                                 MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED,
+                                 l_flashErrorCount))
        {
-            NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP);
-            // Committing the error as we don't want this to interrupt
-            // the boot. This will notify the user that action is needed
-            // on this module
-            l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
-            l_err->collectTrace(NVDIMM_COMP_NAME, 1024);
-            errlCommit( l_err, NVDIMM_COMP_ID );
-            o_arm_successful = false;
-            continue;
+            // Set this to false to indicate that the overall check on the
+            // NVDIMMs had at least one failure
+            l_didNvmHealthCheckPass = false;
+
+            // A zero l_flashErrorCount on failure means the register read
+            // failed. NOTE(review): this continue also skips the error log
+            // for a bad-block percentage failure recorded above - confirm
+            if (!l_flashErrorCount)
+            {
+                continue;
+            }
+
+            // Set the check to false, to facilitate error reporting
+            l_flashErrorCountCheckPassed = false;
        }
-        // After arming the trigger, erase the image to prevent the possible
-        // stale image getting the restored on the next boot in case of failed
-        // save.
-        l_err = nvdimmEraseNF(l_nvdimm);
-        if (l_err)
+        /// Now we assess the health of the flash based on data gathered above
+        if ( !l_badFlashBlockPercentageCheckPassed ||
+             !l_flashErrorCountCheckPassed )
            {
-            NVDIMM::nvdimmSetStatusFlag(l_nvdimm, NVDIMM::NSTD_ERR_NOBKUP);
-            // Committing the error as we don't want this to interrupt
-            // the boot. This will notify the user that action is needed
-            // on this module
-            l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
-            l_err->collectTrace(NVDIMM_COMP_NAME, 1024);
-            errlCommit( l_err, NVDIMM_COMP_ID );
-            o_arm_successful = false;
+            // First set the NVDIMM HUID to the first 32 bits of user data 1
+            uint64_t l_badFlashBlockPercentageUserData1 =
+                TWO_UINT32_TO_UINT64(l_nvDimmHuid, 0);
-            // If the erase failed let's disarm the trigger
-            l_err = nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER);
-            if (l_err)
+            // If an issue with the bad flash block percentage, then append
+            // data to user data 1
+            if (!l_badFlashBlockPercentageCheckPassed &&
+                 l_badFlashBlockPercentage)
                {
-                TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X], error disarming the nvdimm!",
-                          TARGETING::get_huid(l_nvdimm));
-                l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
-                l_err->collectTrace(NVDIMM_COMP_NAME, 1024);
-                errlCommit(l_err, NVDIMM_COMP_ID);
+                // Setting the HUID here is redundant but easier than trying to
+                // do some clever code that will set the HUID for user data 1
+                // when this path is not taken, but the next check on the flash
+                // error count is taken
+                l_badFlashBlockPercentageUserData1 =
+                    TWO_UINT32_TO_UINT64(l_nvDimmHuid,
+                        TWO_UINT16_TO_UINT32(
+                            l_badFlashBlockPercentage,
+                            MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED));
            }
-            continue;
+            // If an issue with the flash error count, then set user
+            // data 2 to contain the flash error count value
+            uint64_t l_flashErrorCountUserData2(0);
+            if (!l_flashErrorCountCheckPassed &&
+                 l_flashErrorCount)
+            {
+                l_flashErrorCountUserData2 =
+                     TWO_UINT32_TO_UINT64(l_flashErrorCount,
+                                 MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED);
+            }
+
+            /*@
+             * @errortype
+             * @severity          ERRL_SEV_PREDICTIVE
+             * @moduleid          NVDIMM_NVM_HEALTH_CHECK
+             * @reasoncode        NVDIMM_NVM_HEALTH_CHECK_FAILED
+             * @userdata1[0:31]   HUID of NVDIMM target
+             * @userdata1[32:47]  The retrieved bad flash block percentage,
+             *                    if error with, else 0
+             * @userdata1[48:63]  The maximum percentage of bad flash blocks
+             *                    allowed, if bad flash block percentage
+             *                    exceeds this maximum, else 0
+             * @userdata2[0:31]   The retrieved flash error count,
+             *                    if error with, else 0
+             * @userdata2[32:63]  The maximum number of flash errors
+             *                    allowed, if flash error exceeds this
+             *                    maximum, else 0
+             * @devdesc           Either the NVDIMM NVM bad flash block
+             *                    percentage exceeded the maximum percentage
+             *                    allowed or the NVDIMM NVM number of flash
+             *                    error exceeds the maximum count allowed
+             *                    or both.
+             * @custdesc          NVDIMM NVM health check failed.
+             */
+            l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
+                                   NVDIMM_NVM_HEALTH_CHECK,
+                                   NVDIMM_NVM_HEALTH_CHECK_FAILED,
+                                   l_badFlashBlockPercentageUserData1,
+                                   l_flashErrorCountUserData2,
+                                   ErrlEntry::NO_SW_CALLOUT );
+
+            l_err->collectTrace(NVDIMM_COMP_NAME);
+            nvdimmAddVendorLog(l_nvDimm, l_err);
+
+            // Add a DIMM callout
+            l_err->addHwCallout( l_nvDimm,
+                                 HWAS::SRCI_PRIORITY_HIGH,
+                                 HWAS::NO_DECONFIG,
+                                 HWAS::GARD_NULL );
+
+            // Commit the error log
+            errlCommit(l_err, NVDIMM_COMP_ID);
+
+            // Let the caller know something went amiss
+            l_didNvmHealthCheckPass = false;
        }
-    }
+        else
+        {
+            // This NVDIMM passed the NVM health check
+            TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmNvmCheckHealthStatus(): "
+                      "Success: NVDIMM (0x%.8X) passed the NVM health check.",
+                      l_nvDimmHuid);
+        } // end if ( !l_badFlashBlockPercentageCheckPassed .. else
+    } // end for (auto const l_nvDimm : i_nvDimmTargetList)
-    TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArm() returning %d",
-              o_arm_successful);
-    return o_arm_successful;
-}
+    // Should not have any uncommitted errors
+    assert(l_err == NULL, "nvDimmNvmCheckHealthStatus() - unexpected "
+                          "uncommitted error found");
+
+    TRACFCOMP(g_trac_nvdimm,EXIT_MRK"nvDimmNvmCheckHealthStatus(): Returning %s",
+              l_didNvmHealthCheckPass == true ? "true" : "false" );
+
+    return l_didNvmHealthCheckPass;
+} // end nvDimmNvmCheckHealthStatus
/**
- * @brief Check nvdimm error state
+ * @brief Wrapper that runs nvDimmNvmCheckHealthStatus on the system's NVDIMMs
 *
- * @param[in] i_nvdimm - nvdimm target
+ * @see nvDimmNvmCheckHealthStatus for more details
 *
- * @return bool - true if nvdimm is in any error state, false otherwise
+ * @return false if one or more NVDIMMs fail an NVM health check, else true
 */
-bool nvdimmInErrorState(TARGETING::Target *i_nvdimm)
+bool nvDimmNvmCheckHealthStatusOnSystem()
{
-    TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmInErrorState() HUID[%X]",TARGETING::get_huid(i_nvdimm));
+    TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatusOnSystem()");
-    uint8_t l_statusFlag = i_nvdimm->getAttr<TARGETING::ATTR_NV_STATUS_FLAG>();
-    bool l_ret = true;
+    // Get the list of NVDIMM Targets from the system
+    TargetHandleList l_nvDimmTargetList;
+    nvdimm_getNvdimmList(l_nvDimmTargetList);
-    if ((l_statusFlag & NSTD_ERR) == 0)
-        l_ret = false;
+    // Run the NVM health check on that list and keep its pass/fail result
+    bool l_didNvmHealthCheckPass = nvDimmNvmCheckHealthStatus(l_nvDimmTargetList);
-    TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmInErrorState() HUID[%X]",TARGETING::get_huid(i_nvdimm));
-    return l_ret;
+    TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmNvmCheckHealthStatusOnSystem(): "
+              "Returning %s", l_didNvmHealthCheckPass == true ? "true" : "false" );
+
+    return l_didNvmHealthCheckPass;
+} // end nvDimmNvmCheckHealthStatusOnSystem
+
+
+/**
+ * @brief Send NV_STATUS to the host for each NVDIMM, committing any error
+ */
+void nvdimmSendNvStatus()
+{
+    // Send NV_STATUS for every NVDIMM; a failure on one does not stop the rest
+    TargetHandleList l_nvdimmTargetList;
+    nvdimm_getNvdimmList(l_nvdimmTargetList);
+    for (const auto & l_nvdimm : l_nvdimmTargetList)
+    {
+        errlHndl_t l_err = nullptr;
+        l_err = notifyNvdimmProtectionChange(l_nvdimm,SEND_NV_STATUS);
+        if (l_err)
+        {
+            errlCommit(l_err, NVDIMM_COMP_ID);
+        }
+    }
}
+
+struct registerNvdimmRt
+{
+    registerNvdimmRt()
+    {
+        // Register nvdimmSendNvStatus to be called at the end of RT init
+        postInitCalls_t * rt_post = getPostInitCalls();
+        rt_post->callSendNvStatus = &nvdimmSendNvStatus;
+    }
+};
+
+registerNvdimmRt g_registerNvdimmRt; // its constructor does the registration
+
} // end NVDIMM namespace
OpenPOWER on IntegriCloud