summaryrefslogtreecommitdiffstats
path: root/src/usr/isteps
diff options
context:
space:
mode:
authorRoland Veloz <rveloz@us.ibm.com>2019-08-23 20:00:36 -0500
committerChristian R Geddes <crgeddes@us.ibm.com>2019-09-03 09:49:31 -0500
commitc3d8cfd066998862656706bb00d15ad986470fe5 (patch)
treef267927cbfb821659b4bd25107dcf1155a0b3ba2 /src/usr/isteps
parente22e362f3cd9b96b845f8c51acd6da97c78554a4 (diff)
downloadtalos-hostboot-c3d8cfd066998862656706bb00d15ad986470fe5.tar.gz
talos-hostboot-c3d8cfd066998862656706bb00d15ad986470fe5.zip
Added code to support doing an NVM health check
Added a method that will do an NVM (non-volatile memory) health check. In particular, this method checks the flash error count and does a predictive callout if the flash error count exceeds the maximum allowed. This method also checks the bad block percentage and does a predictive callout if the bad block percentage exceeds the maximum allowed. A predictive callout is done if either or both fail the check. Added support in the runtime commands to make the NVM health check call as well. Also, when I did the ES (energy source) health check method, I was not very explicit in the method that it was doing an ES health check. So I updated the verbiage in nvDimmCheckHealthStatus to add ES wherever appropriate, so as to make these two methods explicit as to what health check is being performed. Change-Id: Ib9925fd2bb8430cf2121108329247d96072beb1b CQ: 473220 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/82843 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: Glenn Miles <milesg@ibm.com> Reviewed-by: Zachary Clark <zach@ibm.com> Reviewed-by: Christian R Geddes <crgeddes@us.ibm.com>
Diffstat (limited to 'src/usr/isteps')
-rw-r--r--src/usr/isteps/nvdimm/nvdimm.H5
-rw-r--r--src/usr/isteps/nvdimm/runtime/nvdimm_rt.C708
2 files changed, 596 insertions, 117 deletions
diff --git a/src/usr/isteps/nvdimm/nvdimm.H b/src/usr/isteps/nvdimm/nvdimm.H
index a99f1180a..d2d2985b6 100644
--- a/src/usr/isteps/nvdimm/nvdimm.H
+++ b/src/usr/isteps/nvdimm/nvdimm.H
@@ -275,6 +275,11 @@ enum i2cReg : uint16_t
TYPED_BLOCK_DATA_BYTE30 = 0x39E,
TYPED_BLOCK_DATA_BYTE31 = 0x39F,
TYPED_BLOCK_DATA_OFFSET = 0x3E0,
+ FLASH_BAD_BLK_PCT = 0x41D, // Read only; Percentage of flash blocks
+ // in the flash array marked as bad blocks
+ FLASH_ERROR_COUNT0 = 0x428, // Read only; LSB[7:0] Flash error count
+ FLASH_ERROR_COUNT1 = 0x429, // Read only; [15:8]
+ FLASH_ERROR_COUNT2 = 0x42A, // Read only; MSB[23:16]
BPM_MAGIC_REG1 = 0x430,
BPM_MAGIC_REG2 = 0x431,
SCAP_STATUS = 0x432,
diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
index d615aa546..b38dd394d 100644
--- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
+++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
@@ -25,7 +25,11 @@
/**
* @file nvdimm_rt.C
*
- * @brief NVDIMM functions only needed for runtime
+ * @brief NVDIMM functions only needed for runtime. These functions include
+ * but are not limited to arming/disarming the NVDIMM along with methods
+ * to poll the arming and check the status of the arming. Checking the
+ * error state of the NVDIMM, getting a random number with the darn
+ * instruction and checking the ES or NVM health status.
*/
/// BPM - Backup Power Module
@@ -734,65 +738,68 @@ errlHndl_t nvdimm_getRandom(uint8_t* o_genData)
}
/*
- * @brief Check the health status of the individual NVDIMMs supplied in list
+ * @brief Check the ES (energy source)/backup power module(BPM) health status of
+ * the individual NVDIMMs supplied in list
*
- * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of
+ * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of
*
- * @return false if one or more NVDIMMs fail health check, else true
+ * @return false if one or more NVDIMMs fail ES health check, else true
*/
-bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
+bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList)
{
- TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckHealthStatus(): "
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatus(): "
"Target list size(%d)", i_nvdimmTargetList.size());
- // The minimum lifetime value
- const uint8_t LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97%
+ // The minimum ES lifetime value
+ const uint8_t ES_LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97%
- // The health check status flags for the different states of a health check
- const uint8_t HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0
- const uint8_t HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1
- const uint8_t HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2
+ // The ES health check status flags for the different states of an
+ // ES health check
+ const uint8_t ES_HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0
+ const uint8_t ES_HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1
+ const uint8_t ES_HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2
// Handle to catch any errors
errlHndl_t l_err(nullptr);
- // The health check status from a health check call
- uint8_t l_healthCheck(0);
+ // The ES health check status from an ES health check call
+ uint8_t l_esHealthCheck(0);
- // Status of the accumulation of all calls related to the health check.
+ // Status of the accumulation of all calls related to the ES health check.
// If any one call is bad/fails, then this will be false, else it stays true
- bool l_didHealthCheckPass(true);
+ bool l_didEsHealthCheckPass(true);
- // Iterate thru the NVDIMMs checking the health status of each one.
+ // Iterate thru the NVDIMMs checking the ES health status of each one.
// Going with the assumption that the caller waited the allotted time,
// roughly 20 to 30 minutes, after the start of an IPL.
// Success case:
- // * Health check initiated at start of the IPL, caller waited the
+ // * ES health check initiated at start of the IPL, caller waited the
// allotted time (20 to 30 mins) before doing a health check, health
// check returned success and the lifetime meets the minimum threshold
// for a new BPM.
// Error cases are:
- // * Health check is in progress, will assume BPM is hung
- // * Health check failed
- // * Health check succeeded but lifetime does not meet a certain threshold
+ // * ES health check is in progress, will assume BPM is hung
+ // * ES health check failed
+ // * ES health check succeeded but lifetime does not meet a
+ // certain threshold
// * If none of the above apply (success case and other error cases),
- // then assume the health check was never initiated at the start of the
- // IPL
+ // then assume the ES health check was never initiated at the start
+ // of the IPL
// For each of these error cases do a predictive callout
for (auto const l_nvdimm : i_nvdimmTargetList)
{
// Retrieve the Health Check status from the BPM
- TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): "
- "Reading NVDIMM(0x%.8X) health check data, "
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): "
+ "Reading NVDIMM(0x%.8X) ES health check data, "
"register ES_CMD_STATUS0(0x%.2X)",
get_huid(l_nvdimm), ES_CMD_STATUS0);
- l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_healthCheck);
+ l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_esHealthCheck);
if (l_err)
{
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): "
- "NVDIMM(0x%X) failed to read the health check "
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "NVDIMM(0x%X) failed to read the ES health check "
"data, register ES_CMD_STATUS0(0x%.2X)",
get_huid(l_nvdimm), ES_CMD_STATUS0);
@@ -801,43 +808,43 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
errlCommit(l_err, NVDIMM_COMP_ID);
// Let the caller know something went amiss
- l_didHealthCheckPass = false;
+ l_didEsHealthCheckPass = false;
// Proceed to next NVDIMM, better luck next time
continue;
}
// Trace out the returned data for inspection
- TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): "
- "NVDIMM(0x%X) returned value(0x%.2X) from health check "
- "data, register ES_CMD_STATUS0(0x%.2X)",
- get_huid(l_nvdimm), l_healthCheck, ES_CMD_STATUS0)
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): "
+ "NVDIMM(0x%X) returned value(0x%.2X) from the ES health "
+ "check data, register ES_CMD_STATUS0(0x%.2X)",
+ get_huid(l_nvdimm), l_esHealthCheck, ES_CMD_STATUS0);
- if (l_healthCheck & HEALTH_CHECK_IN_PROGRESS_FLAG)
+ if (l_esHealthCheck & ES_HEALTH_CHECK_IN_PROGRESS_FLAG)
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): "
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
"Assuming caller waited the allotted time before "
- "doing a health check on NVDIMM(0x%.8X), the BPM "
- "is hung doing the health check.",
+ "doing an ES health check on NVDIMM(0x%.8X), the BPM "
+ "is hung doing the ES health check.",
get_huid(l_nvdimm) );
/*@
* @errortype
* @severity ERRL_SEV_PREDICTIVE
- * @moduleid NVDIMM_HEALTH_CHECK
- * @reasoncode NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE
* @userdata1 HUID of NVDIMM target
- * @userdata2 Health check status
+ * @userdata2 ES health check status
* @devdesc Assuming caller waited the allotted time before
- * doing a health check, then the BPM is hung doing
- * the health check.
- * @custdesc NVDIMM Health Check failed.
+ * doing an ES health check, then the BPM is hung doing
+ * the ES health check.
+ * @custdesc NVDIMM ES health check failed.
*/
l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
- NVDIMM_HEALTH_CHECK,
- NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE,
get_huid(l_nvdimm),
- l_healthCheck,
+ l_esHealthCheck,
ErrlEntry::NO_SW_CALLOUT );
l_err->collectTrace(NVDIMM_COMP_NAME);
@@ -849,34 +856,33 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
errlCommit(l_err, NVDIMM_COMP_ID);
// Let the caller know something went amiss
- l_didHealthCheckPass = false;
+ l_didEsHealthCheckPass = false;
}
- else if (l_healthCheck & HEALTH_CHECK_FAILED_FLAG)
+ else if (l_esHealthCheck & ES_HEALTH_CHECK_FAILED_FLAG)
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): "
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
"Assuming caller waited the allotted time before "
- "doing a health check on NVDIMM(0x%.8X), the BPM "
+ "doing an ES health check on NVDIMM(0x%.8X), the BPM "
"reported a failure.",
get_huid(l_nvdimm) );
/*@
* @errortype
* @severity ERRL_SEV_PREDICTIVE
- * @moduleid NVDIMM_HEALTH_CHECK
- * @reasoncode NVDIMM_HEALTH_CHECK_REPORTED_FAILURE
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE
* @userdata1 HUID of NVDIMM target
- * @userdata2 Health check status
- * @devdesc NVDIMM Health Check failed
+ * @userdata2 ES health check status
* @devdesc Assuming caller waited the allotted time before
- * doing a health check, the BPM reported a failure
- * while doing a health check.
- * @custdesc NVDIMM Health Check failed.
+ * doing an ES health check, the BPM reported a failure
+ * while doing an ES health check.
+ * @custdesc NVDIMM ES health check failed.
*/
l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
- NVDIMM_HEALTH_CHECK,
- NVDIMM_HEALTH_CHECK_REPORTED_FAILURE,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE,
get_huid(l_nvdimm),
- l_healthCheck,
+ l_esHealthCheck,
ErrlEntry::NO_SW_CALLOUT );
l_err->collectTrace(NVDIMM_COMP_NAME);
@@ -888,12 +894,12 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
errlCommit(l_err, NVDIMM_COMP_ID);
// Let the caller know something went amiss
- l_didHealthCheckPass = false;
+ l_didEsHealthCheckPass = false;
}
- else if (l_healthCheck & HEALTH_CHECK_SUCCEEDED_FLAG)
+ else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG)
{
- TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): "
- "Reading NVDIMM(0x%.8X) es lifetime data, "
+ TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): "
+ "Reading NVDIMM(0x%.8X) ES lifetime data, "
"register ES_LIFETIME(0x%.2X)",
get_huid(l_nvdimm), ES_LIFETIME);
@@ -905,7 +911,7 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
if (l_err)
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): "
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
"NVDIMM(0x%.8X) failed to read the "
"ES_LIFETIME(0x%.2X) data",
get_huid(l_nvdimm),
@@ -916,42 +922,42 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
errlCommit(l_err, NVDIMM_COMP_ID);
// Let the caller know something went amiss
- l_didHealthCheckPass = false;
+ l_didEsHealthCheckPass = false;
}
- else if (l_lifetimePercentage < LIFETIME_MINIMUM_REQUIREMENT)
+ else if (l_lifetimePercentage < ES_LIFETIME_MINIMUM_REQUIREMENT)
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): "
- "Health check on NVDIMM(0x%.8X) succeeded but the "
- "BPM's lifetime(%d) does not meet the minimum "
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "ES health check on NVDIMM(0x%.8X) succeeded but "
+ "the BPM's lifetime(%d) does not meet the minimum "
"requirement(%d) needed to qualify as a new BPM.",
get_huid(l_nvdimm),
l_lifetimePercentage,
- LIFETIME_MINIMUM_REQUIREMENT );
+ ES_LIFETIME_MINIMUM_REQUIREMENT );
/*@
* @errortype
* @severity ERRL_SEV_PREDICTIVE
- * @moduleid NVDIMM_HEALTH_CHECK
- * @reasoncode NVDIMM_LIFETIME_MIN_REQ_NOT_MET
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET
* @userdata1[00:31] HUID of NVDIMM target
- * @userdata1[32:63] Health check status
+ * @userdata1[32:63] ES health check status
* @userdata2[00:31] Retrieved lifetime percentage
* @userdata2[32:63] lifetime minimum requirement
- * @devdesc Health check succeeded but the BPM's
+ * @devdesc ES health check succeeded but the BPM's
* lifetime does not meet the minimum
* requirement needed to qualify as a
* new BPM.
- * @custdesc NVDIMM Health Check failed
+ * @custdesc NVDIMM ES health check failed
*/
l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
- NVDIMM_HEALTH_CHECK,
- NVDIMM_LIFETIME_MIN_REQ_NOT_MET,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET,
TWO_UINT32_TO_UINT64(
get_huid(l_nvdimm),
- l_healthCheck),
+ l_esHealthCheck),
TWO_UINT32_TO_UINT64(
l_lifetimePercentage,
- LIFETIME_MINIMUM_REQUIREMENT),
+ ES_LIFETIME_MINIMUM_REQUIREMENT),
ErrlEntry::NO_SW_CALLOUT );
l_err->collectTrace(NVDIMM_COMP_NAME);
@@ -963,45 +969,46 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
errlCommit(l_err, NVDIMM_COMP_ID);
// Let the caller know something went amiss
- l_didHealthCheckPass = false;
+ l_didEsHealthCheckPass = false;
} // end else if (l_lifetimePercentage ...
else
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvdimmCheckHealthStatus(): "
- "Success: Health check on NVDIMM(0x%.8X) succeeded "
- "and the BPM's lifetime(%d) meet's the minimum "
- "requirement(%d) needed to qualify as a new BPM.",
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "Success: ES health check on NVDIMM(0x%.8X) "
+ "succeeded and the BPM's lifetime(%d) meet's the "
+ "minimum requirement(%d) needed to qualify as "
+ "a new BPM.",
get_huid(l_nvdimm),
l_lifetimePercentage,
- LIFETIME_MINIMUM_REQUIREMENT );
+ ES_LIFETIME_MINIMUM_REQUIREMENT );
}
- } // end else if (l_healthCheck & HEALTH_CHECK_SUCCEEDED_FLAG)
- else // Assume the health check was never initiated at
+ } // end else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG)
+ else // Assume the ES health check was never initiated at
// the start of the IPL.
{
- TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): "
- "The health check on NVDIMM(0x%.8X) shows no status (in "
- "progress, fail or succeed) so assuming it was never "
- "initiated at the start of the IPL.",
+ TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): "
+ "The ES health check on NVDIMM(0x%.8X) shows no status "
+ "(in progress, fail or succeed) so assuming it was "
+ "never initiated at the start of the IPL.",
get_huid(l_nvdimm) );
/*@
* @errortype
* @severity ERRL_SEV_PREDICTIVE
- * @moduleid NVDIMM_HEALTH_CHECK
- * @reasoncode NVDIMM_HEALTH_CHECK_NEVER_INITIATED
+ * @moduleid NVDIMM_ES_HEALTH_CHECK
+ * @reasoncode NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED
* @userdata1 HUID of NVDIMM target
- * @userdata2 Health check status
- * @devdesc The health check shows no status (in progress, fail
- * or succeed) so assuming it was never initiated
+ * @userdata2 ES health check status
+ * @devdesc The ES health check shows no status (in progress,
+ * fail or succeed) so assuming it was never initiated
* at the start of the IPL.
- * @custdesc NVDIMM Health Check failed.
+ * @custdesc NVDIMM ES health check failed.
*/
l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
- NVDIMM_HEALTH_CHECK,
- NVDIMM_HEALTH_CHECK_NEVER_INITIATED,
+ NVDIMM_ES_HEALTH_CHECK,
+ NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED,
get_huid(l_nvdimm),
- l_healthCheck,
+ l_esHealthCheck,
ErrlEntry::NO_SW_CALLOUT );
l_err->collectTrace(NVDIMM_COMP_NAME);
@@ -1013,42 +1020,509 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList)
errlCommit(l_err, NVDIMM_COMP_ID);
// Let the caller know something went amiss
- l_didHealthCheckPass = false;
+ l_didEsHealthCheckPass = false;
}
} // end for (auto const l_nvdimm : i_nvdimmTargetList)
// Should not have any uncommitted errors
- assert(l_err == NULL, "nvDimmCheckHealthStatus() - unexpected uncommitted"
- "error found" );
+ assert(l_err == NULL, "nvDimmEsCheckHealthStatus() - unexpected "
+ "uncommitted error found" );
- TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckHealthStatus(): "
- "Returning %s", l_didHealthCheckPass == true ? "true" : "false" );
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatus(): "
+ "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false");
- return l_didHealthCheckPass;
-} // end nvDimmCheckHealthStatus
+ return l_didEsHealthCheckPass;
+} // end nvDimmEsCheckHealthStatus
/**
- * @brief A wrapper around the call to nvDimmCheckHealthStatus
+ * @brief A wrapper around the call to nvDimmEsCheckHealthStatus
*
- * @see nvDimmCheckHealthStatus for more details
+ * @see nvDimmEsCheckHealthStatus for more details
*
- * @return false if one or more NVDIMMs fail health check, else true
+ * @return false if one or more NVDIMMs fail an ES health check, else true
*/
-bool nvDimmCheckHealthStatusOnSystem()
+bool nvDimmEsCheckHealthStatusOnSystem()
{
- TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckHealthStatusOnSystem()");
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatusOnSystem()");
// Get the list of NVDIMM Targets from the system
TargetHandleList l_nvDimmTargetList;
nvdimm_getNvdimmList(l_nvDimmTargetList);
// Return status of doing a check health status
- bool l_didHealthCheckPass = nvDimmCheckHealthStatus(l_nvDimmTargetList);
+ bool l_didEsHealthCheckPass = nvDimmEsCheckHealthStatus(l_nvDimmTargetList);
- TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckHealthStatusOnSystem(): "
- "Returning %s", l_didHealthCheckPass == true ? "true" : "false" );
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatusOnSystem(): "
+ "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false" );
- return l_didHealthCheckPass;
+ return l_didEsHealthCheckPass;
} // end nvDimmCheckHealthStatusOnSystem
+/*
+ * @brief Compare an NVDIMM's bad flash block percentage with a supplied limit.
+ *
+ * @details The boolean return and the output parameter together encode three
+ *          outcomes:
+ *          - true: the register was read and the percentage does not exceed
+ *            i_maxPercentageAllowed; o_badFlashBlockPercentage holds the
+ *            value that was read.
+ *          - false with o_badFlashBlockPercentage == 0: the register read
+ *            failed; the read error is committed as predictive.
+ *          - false with o_badFlashBlockPercentage != 0: the percentage that
+ *            was read exceeds i_maxPercentageAllowed.
+ *
+ * @param[in]  i_nvDimm - The NVDIMM to check
+ * @param[in]  i_maxPercentageAllowed - The maximum percentage of bad flash
+ *                                      blocks allowed
+ * @param[out] o_badFlashBlockPercentage - The bad flash block percentage
+ *                                         read from i_nvDimm; zero when the
+ *                                         register read failed.
+ *
+ * @return true if the percentage was read and is within the limit, else false
+ */
+bool nvDimmCheckBadFlashBlockPercentage(TargetHandle_t i_nvDimm,
+                                        const uint8_t i_maxPercentageAllowed,
+                                        uint8_t &o_badFlashBlockPercentage)
+{
+    // Begin with a known value in the output
+    o_badFlashBlockPercentage = 0;
+
+    // HUID of the NVDIMM, used in every trace below
+    const uint32_t l_huid = get_huid( i_nvDimm );
+
+    TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+              "Reading NVDIMM(0x%.8X) percentage of bad blocks from "
+              "register FLASH_BAD_BLK_PCT(0x%.4X)",
+              l_huid, FLASH_BAD_BLK_PCT);
+
+    errlHndl_t l_readErr = nvdimmReadReg(i_nvDimm,
+                                         FLASH_BAD_BLK_PCT,
+                                         o_badFlashBlockPercentage);
+
+    // Register read failure: log it and report the "read failed" state
+    if (l_readErr)
+    {
+        TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+                  "FAIL: NVDIMM(0x%.8X) failed to read the percentage of "
+                  "bad blocks from register FLASH_BAD_BLK_PCT(0x%.4X), "
+                  "marking as a fail",
+                  l_huid, FLASH_BAD_BLK_PCT);
+
+        l_readErr->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+        l_readErr->collectTrace(NVDIMM_COMP_NAME);
+        errlCommit(l_readErr, NVDIMM_COMP_ID);
+
+        // A zeroed output paired with a false return tells the caller that
+        // the failure was a register read error
+        o_badFlashBlockPercentage = 0;
+        return false;
+    }
+
+    // The read worked; trace the value for inspection
+    TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+              "NVDIMM(0x%.8X) returned value (%d) from the "
+              "percentage of bad blocks, register "
+              "FLASH_BAD_BLK_PCT(0x%.4X)",
+              l_huid,
+              o_badFlashBlockPercentage,
+              FLASH_BAD_BLK_PCT);
+
+    // The value read stays in o_badFlashBlockPercentage in both outcomes
+    // below so the caller can inspect it
+    const bool l_isWithinLimit =
+        !(o_badFlashBlockPercentage > i_maxPercentageAllowed);
+
+    if (l_isWithinLimit)
+    {
+        TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+                  "SUCCESS: For NVDIMM (0x%.8X), the percentage of bad "
+                  "flash blocks (%d) is less than or meets the maximum "
+                  "percentage of bad flash blocks allowed (%d), "
+                  "marking this as a pass",
+                  l_huid,
+                  o_badFlashBlockPercentage,
+                  i_maxPercentageAllowed);
+    }
+    else
+    {
+        TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): "
+                  "FAIL: For NVDIMM (0x%.8X), the percentage of bad "
+                  "flash blocks (%d) exceeds the maximum percentage "
+                  "of bad flash blocks allowed (%d), marking this "
+                  "as a fail",
+                  l_huid,
+                  o_badFlashBlockPercentage,
+                  i_maxPercentageAllowed);
+    }
+
+    return l_isWithinLimit;
+}
+
+/*
+ * @brief Check the flash error count against a given maximum allowed.
+ *
+ * @details This returns a tristate - 1 pass, 2 different fails
+ *          If true is returned, then all count registers were read and the
+ *          assembled count does not exceed the maximum allowed;
+ *          o_readFlashErrorCount contains the retrieved flash error count,
+ *          which may legitimately be zero.
+ *          If false is returned and the o_readFlashErrorCount is zero, then
+ *          the check failed because of a register read fail
+ *          If false is returned and the o_readFlashErrorCount is not zero,
+ *          then the check failed because the retrieved flash error
+ *          count exceeds the given maximum allowed
+ *
+ * @param[in] i_nvDimm - The NVDIMM to check
+ * @param[in] i_maxFlashErrorsAllowed - The maximum number of flash errors
+ *                                      allowed
+ * @param[out] o_readFlashErrorCount - The retrieved flash error
+ *                                     count from i_nvDimm, if no
+ *                                     register read error.
+ *
+ * @return false if check failed or register read failed, else true
+ */
+bool nvDimmCheckFlashErrorCount(TargetHandle_t i_nvDimm,
+                                const uint32_t i_maxFlashErrorsAllowed,
+                                uint32_t &o_readFlashErrorCount)
+{
+    // The status of the check on the flash error count
+    bool l_didFlashErrorCountCheckPass(false);
+
+    // Whether every flash error count register was read without error.
+    // Read success is tracked explicitly: a successfully read count of zero
+    // is a valid (healthy) value and must not be taken for a read failure.
+    bool l_didAllReadsSucceed(true);
+
+    // The retrieved flash error count from register, initialize to zero
+    o_readFlashErrorCount = 0;
+
+    // Handle to catch any errors
+    errlHndl_t l_err(nullptr);
+
+    // Cache the HUID of the NVDIMM
+    uint32_t l_nvDimmHuid = get_huid( i_nvDimm );
+
+    // The retrieved flash error count from a register
+    uint8_t l_readFlashErrorCountByte(0);
+
+    // Read the flash error count registers starting from MSB to LSB
+    for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2;
+         l_flashErrorRegister >= FLASH_ERROR_COUNT0;
+         --l_flashErrorRegister)
+    {
+        // Reset this for every iteration, may be redundant
+        l_readFlashErrorCountByte = 0;
+
+        TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+                  "Reading NVDIMM(0x%.8X) flash error count from "
+                  "register FLASH_ERROR_COUNT(0x%.4X)",
+                  l_nvDimmHuid, l_flashErrorRegister);
+
+        l_err = nvdimmReadReg(i_nvDimm,
+                              static_cast<i2cReg >(l_flashErrorRegister),
+                              l_readFlashErrorCountByte);
+
+        if (l_err)
+        {
+            TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): "
+                      "FAIL: NVDIMM(0x%.8X) failed to read flash error "
+                      "count from register FLASH_ERROR_COUNT(0x%.4X) "
+                      "marking as a fail",
+                      l_nvDimmHuid, l_flashErrorRegister);
+
+            l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+            l_err->collectTrace(NVDIMM_COMP_NAME);
+            errlCommit(l_err, NVDIMM_COMP_ID);
+
+            // Set up the fail state, so caller can determine that the fail was
+            // due to a register read error
+            l_didAllReadsSucceed = false;
+            l_didFlashErrorCountCheckPass = false;
+            o_readFlashErrorCount = 0;
+
+            break;
+        }
+
+        // If we get here, then the read was successful
+        // Append the read flash error count byte to the LSB of the
+        // aggregated flash error count bytes.
+        o_readFlashErrorCount = (o_readFlashErrorCount << 8) |
+                                l_readFlashErrorCountByte;
+
+        TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+                  "NVDIMM(0x%.8X) returned value (0x%.2X) from the "
+                  "partial flash error count, register "
+                  "FLASH_ERROR_COUNT(0x%.4X)",
+                  l_nvDimmHuid,
+                  l_readFlashErrorCountByte,
+                  l_flashErrorRegister);
+
+    } // end for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; ...
+
+    // Only evaluate the count when all of the registers were read. The read
+    // status flag is used here, rather than a non-zero count, so that an
+    // NVDIMM reporting zero flash errors passes the check.
+    if (l_didAllReadsSucceed)
+    {
+        TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+                  "NVDIMM(0x%.8X) flash error count = %d ",
+                  l_nvDimmHuid, o_readFlashErrorCount);
+
+        // Check the validity of the flash error count
+        if (o_readFlashErrorCount > i_maxFlashErrorsAllowed)
+        {
+            TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): "
+                      "FAIL: For NVDIMM (0x%.8X), the flash error "
+                      "count (%d) exceeds the maximum number of flash "
+                      "errors allowed (%d), marking this as a fail",
+                      l_nvDimmHuid,
+                      o_readFlashErrorCount,
+                      i_maxFlashErrorsAllowed);
+
+            // Set up the fail state, so caller can determine that the fail was
+            // due to error count exceeding the max errors allowed.
+            // Note: Leave the value in o_readFlashErrorCount so caller
+            //       can inspect, if they wish. It is necessarily non-zero
+            //       here because it is greater than an unsigned maximum.
+            l_didFlashErrorCountCheckPass = false;
+        }
+        else
+        {
+            TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): "
+                      "SUCCESS: For NVDIMM(0x%.8X), the flash error counts "
+                      "(%d) is less than or meets the maximum number of "
+                      "errors allowed (%d), marking this as a pass",
+                      l_nvDimmHuid,
+                      o_readFlashErrorCount,
+                      i_maxFlashErrorsAllowed);
+
+            // Set up the pass state
+            // Note: Leave the value in o_readFlashErrorCount so caller
+            //       can inspect, if they wish
+            l_didFlashErrorCountCheckPass = true;
+        }
+    } // end if (l_didAllReadsSucceed)
+
+    return l_didFlashErrorCountCheckPass;
+}
+
+/*
+ * @brief Run the NVM (non-volatile memory)/flash health check on each NVDIMM
+ *        in the supplied list.
+ *
+ * @details For every NVDIMM the bad flash block percentage and the flash
+ *          error count are checked against fixed maximums. If a check
+ *          reports a register read failure, the rest of the processing for
+ *          that NVDIMM is skipped. If either value exceeds its maximum, a
+ *          predictive error log is created and committed.
+ *
+ * @param[in] i_nvDimmTargetList - list of NVDIMMs to check the health of flash
+ *
+ * @return false if one or more NVDIMMs fail NVM health check, else true
+ */
+bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList)
+{
+    TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatus(): "
+              "Target list size(%d)", i_nvDimmTargetList.size());
+
+    // The following maximums are the same values used by SMART's
+    // manufacturing and recommended that we use.
+    // Fail if over 19% of bad flash blocks is encountered
+    const uint8_t MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED = 19;
+    // Fail if over 300 flash memory errors is encountered
+    const uint32_t MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED = 300;
+
+    // Overall verdict; becomes false as soon as any NVDIMM fails any check
+    bool l_allNvDimmsPassed(true);
+
+    // Handle used when an error log has to be created
+    errlHndl_t l_err(nullptr);
+
+    // Walk the supplied NVDIMMs, assessing the NVM health of each
+    for (auto const l_nvDimm : i_nvDimmTargetList)
+    {
+        // HUID of the NVDIMM currently being examined
+        const uint32_t l_huid = get_huid( l_nvDimm );
+
+        // Bad flash block percentage check
+        uint8_t l_badBlockPct(0);
+        const bool l_badBlockPctOk = nvDimmCheckBadFlashBlockPercentage(
+                                l_nvDimm,
+                                MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED,
+                                l_badBlockPct);
+        if (!l_badBlockPctOk)
+        {
+            l_allNvDimmsPassed = false;
+
+            // A failed check with no data means the register read failed;
+            // nothing more can be done for this NVDIMM
+            if (0 == l_badBlockPct)
+            {
+                continue;
+            }
+        }
+
+        // Flash error count check
+        uint32_t l_flashErrors(0);
+        const bool l_flashErrorsOk = nvDimmCheckFlashErrorCount(
+                                l_nvDimm,
+                                MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED,
+                                l_flashErrors);
+        if (!l_flashErrorsOk)
+        {
+            l_allNvDimmsPassed = false;
+
+            // A failed check with no data means the register read failed;
+            // nothing more can be done for this NVDIMM
+            if (0 == l_flashErrors)
+            {
+                continue;
+            }
+        }
+
+        // Both checks passed: note the success and go to the next NVDIMM
+        if (l_badBlockPctOk && l_flashErrorsOk)
+        {
+            TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmNvmCheckHealthStatus(): "
+                      "Success: NVDIMM (0x%.8X) passed the NVM health check.",
+                      l_huid);
+            continue;
+        }
+
+        // At least one value exceeded its maximum; build the user data.
+        // The lower half of user data 1 carries the percentage details only
+        // when the percentage check is the one that failed, else it is zero.
+        uint32_t l_badBlockPctDetails(0);
+        if (!l_badBlockPctOk && l_badBlockPct)
+        {
+            l_badBlockPctDetails =
+                TWO_UINT16_TO_UINT32(
+                    l_badBlockPct,
+                    MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED);
+        }
+        const uint64_t l_userData1 =
+            TWO_UINT32_TO_UINT64(l_huid, l_badBlockPctDetails);
+
+        // User data 2 carries the flash error count details only when the
+        // flash error count check is the one that failed, else it is zero.
+        uint64_t l_userData2(0);
+        if (!l_flashErrorsOk && l_flashErrors)
+        {
+            l_userData2 =
+                TWO_UINT32_TO_UINT64(
+                    l_flashErrors,
+                    MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED);
+        }
+
+        /*@
+         * @errortype
+         * @severity         ERRL_SEV_PREDICTIVE
+         * @moduleid         NVDIMM_NVM_HEALTH_CHECK
+         * @reasoncode       NVDIMM_NVM_HEALTH_CHECK_FAILED
+         * @userdata1[0:31]  HUID of NVDIMM target
+         * @userdata1[32:47] The retrieved bad flash block percentage,
+         *                   if error with, else 0
+         * @userdata1[48:63] The maximum percentage of bad flash blocks
+         *                   allowed, if bad flash block percentage
+         *                   exceeds this maximum, else 0
+         * @userdata2[0:31]  The retrieved flash error count,
+         *                   if error with, else 0
+         * @userdata2[32:63] The maximum number of flash errors
+         *                   allowed, if flash error exceeds this
+         *                   maximum, else 0
+         * @devdesc          Either the NVDIMM NVM bad flash block
+         *                   percentage exceeded the maximum percentage
+         *                   allowed or the NVDIMM NVM number of flash
+         *                   error exceeds the maximum count allowed
+         *                   or both.
+         * @custdesc         NVDIMM NVM health check failed.
+         */
+        l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE,
+                               NVDIMM_NVM_HEALTH_CHECK,
+                               NVDIMM_NVM_HEALTH_CHECK_FAILED,
+                               l_userData1,
+                               l_userData2,
+                               ErrlEntry::NO_SW_CALLOUT );
+
+        l_err->collectTrace(NVDIMM_COMP_NAME);
+
+        // Commit the error log
+        errlCommit(l_err, NVDIMM_COMP_ID);
+
+        // Let the caller know something went amiss
+        l_allNvDimmsPassed = false;
+    } // end for (auto const l_nvDimm : i_nvDimmTargetList)
+
+    // Should not have any uncommitted errors
+    assert(l_err == NULL, "nvDimmNvmCheckHealthStatus() - unexpected "
+           "uncommitted error found");
+
+    TRACFCOMP(g_trac_nvdimm,EXIT_MRK"nvDimmNvmCheckHealthStatus(): Returning %s",
+              l_allNvDimmsPassed ? "true" : "false" );
+
+    return l_allNvDimmsPassed;
+} // end nvDimmNvmCheckHealthStatus
+
+/**
+ * @brief Run the NVM health check on the NVDIMMs returned by
+ *        nvdimm_getNvdimmList, by way of nvDimmNvmCheckHealthStatus
+ *
+ * @see nvDimmNvmCheckHealthStatus for more details
+ *
+ * @return false if one or more NVDIMMs fail an NVM health check, else true
+ */
+bool nvDimmNvmCheckHealthStatusOnSystem()
+{
+    TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatusOnSystem()");
+
+    // Collect the NVDIMM targets to be checked
+    TargetHandleList l_nvDimms;
+    nvdimm_getNvdimmList(l_nvDimms);
+
+    // Hand the list to the NVM health check and keep its verdict
+    const bool l_nvmHealthy = nvDimmNvmCheckHealthStatus(l_nvDimms);
+
+    TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmNvmCheckHealthStatusOnSystem(): "
+              "Returning %s", l_nvmHealthy ? "true" : "false" );
+
+    return l_nvmHealthy;
+} // end nvDimmNvmCheckHealthStatusOnSystem
+
+
} // end NVDIMM namespace
OpenPOWER on IntegriCloud