diff options
author | Roland Veloz <rveloz@us.ibm.com> | 2019-08-23 20:00:36 -0500 |
---|---|---|
committer | Christian R Geddes <crgeddes@us.ibm.com> | 2019-09-03 09:49:31 -0500 |
commit | c3d8cfd066998862656706bb00d15ad986470fe5 (patch) | |
tree | f267927cbfb821659b4bd25107dcf1155a0b3ba2 /src/usr/isteps | |
parent | e22e362f3cd9b96b845f8c51acd6da97c78554a4 (diff) | |
download | talos-hostboot-c3d8cfd066998862656706bb00d15ad986470fe5.tar.gz talos-hostboot-c3d8cfd066998862656706bb00d15ad986470fe5.zip |
Added code to support doing an NVM health check
Added a method that performs an NVM (non-volatile memory)
health check. In particular, this method checks the
flash error count and does a predictive callout
if the flash error count exceeds the maximum
allowed. This method also checks the bad block percentage
and does a predictive callout if the bad block
percentage exceeds the maximum allowed. A predictive
callout is done if either or both checks fail.
Also added support in the runtime commands for making the NVM health
check call.
Also, when I wrote the ES (energy source) health check method, I was
not very explicit in the method that it was doing an ES health
check. So I updated the verbiage in nvDimmCheckHealthStatus (now
nvDimmEsCheckHealthStatus) to add ES wherever appropriate, to make these
two methods explicit as to which health check is being performed.
Change-Id: Ib9925fd2bb8430cf2121108329247d96072beb1b
CQ: 473220
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/82843
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Glenn Miles <milesg@ibm.com>
Reviewed-by: Zachary Clark <zach@ibm.com>
Reviewed-by: Christian R Geddes <crgeddes@us.ibm.com>
Diffstat (limited to 'src/usr/isteps')
-rw-r--r-- | src/usr/isteps/nvdimm/nvdimm.H | 5 | ||||
-rw-r--r-- | src/usr/isteps/nvdimm/runtime/nvdimm_rt.C | 708 |
2 files changed, 596 insertions, 117 deletions
diff --git a/src/usr/isteps/nvdimm/nvdimm.H b/src/usr/isteps/nvdimm/nvdimm.H index a99f1180a..d2d2985b6 100644 --- a/src/usr/isteps/nvdimm/nvdimm.H +++ b/src/usr/isteps/nvdimm/nvdimm.H @@ -275,6 +275,11 @@ enum i2cReg : uint16_t TYPED_BLOCK_DATA_BYTE30 = 0x39E, TYPED_BLOCK_DATA_BYTE31 = 0x39F, TYPED_BLOCK_DATA_OFFSET = 0x3E0, + FLASH_BAD_BLK_PCT = 0x41D, // Read only; Percentage of flash blocks + // in the flash array marked as bad blocks + FLASH_ERROR_COUNT0 = 0x428, // Read only; LSB[7:0] Flash error count + FLASH_ERROR_COUNT1 = 0x429, // Read only; [15:8] + FLASH_ERROR_COUNT2 = 0x42A, // Read only; MSB[23:16] BPM_MAGIC_REG1 = 0x430, BPM_MAGIC_REG2 = 0x431, SCAP_STATUS = 0x432, diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C index d615aa546..b38dd394d 100644 --- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C +++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C @@ -25,7 +25,11 @@ /** * @file nvdimm_rt.C * - * @brief NVDIMM functions only needed for runtime + * @brief NVDIMM functions only needed for runtime. These functions include + * but are not limited to arming/disarming the NVDIMM along with methods + * to poll the arming and check the status of the arming. Checking the + * error state of the NVDIMM, getting a random number with the darn + * instruction and checking the ES or NVM health status. 
*/ /// BPM - Backup Power Module @@ -734,65 +738,68 @@ errlHndl_t nvdimm_getRandom(uint8_t* o_genData) } /* - * @brief Check the health status of the individual NVDIMMs supplied in list + * @brief Check the ES (enery source)/backup power module(BPM) health status of + * the individual NVDIMMs supplied in list * - * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of * - * @return false if one or more NVDIMMs fail health check, else true + * @return false if one or more NVDIMMs fail ES health check, else true */ -bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) +bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) { - TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatus(): " "Target list size(%d)", i_nvdimmTargetList.size()); - // The minimum lifetime value - const uint8_t LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97% + // The minimum ES lifetime value + const uint8_t ES_LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97% - // The health check status flags for the different states of a health check - const uint8_t HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0 - const uint8_t HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1 - const uint8_t HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2 + // The ES health check status flags for the different states of an + // ES health check + const uint8_t ES_HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0 + const uint8_t ES_HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1 + const uint8_t ES_HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2 // Handle to catch any errors errlHndl_t l_err(nullptr); - // The health check status from a health check call - uint8_t l_healthCheck(0); + // The ES health check status from an ES health check call + uint8_t l_esHealthCheck(0); - // Status of the accumulation of all calls related to the health check. 
+ // Status of the accumulation of all calls related to the ES health check. // If any one call is bad/fails, then this will be false, else it stays true - bool l_didHealthCheckPass(true); + bool l_didEsHealthCheckPass(true); - // Iterate thru the NVDIMMs checking the health status of each one. + // Iterate thru the NVDIMMs checking the ES health status of each one. // Going with the assumption that the caller waited the allotted time, // roughly 20 to 30 minutes, after the start of an IPL. // Success case: - // * Health check initiated at start of the IPL, caller waited the + // * ES health check initiated at start of the IPL, caller waited the // allotted time (20 to 30 mins) before doing a health check, health // check returned success and the lifetime meets the minimum threshold // for a new BPM. // Error cases are: - // * Health check is in progress, will assume BPM is hung - // * Health check failed - // * Health check succeeded but lifetime does not meet a certain threshold + // * ES health check is in progress, will assume BPM is hung + // * ES health check failed + // * ES health check succeeded but lifetime does not meet a + // certain threshold // * If none of the above apply (success case and other error cases), - // then assume the health check was never initiated at the start of the - // IPL + // then assume the ES health check was never initiated at the start + // of the IPL // For each of these error cases do a predictive callout for (auto const l_nvdimm : i_nvdimmTargetList) { // Retrieve the Health Check status from the BPM - TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): " - "Reading NVDIMM(0x%.8X) health check data, " + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "Reading NVDIMM(0x%.8X) ES health check data, " "register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), ES_CMD_STATUS0); - l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_healthCheck); + l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, 
l_esHealthCheck); if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " - "NVDIMM(0x%X) failed to read the health check " + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%X) failed to read the ES health check " "data, register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), ES_CMD_STATUS0); @@ -801,43 +808,43 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; // Proceed to next NVDIMM, better luck next time continue; } // Trace out the returned data for inspection - TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): " - "NVDIMM(0x%X) returned value(0x%.2X) from health check " - "data, register ES_CMD_STATUS0(0x%.2X)", - get_huid(l_nvdimm), l_healthCheck, ES_CMD_STATUS0) + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%X) returned value(0x%.2X) from the ES health " + "check data, register ES_CMD_STATUS0(0x%.2X)", + get_huid(l_nvdimm), l_esHealthCheck, ES_CMD_STATUS0); - if (l_healthCheck & HEALTH_CHECK_IN_PROGRESS_FLAG) + if (l_esHealthCheck & ES_HEALTH_CHECK_IN_PROGRESS_FLAG) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Assuming caller waited the allotted time before " - "doing a health check on NVDIMM(0x%.8X), the BPM " - "is hung doing the health check.", + "doing an ES health check on NVDIMM(0x%.8X), the BPM " + "is hung doing the ES health check.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE * @userdata1 HUID of NVDIMM target - * @userdata2 Health check status + * @userdata2 ES health check status * @devdesc 
Assuming caller waited the allotted time before - * doing a health check, then the BPM is hung doing - * the health check. - * @custdesc NVDIMM Health Check failed. + * doing an ES health check, then the BPM is hung doing + * the ES health check. + * @custdesc NVDIMM ES health check failed. */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE, get_huid(l_nvdimm), - l_healthCheck, + l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -849,34 +856,33 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } - else if (l_healthCheck & HEALTH_CHECK_FAILED_FLAG) + else if (l_esHealthCheck & ES_HEALTH_CHECK_FAILED_FLAG) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Assuming caller waited the allotted time before " - "doing a health check on NVDIMM(0x%.8X), the BPM " + "doing an ES health check on NVDIMM(0x%.8X), the BPM " "reported a failure.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_HEALTH_CHECK_REPORTED_FAILURE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE * @userdata1 HUID of NVDIMM target - * @userdata2 Health check status - * @devdesc NVDIMM Health Check failed + * @userdata2 ES health check status * @devdesc Assuming caller waited the allotted time before - * doing a health check, the BPM reported a failure - * while doing a health check. - * @custdesc NVDIMM Health Check failed. + * doing an ES health check, the BPM reported a failure + * while doing an ES health check. + * @custdesc NVDIMM ES health check failed. 
*/ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_HEALTH_CHECK_REPORTED_FAILURE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE, get_huid(l_nvdimm), - l_healthCheck, + l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -888,12 +894,12 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } - else if (l_healthCheck & HEALTH_CHECK_SUCCEEDED_FLAG) + else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) { - TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): " - "Reading NVDIMM(0x%.8X) es lifetime data, " + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "Reading NVDIMM(0x%.8X) ES lifetime data, " "register ES_LIFETIME(0x%.2X)", get_huid(l_nvdimm), ES_LIFETIME); @@ -905,7 +911,7 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) if (l_err) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "NVDIMM(0x%.8X) failed to read the " "ES_LIFETIME(0x%.2X) data", get_huid(l_nvdimm), @@ -916,42 +922,42 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } - else if (l_lifetimePercentage < LIFETIME_MINIMUM_REQUIREMENT) + else if (l_lifetimePercentage < ES_LIFETIME_MINIMUM_REQUIREMENT) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " - "Health check on NVDIMM(0x%.8X) succeeded but the " - "BPM's lifetime(%d) does not meet the minimum " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "ES health check on NVDIMM(0x%.8X) succeeded but " + "the BPM's lifetime(%d) does not meet the minimum " "requirement(%d) needed 
to qualify as a new BPM.", get_huid(l_nvdimm), l_lifetimePercentage, - LIFETIME_MINIMUM_REQUIREMENT ); + ES_LIFETIME_MINIMUM_REQUIREMENT ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_LIFETIME_MIN_REQ_NOT_MET + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET * @userdata1[00:31] HUID of NVDIMM target - * @userdata1[32:63] Health check status + * @userdata1[32:63] ES health check status * @userdata2[00:31] Retrieved lifetime percentage * @userdata2[32:63] lifetime minimum requirement - * @devdesc Health check succeeded but the BPM's + * @devdesc ES health check succeeded but the BPM's * lifetime does not meet the minimum * requirement needed to qualify as a * new BPM. - * @custdesc NVDIMM Health Check failed + * @custdesc NVDIMM ES health check failed */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_LIFETIME_MIN_REQ_NOT_MET, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET, TWO_UINT32_TO_UINT64( get_huid(l_nvdimm), - l_healthCheck), + l_esHealthCheck), TWO_UINT32_TO_UINT64( l_lifetimePercentage, - LIFETIME_MINIMUM_REQUIREMENT), + ES_LIFETIME_MINIMUM_REQUIREMENT), ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -963,45 +969,46 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } // end else if (l_lifetimePercentage ... 
else { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvdimmCheckHealthStatus(): " - "Success: Health check on NVDIMM(0x%.8X) succeeded " - "and the BPM's lifetime(%d) meet's the minimum " - "requirement(%d) needed to qualify as a new BPM.", + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "Success: ES health check on NVDIMM(0x%.8X) " + "succeeded and the BPM's lifetime(%d) meet's the " + "minimum requirement(%d) needed to qualify as " + "a new BPM.", get_huid(l_nvdimm), l_lifetimePercentage, - LIFETIME_MINIMUM_REQUIREMENT ); + ES_LIFETIME_MINIMUM_REQUIREMENT ); } - } // end else if (l_healthCheck & HEALTH_CHECK_SUCCEEDED_FLAG) - else // Assume the health check was never initiated at + } // end else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) + else // Assume the ES health check was never initiated at // the start of the IPL. { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " - "The health check on NVDIMM(0x%.8X) shows no status (in " - "progress, fail or succeed) so assuming it was never " - "initiated at the start of the IPL.", + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "The ES health check on NVDIMM(0x%.8X) shows no status " + "(in progress, fail or succeed) so assuming it was " + "never initiated at the start of the IPL.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_HEALTH_CHECK_NEVER_INITIATED + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED * @userdata1 HUID of NVDIMM target - * @userdata2 Health check status - * @devdesc The health check shows no status (in progress, fail - * or succeed) so assuming it was never initiated + * @userdata2 ES health check status + * @devdesc The ES health check shows no status (in progress, + * fail or succeed) so assuming it was never initiated * at the start of the IPL. - * @custdesc NVDIMM Health Check failed. 
+ * @custdesc NVDIMM ES health check failed. */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_HEALTH_CHECK_NEVER_INITIATED, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED, get_huid(l_nvdimm), - l_healthCheck, + l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -1013,42 +1020,509 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } } // end for (auto const l_nvdimm : i_nvdimmTargetList) // Should not have any uncommitted errors - assert(l_err == NULL, "nvDimmCheckHealthStatus() - unexpected uncommitted" - "error found" ); + assert(l_err == NULL, "nvDimmEsCheckHealthStatus() - unexpected " + "uncommitted error found" ); - TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckHealthStatus(): " - "Returning %s", l_didHealthCheckPass == true ? "true" : "false" ); + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatus(): " + "Returning %s", l_didEsHealthCheckPass == true ? 
"true" : "false"); - return l_didHealthCheckPass; -} // end nvDimmCheckHealthStatus + return l_didEsHealthCheckPass; +} // end nvDimmEsCheckHealthStatus /** - * @brief A wrapper around the call to nvDimmCheckHealthStatus + * @brief A wrapper around the call to nvDimmEsCheckHealthStatus * - * @see nvDimmCheckHealthStatus for more details + * @see nvDimmEsCheckHealthStatus for more details * - * @return false if one or more NVDIMMs fail health check, else true + * @return false if one or more NVDIMMs fail an ES health check, else true */ -bool nvDimmCheckHealthStatusOnSystem() +bool nvDimmEsCheckHealthStatusOnSystem() { - TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckHealthStatusOnSystem()"); + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatusOnSystem()"); // Get the list of NVDIMM Targets from the system TargetHandleList l_nvDimmTargetList; nvdimm_getNvdimmList(l_nvDimmTargetList); // Return status of doing a check health status - bool l_didHealthCheckPass = nvDimmCheckHealthStatus(l_nvDimmTargetList); + bool l_didEsHealthCheckPass = nvDimmEsCheckHealthStatus(l_nvDimmTargetList); - TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckHealthStatusOnSystem(): " - "Returning %s", l_didHealthCheckPass == true ? "true" : "false" ); + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatusOnSystem(): " + "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false" ); - return l_didHealthCheckPass; + return l_didEsHealthCheckPass; } // end nvDimmCheckHealthStatusOnSystem +/* + * @brief Check the bad flash block percentage against a given maximum allowed. + * + * @details This returns a tristate - 1 pass, 2 different fails + * If true is returned, then the check passed and + * o_badFlashBlockPercentage will contain what the retrieved + * flash block percentage is. 
+ * If false is returned and the o_badFlashBlockPercentage is zero, then + * the check failed because of a register read fail + * If false is returned and the o_badFlashBlockPercentage is not zero, + * then the check failed because the retrieved bad flash block + * percentage exceeds the given maximum allowed + * + * @param[in] i_nvDimm - The NVDIMM to check + * @param[in] i_maxPercentageAllowed - The maximum percentage of bad flash + * block allowed + * @param[out] o_badFlashBlockPercentage - The retrieved bad flash block + * percentage from i_nvDimm, if no + * register read error. + * + * @return false if check failed or register read failed, else true + */ +bool nvDimmCheckBadFlashBlockPercentage(TargetHandle_t i_nvDimm, + const uint8_t i_maxPercentageAllowed, + uint8_t &o_badFlashBlockPercentage) +{ + // The status of the check on the bad block percentage + bool l_didBadFlashBlockPercentageCheckPass(false); + + // The retrieved flash block percentage from register, initialize to zero + o_badFlashBlockPercentage = 0; + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); + + // Retrieve the percentage of bad blocks and validate + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "Reading NVDIMM(0x%.8X) percentage of bad blocks from " + "register FLASH_BAD_BLK_PCT(0x%.4X)", + l_nvDimmHuid, FLASH_BAD_BLK_PCT); + + l_err = nvdimmReadReg(i_nvDimm, + FLASH_BAD_BLK_PCT, + o_badFlashBlockPercentage); + + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "FAIL: NVDIMM(0x%.8X) failed to read the percentage of " + "bad blocks from register FLASH_BAD_BLK_PCT(0x%.4X), " + "marking as a fail", + l_nvDimmHuid, FLASH_BAD_BLK_PCT); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Set up the fail state, so caller can determine that the 
fail was + // due to a register read error + l_didBadFlashBlockPercentageCheckPass = false; + o_badFlashBlockPercentage = 0; + } + else + { + // Trace out the returned data for inspection + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "NVDIMM(0x%.8X) returned value (%d) from the " + "percentage of bad blocks, register " + "FLASH_BAD_BLK_PCT(0x%.4X)", + l_nvDimmHuid, + o_badFlashBlockPercentage, + FLASH_BAD_BLK_PCT); + + // Check to see if the bad flash block percentage + // exceeds maximum allowed. + if (o_badFlashBlockPercentage > i_maxPercentageAllowed) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "FAIL: For NVDIMM (0x%.8X), the percentage of bad " + "flash blocks (%d) exceeds the maximum percentage " + "of bad flash blocks allowed (%d), marking this " + "as a fail", + l_nvDimmHuid, + o_badFlashBlockPercentage, + i_maxPercentageAllowed); + + // Set up the fail state, so caller can determine that the fail was + // due to percentage exceeding the max percentage allowed. + // Note: Leave the value in o_badFlashBlockPercentage so caller + // can inspect, if they wish + l_didBadFlashBlockPercentageCheckPass = false; + } + else + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "SUCCESS: For NVDIMM (0x%.8X), the percentage of bad " + "flash blocks (%d) is less than or meets the maximum " + "percentage of bad flash blocks allowed (%d), " + "marking this as a pass", + l_nvDimmHuid, + o_badFlashBlockPercentage, + i_maxPercentageAllowed); + + // Set up the pass state + // Note: Leave the value in o_badFlashBlockPercentage so caller + // can inspect, if they wish + l_didBadFlashBlockPercentageCheckPass = true; + } // end if (l_badFlashBlockPercentage > i_maxPercentageAllowed) + } // end if (l_err) ... else + + return l_didBadFlashBlockPercentageCheckPass; +} + +/* + * @brief Check the flash error count against a given maximum allowed. 
+ * + * @details This returns a tristate - 1 pass, 2 different fails + * If true is returned, then the check passed and + * o_readFlashErrorCount will contain what the retrieved + * flash error count is. + * If false is returned and the o_readFlashErrorCount is zero, then + * the check failed because of a register read fail + * If false is returned and the o_readFlashErrorCount is not zero, + * then the check failed because the retrieved flash error + * count exceeds the given maximum allowed + * + * @param[in] i_nvDimm - The NVDIMM to check + * @param[in] i_maxFlashErrorsAllowed - The maximum number of flash errors + * allowed + * @param[out] o_readFlashErrorCount - The retrieved bad flash error + * count from i_nvDimm, if no + * register read error. + * + * @return false if check failed or register read failed, else true + */ +bool nvDimmCheckFlashErrorCount(TargetHandle_t i_nvDimm, + const uint32_t i_maxFlashErrorsAllowed, + uint32_t &o_readFlashErrorCount) +{ + // The status of the check on the flash error count + bool l_didFlashErrorCountCheckPass(false); + + // The retrieved flash error count from register, initialize to zero + o_readFlashErrorCount = 0; + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); + + // The retrieved flash error count from a register + uint8_t l_readFlashErrorCountByte(0); + + // Read the flash error count registers starting from MSB to LSB + for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; + l_flashErrorRegister >= FLASH_ERROR_COUNT0; + --l_flashErrorRegister) + { + // Reset this for every iteration, may be redundant + l_readFlashErrorCountByte = 0; + + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "Reading NVDIMM(0x%.8X) flash error count from " + "register FLASH_ERROR_COUNT(0x%.4X)", + l_nvDimmHuid, l_flashErrorRegister); + + l_err = nvdimmReadReg(i_nvDimm, + static_cast<i2cReg >(l_flashErrorRegister), + 
l_readFlashErrorCountByte); + + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " + "FAIL: NVDIMM(0x%.8X) failed to read flash error " + "count from register FLASH_ERROR_COUNT(0x%.4X) " + "marking as a fail", + l_nvDimmHuid, l_flashErrorRegister); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Set up the fail state, so caller can determine that the fail was + // due to a register read error + l_didFlashErrorCountCheckPass = false; + o_readFlashErrorCount = 0; + + break; + } + + // If we get here, then the read was successful + // Append the read flash error count byte to the LSB of the + // aggregated flash error count bytes. + o_readFlashErrorCount = (o_readFlashErrorCount << 8) | + l_readFlashErrorCountByte; + + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.8X) returned value (0x%.2X) from the " + "partial flash error count, register " + "FLASH_ERROR_COUNT(0x%.4X)", + l_nvDimmHuid, + l_readFlashErrorCountByte, + l_flashErrorRegister); + + } // end for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; ... + + // If o_readFlashErrorCount is not zero, then register read was successful + if (o_readFlashErrorCount) + { + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.8X) flash error count = %d ", + l_nvDimmHuid, o_readFlashErrorCount); + + // Check the validity of the flash error count + if (o_readFlashErrorCount > i_maxFlashErrorsAllowed) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " + "FAIL: For NVDIMM (0x%.8X), the flash error " + "count (%d) exceeds the maximum number of flash " + "errors allowed (%d), marking this as a fail", + l_nvDimmHuid, + o_readFlashErrorCount, + i_maxFlashErrorsAllowed); + + // Set up the fail state, so caller can determine that the fail was + // due to error count exceeding the max errors allowed. 
+ // Note: Leave the value in o_readFlashErrorCount so caller + // can inspect, if they wish + l_didFlashErrorCountCheckPass = false; + } + else + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "SUCCESS: For NVDIMM(0x%.8X), the flash error counts " + "(%d) is less than or meets the maximum number of " + "errors allowed (%d), marking this as a pass", + l_nvDimmHuid, + o_readFlashErrorCount, + i_maxFlashErrorsAllowed); + + // Set up the pass state + // Note: Leave the value in o_readFlashErrorCount so caller + // can inspect, if they wish + l_didFlashErrorCountCheckPass = true; + } + } // end if (o_readFlashErrorCount) + + return l_didFlashErrorCountCheckPass; +} + +/* + * @brief Check the NVM (non-volatile memory)/flash health of the individual + * NVDIMMs supplied in list. + * + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of flash + * + * @return false if one or more NVDIMMs fail NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList) +{ + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatus(): " + "Target list size(%d)", i_nvDimmTargetList.size()); + + // The following maximums are the same values used by SMART's + // manufacturing and recommended that we use. + // The maximum percentage of bad flash blocks + // Fail if over 19% of bad flash blocks is encountered + const uint8_t MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED = 19; + // The maximum number of flash memory errors allowed + // Fail if over 300 flash memory errors is encountered + const uint32_t MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED = 300; + + // Status of the accumulation of all calls related to the NVM health check. 
+ // If any one call is bad/fails, then this will be false, else it stays true + bool l_didNvmHealthCheckPass(true); + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // The retrieved flash block percentage from register + uint8_t l_badFlashBlockPercentage(0); + // The retrieved flash error count from register + uint32_t l_flashErrorCount(0); + + // The status of the checks on the percentage of bad blocks and + // flash error count + // Default to true + bool l_badFlashBlockPercentageCheckPassed(true); + bool l_flashErrorCountCheckPassed(true); + + // Iterate thru the supplied NVDIMMs checking the health of the NVM + for (auto const l_nvDimm : i_nvDimmTargetList) + { + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( l_nvDimm ); + + // Reset these for every NVDIMM that is checked + l_badFlashBlockPercentage = 0; + l_flashErrorCount = 0; + l_badFlashBlockPercentageCheckPassed = true; + l_flashErrorCountCheckPassed = true; + + // Check the validity of bad flash block percentage + if (!nvDimmCheckBadFlashBlockPercentage( + l_nvDimm, + MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED, + l_badFlashBlockPercentage)) + { + // Set this to false to indicate that the overall check on the + // NVDIMMs had at least one failure + l_didNvmHealthCheckPass = false; + + // If no data in the variable l_badFlashBlockPercentage, then + // this is a read register fail. 
Move onto the next NVDIMM + // this is a dud + if (!l_badFlashBlockPercentage) + { + continue; + } + + // Set the check to false, to facilitate error reporting + l_badFlashBlockPercentageCheckPassed = false; + } + + // Check the validity of the flash error count + if (!nvDimmCheckFlashErrorCount( + l_nvDimm, + MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED, + l_flashErrorCount)) + { + // Set this to false to indicate that the overall check on the + // NVDIMMs had at least one failure + l_didNvmHealthCheckPass = false; + + // If no data in the variable l_flashErrorCount, then + // this is a read register fail. Move onto the next NVDIMM + // this is a dud + if (!l_flashErrorCount) + { + continue; + } + + // Set the check to false, to facilitate error reporting + l_flashErrorCountCheckPassed = false; + } + + /// Now we assess the health of the flash based on data gathered above + if ( !l_badFlashBlockPercentageCheckPassed || + !l_flashErrorCountCheckPassed ) + { + // First set the NVDIMM HUID to the first 32 bits of user data 1 + uint64_t l_badFlashBlockPercentageUserData1 = + TWO_UINT32_TO_UINT64(l_nvDimmHuid, 0); + + // If an issue with the bad flash block percentage, then append + // data to user data 1 + if (!l_badFlashBlockPercentageCheckPassed && + l_badFlashBlockPercentage) + { + // Setting the HUID here is redundant but easier than trying to + // do some clever code that will set the HUID for user data 1 + // when this path is not taken, but the next check on the flash + // error count is taken + l_badFlashBlockPercentageUserData1 = + TWO_UINT32_TO_UINT64(l_nvDimmHuid, + TWO_UINT16_TO_UINT32( + l_badFlashBlockPercentage, + MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED)); + } + + // If an issue with the flash error count, then set user + // data 2 to contain the flash error count value + uint64_t l_flashErrorCountUserData2(0); + if (!l_flashErrorCountCheckPassed && + l_flashErrorCount) + { + l_flashErrorCountUserData2 = + TWO_UINT32_TO_UINT64(l_flashErrorCount, 
+ MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED); + } + + /*@ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_NVM_HEALTH_CHECK + * @reasoncode NVDIMM_NVM_HEALTH_CHECK_FAILED + * @userdata1[0:31] HUID of NVDIMM target + * @userdata1[32:47] The retrieved bad flash block percentage, + * if error with, else 0 + * @userdata1[48:63] The maximum percentage of bad flash blocks + * allowed, if bad flash block percentage + * exceeds this maximum, else 0 + * @userdata2[0:31] The retrieved flash error count, + * if error with, else 0 + * @userdata2[32:63] The maximum number of flash errors + * allowed, if flash error exceeds this + * maximum, else 0 + * @devdesc Either the NVDIMM NVM bad flash block + * percentage exceeded the maximum percentage + * allowed or the NVDIMM NVM number of flash + * error exceeds the maximum count allowed + * or both. + * @custdesc NVDIMM NVM health check failed. + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_NVM_HEALTH_CHECK, + NVDIMM_NVM_HEALTH_CHECK_FAILED, + l_badFlashBlockPercentageUserData1, + l_flashErrorCountUserData2, + ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace(NVDIMM_COMP_NAME); + + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didNvmHealthCheckPass = false; + } + else + { + // This NVDIMM passed the NVM health check + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmNvmCheckHealthStatus(): " + "Success: NVDIMM (0x%.8X) passed the NVM health check.", + l_nvDimmHuid); + } // end if ( !l_badFlashBlockPercentageCheckPassed .. else + } // end for (auto const l_nvdimm : i_nvdimmTargetList) + + // Should not have any uncommitted errors + assert(l_err == NULL, "nvDimmNvmCheckHealthStatus() - unexpected " + "uncommitted error found"); + + TRACFCOMP(g_trac_nvdimm,EXIT_MRK"nvDimmNvmCheckHealthStatus(): Returning %s", + l_didNvmHealthCheckPass == true ? 
"true" : "false" ); + + return l_didNvmHealthCheckPass; +} // end nvDimmNvmCheckHealthStatus + +/** + * @brief A wrapper around the call to nvDimmNvmCheckHealthStatus + * + * @see nvDimmNvmCheckHealthStatus for more details + * + * @return false if one or more NVDIMMs fail an NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatusOnSystem() +{ + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatusOnSystem()"); + + // Get the list of NVDIMM Targets from the system + TargetHandleList l_nvDimmTargetList; + nvdimm_getNvdimmList(l_nvDimmTargetList); + + // Return status of doing a check health status + bool l_didNvmHealthCheckPass = nvDimmNvmCheckHealthStatus(l_nvDimmTargetList); + + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmNvmCheckHealthStatusOnSystem(): " + "Returning %s", l_didNvmHealthCheckPass == true ? "true" : "false" ); + + return l_didNvmHealthCheckPass; +} // end nvDimmCheckHealthStatusOnSystem + + } // end NVDIMM namespace |