diff options
author | Tsung Yeung <tyeung@us.ibm.com> | 2019-10-02 14:54:22 -0500 |
---|---|---|
committer | Dean Sanner <dsanner@us.ibm.com> | 2019-10-04 07:19:42 -0500 |
commit | 34f119b59886e06b837413527db117887380280c (patch) | |
tree | 8f39b651fc0964cd34222f9f8c812670fbbc88ce /src | |
parent | 3920d160e612ab3e2ca7a586e1a48c39b079bf36 (diff) | |
download | talos-hostboot-34f119b59886e06b837413527db117887380280c.tar.gz talos-hostboot-34f119b59886e06b837413527db117887380280c.zip |
Add nvdimm arm retry logic in case of glitches
Random BPM glitches could cause temporary persistency lost and
failing the arm command. Allow arm retry if glitch only persisted
for a second or less.
Change-Id: I28e65b05e482129f6fea34580064a825923aaaf3
CQ:SW477211
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84668
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Daniel M Crowell <dcrowell@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Dean Sanner <dsanner@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H | 1 | ||||
-rw-r--r-- | src/usr/isteps/nvdimm/runtime/nvdimm_rt.C | 281 |
2 files changed, 194 insertions, 88 deletions
diff --git a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H index 573693512..25e7c27ea 100644 --- a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H +++ b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H @@ -203,6 +203,7 @@ enum nvdimmReasonCode NVDIMM_ERASE_ERROR = NVDIMM_COMP_ID | 0x51, NVDIMM_ARM_PRE_CHECK_FAILED = NVDIMM_COMP_ID | 0x52, NVDIMM_ARM_ENCRYPTION_UNLOCK_FAILED = NVDIMM_COMP_ID | 0x53, + NVDIMM_ARM_RETRY = NVDIMM_COMP_ID | 0x54, }; enum UserDetailsTypes diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C index 4caf2faab..defcdd626 100644 --- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C +++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C @@ -37,6 +37,7 @@ #include <trace/interface.H> #include <errl/errlentry.H> #include <errl/errlmanager.H> +#include <errl/errludstring.H> #include <util/runtime/rt_fwreq_helper.H> #include <targeting/common/attributes.H> #include <targeting/common/commontargeting.H> @@ -50,6 +51,7 @@ #include "../nvdimmErrorLog.H" #include <isteps/nvdimm/nvdimm.H> // implements some of these #include "../nvdimm.H" // for g_trac_nvdimm +#include <sys/time.h> //#define TRACUCOMP(args...) TRACFCOMP(args) #define TRACUCOMP(args...) @@ -63,6 +65,7 @@ namespace NVDIMM static constexpr uint64_t DARN_ERROR_CODE = 0xFFFFFFFFFFFFFFFFull; static constexpr uint32_t MAX_DARN_ERRORS = 10; static constexpr uint8_t FW_OPS_UPDATE = 0x04; +static constexpr size_t ARM_MAX_RETRY_COUNT = 1; /** * @brief This function polls the command status register for arm completion * (does not indicate success or fail) @@ -234,9 +237,8 @@ errlHndl_t nvdimmArmPreCheck(Target* i_nvdimm) *@userdata1[48:56] l_ready *@userdata1[57:63] l_fwupdate *@userdata2 <UNUSED> - *@devdesc NVDIMM threw an error or failed to set event - * notifications during arming - *@custdesc NVDIMM failed to enable event notificaitons + *@devdesc NVDIMM failed arm precheck. Refer to FFDC for exact reason + *@custdesc NVDIMM failed the arm precheck and is unable to arm */ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE, NVDIMM_ARM_PRE_CHECK, @@ -422,112 +424,215 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList) break; } - l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER); - // If we run into any error here we will just - // commit the error log and move on. Let the - // system continue to boot and let the user - // salvage the data - if (l_err) - { - TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm)); - - nvdimmDisarm(i_nvdimmTargetList); - - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME); - errlCommit( l_err, NVDIMM_COMP_ID ); - o_arm_successful = false; - continue; - } - - // Arm happens one module at a time. No need to set any offset on the counter - uint32_t l_poll = 0; - l_err = nvdimmPollArmDone(l_nvdimm, l_poll); - if (l_err) - { - TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm)); - l_arm_timeout = true; - - l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); - if (l_err_t) - { - errlCommit( l_err_t, NVDIMM_COMP_ID ); - } - - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME); + bool l_is_retryable = true; + //continue flag set by the retry loop to continue on the outer loop + bool l_continue_arm = false; + //break flag set by the retry loop to break on the outer loop + bool l_break = false; + errlHndl_t l_err_retry = nullptr; - errlCommit( l_err, NVDIMM_COMP_ID ); - o_arm_successful = false; - } - - // Pass l_arm_timeout value in for health status check - l_continue = l_arm_timeout; - - // Check health status registers and exit if required - l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue ); - - // Check for health status failure - if (l_err) + // Attempt arm multiple times in case of glitches + for (size_t l_retry = 0; l_retry <= ARM_MAX_RETRY_COUNT; l_retry++) { - TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm)); - // The arm timeout variable is used here as the continue variable for the - // health status check. This was done to include the timeout for use in the check - // If true either the arm timed out with a health status fail or the - // health status check failed with another disarm and exit condition - if (!l_continue) + l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER); + // If we run into any error here we will just + // commit the error log and move on. Let the + // system continue to boot and let the user + // salvage the data + if (l_err) { - errlCommit( l_err, NVDIMM_COMP_ID ); + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm)); - // Disarming all dimms due to error nvdimmDisarm(i_nvdimmTargetList); + // Committing the error as we don't want this to interrupt + // the boot. This will notify the user that action is needed + // on this module + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit( l_err, NVDIMM_COMP_ID ); o_arm_successful = false; + + // Cause the main loop to skip the rest of the arm procedure + // and move to the next target + l_continue_arm = true; break; } - else + + // Arm happens one module at a time. No need to set any offset on the counter + uint32_t l_poll = 0; + l_err = nvdimmPollArmDone(l_nvdimm, l_poll); + if (l_err) { + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm)); + l_arm_timeout = true; + + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } + + // Committing the error as we don't want this to interrupt + // the boot. This will notify the user that action is needed + // on this module + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit( l_err, NVDIMM_COMP_ID ); - continue; + o_arm_successful = false; } - } - l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout); - if (l_err) - { - TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm", get_huid(l_nvdimm)); + // Pass l_arm_timeout value in for health status check + l_continue = l_arm_timeout; - // Disarming all dimms due to error - nvdimmDisarm(i_nvdimmTargetList); + // Sleep for 1 second before checking the health status + // to let the glitches settle in case there were any + nanosleep(1, 0); + + // Check health status registers and exit if required + l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue ); - l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); - if (l_err_t) + // Check for health status failure + // Any fail picked up by the health check is a legit fail + if (l_err) { - errlCommit( l_err_t, NVDIMM_COMP_ID ); + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm)); + + // The arm timeout variable is used here as the continue variable for the + // health status check. This was done to include the timeout for use in the check + // If true either the arm timed out with a health status fail or the + // health status check failed with another disarm and exit condition + if (!l_continue) + { + errlCommit( l_err, NVDIMM_COMP_ID ); + + // Disarming all dimms due to error + nvdimmDisarm(i_nvdimmTargetList); + o_arm_successful = false; + + // Cause the main loop to exit out of the main arm procedure + l_break = true; + break; + } + else + { + errlCommit( l_err, NVDIMM_COMP_ID ); + + // Cause the main loop to skip the rest of the arm procedure + // and move to the next target + l_continue_arm = true; + break; + } } - // Committing the error as we don't want this to interrupt - // the boot. This will notify the user that action is needed - // on this module - l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); - l_err->collectTrace(NVDIMM_COMP_NAME); - - // Dump Traces for error logs - nvdimmTraceRegs( l_nvdimm, l_RegInfo ); - nvdimmAddPage4Regs(l_nvdimm,l_err); + l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout); - // Add reg traces to the error log - NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + // At this point we have passed the health check. If the arm were + // to fail now, it is likely it was due to some glitch. Let's retry + // the arm again as long as the fail is not due to timeout. + // A timeout would mean a charging issue, it would have been caught + // by the health check. + l_is_retryable = !l_arm_timeout && l_retry < ARM_MAX_RETRY_COUNT; + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm. %s retryable.", + get_huid(l_nvdimm), l_is_retryable? "IS" : "NOT"); + + if (l_is_retryable) + { + // Save the original error + l_err_retry = l_err; + + /*@ + *@errortype + *@reasoncode NVDIMM_ARM_RETRY + *@severity ERRORLOG_SEV_INFORMATIONAL + *@moduleid NVDIMM_ARM_ERASE + *@userdata1[0:31] Target Huid + *@userdata1[32:39] l_is_retryable + *@userdata1[40:47] MAX arm retry count + *@userdata2[0:31] Original errlog plid + *@userdata2[32:63] Original errlog reason code + *@devdesc NVDIMM encountered a glitch causing the initial + * arm to fail. System firmware will retry the arm + *@custdesc NVDIMM requires an arm retry + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_INFORMATIONAL, + NVDIMM_ARM_ERASE, + NVDIMM_ARM_RETRY, + NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(l_nvdimm), + FOUR_UINT8_TO_UINT32(l_is_retryable, ARM_MAX_RETRY_COUNT,0,0)), + TWO_UINT32_TO_UINT64(l_err_retry->plid(), l_err_retry->reasonCode()), + ERRORLOG::ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace( NVDIMM_COMP_NAME ); + + // Callout the dimm + l_err->addHwCallout( l_nvdimm, + HWAS::SRCI_PRIORITY_LOW, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL); + + errlCommit( l_err, NVDIMM_COMP_ID ); + } + else + { + // Handle retryable error + if (l_err_retry) + { + ERRORLOG::ErrlUserDetailsString("Arm RETRY failed").addToLog(l_err_retry); + + // Delete the current errlog and use the original errlog for callout + delete l_err; + l_err = l_err_retry; + l_err_retry = nullptr; + } + + // Disarming all dimms due to error + nvdimmDisarm(i_nvdimmTargetList); + + l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED); + if (l_err_t) + { + errlCommit( l_err_t, NVDIMM_COMP_ID ); + } + + // Committing the error as we don't want this to interrupt + // the boot. This will notify the user that action is needed + // on this module + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + + // Dump Traces for error logs + nvdimmTraceRegs( l_nvdimm, l_RegInfo ); + nvdimmAddPage4Regs(l_nvdimm,l_err); + + // Add reg traces to the error log + NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err); + + errlCommit(l_err, NVDIMM_COMP_ID); + o_arm_successful = false; + + // Cause the main loop to exit out of the main arm procedure + l_break = true; + break; + } + } + else + { + // Arm worked. Exit the retry loop + break; + } // close nvdimmCheckArmSuccess check + } // close arm retry loop - errlCommit(l_err, NVDIMM_COMP_ID); - o_arm_successful = false; + if (l_continue_arm) + { + continue; + } + else if (l_break) + { break; } |