summaryrefslogtreecommitdiffstats
path: root/src/usr/isteps
diff options
context:
space:
mode:
authorCorey Swenson <cswenson@us.ibm.com>2019-11-04 21:45:12 -0600
committerDaniel M Crowell <dcrowell@us.ibm.com>2019-11-19 23:31:41 -0600
commitd432c4503559abd127ebd170891a50da3ce2683d (patch)
tree65fb2d08b89e496b5b0a182cd55309ed42e72f56 /src/usr/isteps
parente3870cb85913e2a92843e157427f68ef86e566f8 (diff)
downloadtalos-hostboot-d432c4503559abd127ebd170891a50da3ce2683d.tar.gz
talos-hostboot-d432c4503559abd127ebd170891a50da3ce2683d.zip
Fix auto-arming NVDIMMs for ESS systems
ESS requires HB to arm the NVDIMMs during IPL after PM complex is started. Requires moving the nvdimmArm() and associated functions from nvdimm_rt.C to nvdimm.C. Change-Id: Ie7fd08b27ad7959b21086f0f291764c2156e5bd9 CQ:SW479909 CMVC-Prereq:1096403 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/86489 Reviewed-by: Matt Derksen <mderkse1@us.ibm.com> Reviewed-by: Roland Veloz <rveloz@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/isteps')
-rw-r--r--src/usr/isteps/istep21/call_host_runtime_setup.C18
-rw-r--r--src/usr/isteps/nvdimm/nvdimm.C825
-rw-r--r--src/usr/isteps/nvdimm/runtime/nvdimm_rt.C812
3 files changed, 841 insertions, 814 deletions
diff --git a/src/usr/isteps/istep21/call_host_runtime_setup.C b/src/usr/isteps/istep21/call_host_runtime_setup.C
index 55d46e1d3..bc6a9d69e 100644
--- a/src/usr/isteps/istep21/call_host_runtime_setup.C
+++ b/src/usr/isteps/istep21/call_host_runtime_setup.C
@@ -68,6 +68,7 @@
#ifdef CONFIG_NVDIMM
#include "call_nvdimm_update.H"
+#include <isteps/nvdimm/nvdimm.H>
#endif
using namespace ERRORLOG;
@@ -736,6 +737,19 @@ void* call_host_runtime_setup (void *io_pArgs)
errlCommit(l_err, ISTEP_COMP_ID);
pmStartSuccess = false;
}
+ else
+ {
+#ifdef CONFIG_NVDIMM
+ // Arm the nvdimms
+ // Only get here if is_sapphire_load and PM started and have NVDIMMs
+ TARGETING::TargetHandleList l_nvdimmTargetList;
+ NVDIMM::nvdimm_getNvdimmList(l_nvdimmTargetList);
+ if (l_nvdimmTargetList.size() != 0)
+ {
+ NVDIMM::nvdimmArm(l_nvdimmTargetList);
+ }
+#endif
+ }
#ifdef CONFIG_HTMGT
// Report PM status to HTMGT
@@ -767,7 +781,7 @@ void* call_host_runtime_setup (void *io_pArgs)
uint8_t l_skip_fir_attr_reset = 1;
// Since we are not leaving the PM complex alive, we will
// explicitly put it into reset and clean up any memory
- l_err = HBPM::resetPMAll(HBPM::RESET_AND_CLEAR_ATTRIBUTES,
+ l_err = HBPM::resetPMAll(HBPM::RESET_AND_CLEAR_ATTRIBUTES,
l_skip_fir_attr_reset);
if (l_err)
{
@@ -846,7 +860,7 @@ void* call_host_runtime_setup (void *io_pArgs)
break;
}
}
-
+
// Update the MDRT Count and PDA Table Entries from Attribute
TargetService& l_targetService = targetService();
Target* l_sys = nullptr;
diff --git a/src/usr/isteps/nvdimm/nvdimm.C b/src/usr/isteps/nvdimm/nvdimm.C
index a359f3710..3c6c1a055 100644
--- a/src/usr/isteps/nvdimm/nvdimm.C
+++ b/src/usr/isteps/nvdimm/nvdimm.C
@@ -152,6 +152,11 @@ static constexpr uint8_t OPERATION_SLEEP_SECONDS = 0x1;
// Bit mask for checking the fw slot running
static constexpr uint8_t RUNNING_FW_SLOT = 0xF0;
+// NOTE: If the ARM_MAX_RETRY_COUNT is greater than 1 then
+// previous error logs may be lost and not reported
+static constexpr size_t ARM_MAX_RETRY_COUNT = 1;
+static constexpr uint8_t FW_OPS_UPDATE = 0x04;
+
#ifndef __HOSTBOOT_RUNTIME
// Warning thresholds
static constexpr uint8_t THRESHOLD_ES_LIFETIME = 0x07; // 7%
@@ -4726,4 +4731,824 @@ errlHndl_t nvdimmGetRunningSlot(TARGETING::Target *i_nvdimm, uint8_t & o_slot)
return l_err;
}
+/**
+ * @brief This function polls the command status register for arm completion
+ *
+ * @param[in] i_nvdimm - nvdimm target with NV controller
+ *
+ * @param[out] o_poll - total polled time in ms
+ *
+ * @return errlHndl_t - Null if successful, otherwise a pointer to
+ * the error log.
+ */
+errlHndl_t nvdimmPollArmDone(Target* i_nvdimm,
+ uint32_t &o_poll)
+{
+ TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmPollArmDone() nvdimm[%X]", get_huid(i_nvdimm) );
+
+ errlHndl_t l_err = nullptr;
+
+ l_err = nvdimmPollStatus ( i_nvdimm, ARM, o_poll);
+
+ TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollArmDone() nvdimm[%X]",
+ get_huid(i_nvdimm));
+
+ return l_err;
+}
+
+/**
+ * @brief This function checks the arm status register to make sure
+ * the trigger has been armed to ddr_reset_n
+ *
+ * @param[in] i_nvdimm - nvdimm target with NV controller
+ * @param[in] i_arm_timeout - nvdimm local timeout status
+ *
+ * @return errlHndl_t - Null if successful, otherwise a pointer to
+ * the error log.
+ */
+errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm, bool i_arm_timeout)
+{
+ TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckArmSuccess() nvdimm[%X]",
+ get_huid(i_nvdimm));
+
+ errlHndl_t l_err = nullptr;
+ uint8_t l_data = 0;
+
+ l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data);
+
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]"
+ "failed to read arm status reg!",get_huid(i_nvdimm));
+ }
+ else if (((l_data & ARM_ERROR) == ARM_ERROR) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED) || i_arm_timeout)
+ {
+
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]"
+ "failed to arm! ARM status 0x%X ARM timeout %d"
+ ,get_huid(i_nvdimm),l_data,i_arm_timeout);
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_ARM_FAILED
+ *@severity ERRORLOG_SEV_PREDICTIVE
+ *@moduleid NVDIMM_SET_ARM
+ *@userdata1[0:31] Related ops (0xff = NA)
+ *@userdata1[32:63] Target Huid
+ *@userdata2[0:31] ARM Status
+ *@userdata2[32:63] ARM Timeout
+ *@devdesc Encountered error arming the catastrophic save
+ * trigger on NVDIMM. Make sure an energy source
+ * is connected to the NVDIMM and the ES policy
+ * is set properly
+ *@custdesc NVDIMM encountered error arming save trigger
+ */
+ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
+ NVDIMM_SET_ARM,
+ NVDIMM_ARM_FAILED,
+ TWO_UINT32_TO_UINT64(ARM, get_huid(i_nvdimm)),
+ TWO_UINT32_TO_UINT64(l_data, i_arm_timeout),
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace(NVDIMM_COMP_NAME, 256 );
+ nvdimmAddVendorLog(i_nvdimm, l_err);
+
+ // Failure to arm could mean internal NV controller error or
+ // even error on the battery pack. NVDIMM will lose persistency
+ // if failed to arm trigger
+ l_err->addHwCallout( i_nvdimm,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_Fatal);
+ }
+
+ TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckArmSuccess() nvdimm[%X] ret[%X]",
+ get_huid(i_nvdimm), l_data);
+
+ return l_err;
+}
+
+/**
+ * @brief This function performs arm precheck.
+ *
+ * @param[in] i_nvdimm - nvdimm target with NV controller
+ *
+ * @return errlHndl_t - Null if successful, otherwise a pointer to
+ * the error log.
+ */
+errlHndl_t nvdimmArmPreCheck(Target* i_nvdimm)
+{
+ TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArmPreCheck() nvdimm[%X]",
+ get_huid(i_nvdimm));
+
+ errlHndl_t l_err = nullptr;
+ uint8_t l_ready = 0;
+ uint8_t l_fwupdate = 0;
+ uint8_t l_module_health = 0;
+ uint8_t l_continue = true;
+ auto l_RegInfo = nvdimm_reg_t();
+
+ do
+ {
+ // Read out the Module Health status register
+ l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS0, l_module_health);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read Module Health Status",
+ get_huid(i_nvdimm));
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ l_continue = false;
+ break;
+ }
+
+ // Read out the NVDimm Ready register
+ l_err = nvdimmReadReg(i_nvdimm, NVDIMM_READY, l_ready);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read NVDimm Ready register",
+ get_huid(i_nvdimm));
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ l_continue = false;
+ break;
+ }
+
+ // Read out the FW OPs Status register
+ l_err = nvdimmReadReg(i_nvdimm, FIRMWARE_OPS_STATUS, l_fwupdate);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read Firmware OPs Status register",
+ get_huid(i_nvdimm));
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ l_continue = false;
+ }
+
+ }while(0);
+
+ // Check ARM pre-requisites
+ // All nvdimms in i_nvdimmTargetList must pass the pre-req checks
+ // before continuing with arm.
+ if ((!l_continue) || (l_module_health & NVM_LIFETIME_ERROR)
+ || (l_ready != NV_READY)
+ || (l_fwupdate & FW_OPS_UPDATE))
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed NVDimm Arm prechecks",
+ get_huid(i_nvdimm));
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_ARM_PRE_CHECK_FAILED
+ *@severity ERRORLOG_SEV_PREDICTIVE
+ *@moduleid NVDIMM_ARM_PRE_CHECK
+ *@userdata1[0:31] Target Huid
+ *@userdata1[32:39] l_continue
+ *@userdata1[40:47] l_module_health
+ *@userdata1[48:56] l_ready
+ *@userdata1[57:63] l_fwupdate
+ *@userdata2 <UNUSED>
+ *@devdesc NVDIMM failed arm precheck. Refer to FFDC for exact reason
+ *@custdesc NVDIMM failed the arm precheck and is unable to arm
+ */
+ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
+ NVDIMM_ARM_PRE_CHECK,
+ NVDIMM_ARM_PRE_CHECK_FAILED,
+ NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(i_nvdimm),
+ FOUR_UINT8_TO_UINT32(l_continue, l_module_health, l_ready, l_fwupdate)),
+ 0x0,
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace( NVDIMM_COMP_NAME );
+
+ // Callout the dimm
+ l_err->addHwCallout( i_nvdimm,
+ HWAS::SRCI_PRIORITY_LOW,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL);
+
+ // Read relevant regs for trace data
+ nvdimmTraceRegs(i_nvdimm, l_RegInfo);
+ nvdimmAddPage4Regs(i_nvdimm,l_err);
+ nvdimmAddVendorLog(i_nvdimm, l_err);
+
+ // Add reg traces to the error log
+ NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
+
+ }
+
+ TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArmPreCheck() nvdimm[%X]",
+ get_huid(i_nvdimm));
+
+ return l_err;
+}
+
+
+bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
+{
+ bool o_arm_successful = true;
+ bool l_continue = true;
+ bool l_arm_timeout = false;
+ uint8_t l_data;
+ auto l_RegInfo = nvdimm_reg_t();
+ uint64_t l_writeData;
+ uint32_t l_writeAddress;
+ size_t l_writeSize = sizeof(l_writeData);
+
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() numNvdimm[%d]",
+ i_nvdimmTargetList.size());
+
+ errlHndl_t l_err = nullptr;
+ errlHndl_t l_err_t = nullptr;
+
+ // Prerequisite Arm Checks
+ for (auto const l_nvdimm : i_nvdimmTargetList)
+ {
+ l_err = nvdimmArmPreCheck(l_nvdimm);
+
+ // If we are failing the precheck, commit the error then exit
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() failed arm precheck, exiting");
+ errlCommit(l_err, NVDIMM_COMP_ID);
+ return false;
+ }
+ }
+
+ // Encryption unlocked check
+ // Check one nvdimm at a time
+ for (auto const l_nvdimm : i_nvdimmTargetList)
+ {
+ // Unlock function will create an error log
+ // Create another here to make it clear that the arm failed
+ TargetHandleList l_nvdimmTargetList;
+ l_nvdimmTargetList.push_back(l_nvdimm);
+ if (!nvdimm_encrypt_unlock(l_nvdimmTargetList))
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed NVDimm Arm encryption unlock",
+ get_huid(l_nvdimm));
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_ARM_ENCRYPTION_UNLOCK_FAILED
+ *@severity ERRORLOG_SEV_PREDICTIVE
+ *@moduleid NVDIMM_ARM
+ *@userdata1 Target Huid
+ *@userdata2 <UNUSED>
+ *@devdesc NVDIMM failed to unlock encryption during arming
+ *@custdesc NVDIMM failed to ARM
+ */
+ l_err = new ERRORLOG::ErrlEntry(
+ ERRORLOG::ERRL_SEV_PREDICTIVE,
+ NVDIMM_ARM,
+ NVDIMM_ARM_ENCRYPTION_UNLOCK_FAILED,
+ get_huid(l_nvdimm),
+ 0x0,
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace( NVDIMM_COMP_NAME );
+
+ // Callout the dimm
+ l_err->addHwCallout( l_nvdimm,
+ HWAS::SRCI_PRIORITY_MED,
+ HWAS::DELAYED_DECONFIG,
+ HWAS::GARD_NULL);
+
+ // Read relevant regs for trace data
+ nvdimmTraceRegs(l_nvdimm, l_RegInfo);
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+ nvdimmAddVendorLog(l_nvdimm, l_err);
+
+ // Add reg traces to the error log
+ NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
+
+ // Commit the error then exit
+ errlCommit(l_err, NVDIMM_COMP_ID);
+ return false;
+ }
+ }
+
+ // Mask MBACALFIR EventN to separate ARM handling
+ for (TargetHandleList::iterator it = i_nvdimmTargetList.begin();
+ it != i_nvdimmTargetList.end();)
+ {
+ TargetHandleList l_mcaList;
+ getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA);
+ assert(l_mcaList.size(), "nvdimmArm() failed to find parent MCA.");
+
+ l_writeAddress = MBACALFIR_OR_MASK_REG;
+ l_writeData = MBACALFIR_EVENTN_OR_BIT;
+ l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
+ DEVICE_SCOM_ADDRESS(l_writeAddress));
+ if(l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
+ l_writeAddress);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+ it++;
+ }
+
+ for (auto const l_nvdimm : i_nvdimmTargetList)
+ {
+ l_arm_timeout = false;
+
+ // skip if the nvdimm is already armed
+ ATTR_NVDIMM_ARMED_type l_armed_state = {};
+ l_armed_state = l_nvdimm->getAttr<ATTR_NVDIMM_ARMED>();
+ if (l_armed_state.armed)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] called when already armed", get_huid(l_nvdimm));
+ continue;
+ }
+
+ // Set ES Policy, contains all of its status checks
+ l_err = nvdimmSetESPolicy(l_nvdimm);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set ES Policy", get_huid(l_nvdimm));
+ o_arm_successful = false;
+
+ nvdimmDisarm(i_nvdimmTargetList);
+
+ // Committing the error as we don't want this to interrupt
+ // the boot. This will notify the user that action is needed
+ // on this module
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+
+ // Callout the nvdimm on high and gard
+ l_err->addHwCallout( l_nvdimm,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_Fatal);
+
+ errlCommit( l_err, NVDIMM_COMP_ID );
+
+ break;
+ }
+
+ // Clear all status registers in case of leftover bits
+ l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_MGT_CMD0, CLEAR_ALL_STATUS);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - error clearing all status registers",
+ get_huid(l_nvdimm));
+ o_arm_successful = false;
+ break;
+ }
+
+ bool l_is_retryable = true;
+ //continue flag set by the retry loop to continue on the outer loop
+ bool l_continue_arm = false;
+ //break flag set by the retry loop to break on the outer loop
+ bool l_break = false;
+ errlHndl_t l_err_retry = nullptr;
+
+ // Attempt arm multiple times in case of glitches
+ for (size_t l_retry = 0; l_retry <= ARM_MAX_RETRY_COUNT; l_retry++)
+ {
+
+ l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER);
+ // If we run into any error here we will just
+ // commit the error log and move on. Let the
+ // system continue to boot and let the user
+ // salvage the data
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm));
+
+ nvdimmDisarm(i_nvdimmTargetList);
+
+ // Committing the error as we don't want this to interrupt
+ // the boot. This will notify the user that action is needed
+ // on this module
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ o_arm_successful = false;
+
+ // Cause the main loop to skip the rest of the arm procedure
+ // and move to the next target
+ l_continue_arm = true;
+ break;
+ }
+
+ // Arm happens one module at a time. No need to set any offset on the counter
+ uint32_t l_poll = 0;
+ l_err = nvdimmPollArmDone(l_nvdimm, l_poll);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm));
+ l_arm_timeout = true;
+
+ l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
+ if (l_err_t)
+ {
+ errlCommit( l_err_t, NVDIMM_COMP_ID );
+ }
+
+ // Committing the error as we don't want this to interrupt
+ // the boot. This will notify the user that action is needed
+ // on this module
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ o_arm_successful = false;
+ }
+
+ // Pass l_arm_timeout value in for health status check
+ l_continue = l_arm_timeout;
+
+ // Sleep for 1 second before checking the health status
+ // to let the glitches settle in case there were any
+ nanosleep(1, 0);
+
+ // Check health status registers and exit if required
+ l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue );
+
+ // Check for health status failure
+ // Any fail picked up by the health check is a legit fail
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm));
+
+ // The arm timeout variable is used here as the continue variable for the
+ // health status check. This was done to include the timeout for use in the check
+ // If true either the arm timed out with a health status fail or the
+ // health status check failed with another disarm and exit condition
+ if (!l_continue)
+ {
+ errlCommit( l_err, NVDIMM_COMP_ID );
+
+ // Disarming all dimms due to error
+ nvdimmDisarm(i_nvdimmTargetList);
+ o_arm_successful = false;
+
+ // Cause the main loop to exit out of the main arm procedure
+ l_break = true;
+ break;
+ }
+ else
+ {
+ errlCommit( l_err, NVDIMM_COMP_ID );
+
+ // Cause the main loop to skip the rest of the arm procedure
+ // and move to the next target
+ l_continue_arm = true;
+ break;
+ }
+ }
+
+ l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout);
+
+ // At this point we have passed the health check. If the arm were
+ // to fail now, it is likely it was due to some glitch. Let's retry
+ // the arm again as long as the fail is not due to timeout.
+ // A timeout would mean a charging issue, it would have been caught
+ // by the health check.
+ l_is_retryable = !l_arm_timeout && l_retry < ARM_MAX_RETRY_COUNT;
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm. %s retryable.",
+ get_huid(l_nvdimm), l_is_retryable? "IS" : "NOT");
+
+ if (l_is_retryable)
+ {
+ // Save the original error
+ // If a previous error was saved then delete it
+ if (l_err_retry)
+ {
+ delete l_err_retry;
+ }
+ l_err_retry = l_err;
+
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_ARM_RETRY
+ *@severity ERRORLOG_SEV_INFORMATIONAL
+ *@moduleid NVDIMM_ARM_ERASE
+ *@userdata1[0:31] Target Huid
+ *@userdata1[32:39] l_is_retryable
+ *@userdata1[40:47] MAX arm retry count
+ *@userdata2[0:31] Original errlog plid
+ *@userdata2[32:63] Original errlog reason code
+ *@devdesc NVDIMM encountered a glitch causing the initial
+ * arm to fail. System firmware will retry the arm
+ *@custdesc NVDIMM requires an arm retry
+ */
+ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_INFORMATIONAL,
+ NVDIMM_ARM_ERASE,
+ NVDIMM_ARM_RETRY,
+ NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(l_nvdimm),
+ FOUR_UINT8_TO_UINT32(l_is_retryable, ARM_MAX_RETRY_COUNT,0,0)),
+ TWO_UINT32_TO_UINT64(l_err_retry->plid(), l_err_retry->reasonCode()),
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace( NVDIMM_COMP_NAME );
+
+ // Callout the dimm
+ l_err->addHwCallout( l_nvdimm,
+ HWAS::SRCI_PRIORITY_LOW,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL);
+
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+ else
+ {
+ // Handle retryable error
+ if (l_err_retry)
+ {
+ ERRORLOG::ErrlUserDetailsString("Arm RETRY failed").addToLog(l_err_retry);
+
+ // Delete the current errlog and use the original errlog for callout
+ delete l_err;
+ l_err = l_err_retry;
+ l_err_retry = nullptr;
+ }
+
+ // Disarming all dimms due to error
+ nvdimmDisarm(i_nvdimmTargetList);
+
+ l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
+ if (l_err_t)
+ {
+ errlCommit( l_err_t, NVDIMM_COMP_ID );
+ }
+
+ // Committing the error as we don't want this to interrupt
+ // the boot. This will notify the user that action is needed
+ // on this module
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+
+ // Dump Traces for error logs
+ nvdimmTraceRegs( l_nvdimm, l_RegInfo );
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+
+ // Add reg traces to the error log
+ NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
+
+ errlCommit(l_err, NVDIMM_COMP_ID);
+ o_arm_successful = false;
+
+ // Cause the main loop to exit out of the main arm procedure
+ l_break = true;
+ break;
+ }
+ }
+ else
+ {
+ // Arm worked. Exit the retry loop
+ break;
+ } // close nvdimmCheckArmSuccess check
+ } // close arm retry loop
+
+ if (l_continue_arm)
+ {
+ continue;
+ }
+ else if (l_break)
+ {
+ break;
+ }
+
+ // After arming the trigger, erase the image to prevent the possible
+ // stale image getting the restored on the next boot in case of failed
+ // save.
+ l_err = nvdimmEraseNF(l_nvdimm);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to erase post arm", get_huid(l_nvdimm));
+
+ // Disarming all dimms due to error
+ nvdimmDisarm(i_nvdimmTargetList);
+
+ // Committing the error as we don't want this to interrupt
+ // the boot. This will notify the user that action is needed
+ // on this module
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ o_arm_successful = false;
+ break;
+ }
+
+ // Arm successful, update armed status
+ l_err = NVDIMM::notifyNvdimmProtectionChange(l_nvdimm,
+ NVDIMM::NVDIMM_ARMED);
+ if (l_err)
+ {
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit(l_err, NVDIMM_COMP_ID);
+ }
+
+ // Enable Persistency and Warning Threshold notifications
+ l_err = nvdimmWriteReg(l_nvdimm, SET_EVENT_NOTIFICATION_CMD, ENABLE_NOTIFICATIONS);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] setting persistency notification",
+ TARGETING::get_huid(l_nvdimm));
+ break;
+ }
+
+ // Check notification status and errors
+ l_err = nvdimmReadReg(l_nvdimm, SET_EVENT_NOTIFICATION_STATUS, l_data);
+ if (l_err)
+ {
+ break;
+ }
+ else if (((l_data & SET_EVENT_NOTIFICATION_ERROR) == SET_EVENT_NOTIFICATION_ERROR)
+ || ((l_data & NOTIFICATIONS_ENABLED) != NOTIFICATIONS_ENABLED))
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set event notification",
+ get_huid(l_nvdimm));
+
+ // Set NVDIMM Status flag to partial working, as error detected but data might persist
+ notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_RISKY_HW_ERROR);
+
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_SET_EVENT_NOTIFICATION_ERROR
+ *@severity ERRORLOG_SEV_PREDICTIVE
+ *@moduleid NVDIMM_SET_EVENT_NOTIFICATION
+ *@userdata1[0:31] Target Huid
+ *@userdata2 <UNUSED>
+ *@devdesc NVDIMM threw an error or failed to set event
+ * notifications during arming
+ *@custdesc NVDIMM failed to enable event notificaitons
+ */
+ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
+ NVDIMM_SET_EVENT_NOTIFICATION,
+ NVDIMM_SET_EVENT_NOTIFICATION_ERROR,
+ TARGETING::get_huid(l_nvdimm),
+ 0x0,
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace( NVDIMM_COMP_NAME );
+
+ // Callout the dimm
+ l_err->addHwCallout( l_nvdimm,
+ HWAS::SRCI_PRIORITY_LOW,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL);
+
+ // Read relevant regs for trace data
+ nvdimmTraceRegs(l_nvdimm, l_RegInfo);
+ nvdimmAddPage4Regs(l_nvdimm,l_err);
+ nvdimmAddVendorLog(l_nvdimm, l_err);
+
+ // Add reg traces to the error log
+ NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
+
+ errlCommit( l_err, NVDIMM_COMP_ID );
+
+ // We are after the arm step now, so on any error cases let's log it
+ // then move to the next nvdimm
+ continue;
+ }
+
+ // Re-check health status registers
+ l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_POST_ARM, l_continue );
+
+ // Check for health status failure
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed final health status check", get_huid(l_nvdimm));
+
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ continue;
+ }
+
+ }
+
+ // Check for uncommited i2c fail error logs
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "nvdimmArm() failed an i2c read/write");
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ nvdimmDisarm(i_nvdimmTargetList);
+ return false;
+ }
+
+ // Unmask firs if the arm completed successfully
+ if (o_arm_successful)
+ {
+ // Unmask MBACALFIR EventN and set to recoverable
+ for (TargetHandleList::iterator it = i_nvdimmTargetList.begin();
+ it != i_nvdimmTargetList.end();)
+ {
+ TargetHandleList l_mcaList;
+ getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA);
+ assert(l_mcaList.size(), "nvdimmArm() failed to find parent MCA.");
+
+ // Set MBACALFIR_ACTION0 to recoverable
+ l_writeAddress = MBACALFIR_ACTION0_REG;
+ l_writeData = 0;
+ l_err = deviceRead(l_mcaList[0], &l_writeData, l_writeSize,
+ DEVICE_SCOM_ADDRESS(l_writeAddress));
+ if(l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
+ l_writeAddress);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+
+
+ l_writeData &= MBACALFIR_EVENTN_AND_BIT;
+ l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
+ DEVICE_SCOM_ADDRESS(l_writeAddress));
+ if(l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
+ l_writeAddress);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+
+ // Set MBACALFIR_ACTION1 to recoverable
+ l_writeAddress = MBACALFIR_ACTION1_REG;
+ l_writeData = 0;
+ l_err = deviceRead(l_mcaList[0], &l_writeData, l_writeSize,
+ DEVICE_SCOM_ADDRESS(l_writeAddress));
+ if(l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
+ l_writeAddress);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+
+ l_writeData |= MBACALFIR_EVENTN_OR_BIT;
+ l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
+ DEVICE_SCOM_ADDRESS(l_writeAddress));
+ if(l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
+ l_writeAddress);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+
+ // Unmask MBACALFIR[8]
+ l_writeAddress = MBACALFIR_AND_MASK_REG;
+ l_writeData = MBACALFIR_UNMASK_BIT;
+ l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
+ DEVICE_SCOM_ADDRESS(l_writeAddress));
+ if(l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
+ l_writeAddress);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ }
+
+ it++;
+ }
+
+ }
+
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArm() returning %d",
+ o_arm_successful);
+ return o_arm_successful;
+}
+
+bool nvdimmDisarm(TargetHandleList &i_nvdimmTargetList)
+{
+ bool o_disarm_successful = true;
+
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmDisarm() %d",
+ i_nvdimmTargetList.size());
+
+ errlHndl_t l_err = nullptr;
+
+ for (auto const l_nvdimm : i_nvdimmTargetList)
+ {
+ l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER);
+ // If we run into any error here we will just
+ // commit the error log and move on. Let the
+ // system continue to boot and let the user
+ // salvage the data
+ if (l_err)
+ {
+ // Committing the error as we don't want this to interrupt
+ // the boot. This will notify the user that action is needed
+ // on this module
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ o_disarm_successful = false;
+ continue;
+ }
+
+ // Disarm successful, update armed status
+ l_err = NVDIMM::notifyNvdimmProtectionChange(l_nvdimm,
+ NVDIMM::NVDIMM_DISARMED);
+ if (l_err)
+ {
+ l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
+ l_err->collectTrace(NVDIMM_COMP_NAME);
+ errlCommit(l_err, NVDIMM_COMP_ID);
+ }
+ }
+
+ TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmDisarm() returning %d",
+ o_disarm_successful);
+ return o_disarm_successful;
+}
+
+
} // end NVDIMM namespace
diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
index b0e20d46e..32c4dd18f 100644
--- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
+++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
@@ -64,818 +64,6 @@ namespace NVDIMM
static constexpr uint64_t DARN_ERROR_CODE = 0xFFFFFFFFFFFFFFFFull;
static constexpr uint32_t MAX_DARN_ERRORS = 10;
-static constexpr uint8_t FW_OPS_UPDATE = 0x04;
-static constexpr size_t ARM_MAX_RETRY_COUNT = 1;
-/**
- * @brief This function polls the command status register for arm completion
- * (does not indicate success or fail)
- *
- * @param[in] i_nvdimm - nvdimm target with NV controller
- *
- * @param[out] o_poll - total polled time in ms
- *
- * @return errlHndl_t - Null if successful, otherwise a pointer to
- * the error log.
- */
-errlHndl_t nvdimmPollArmDone(Target* i_nvdimm,
- uint32_t &o_poll)
-{
- TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmPollArmDone() nvdimm[%X]", get_huid(i_nvdimm) );
-
- errlHndl_t l_err = nullptr;
-
- l_err = nvdimmPollStatus ( i_nvdimm, ARM, o_poll);
-
- TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmPollArmDone() nvdimm[%X]",
- get_huid(i_nvdimm));
-
- return l_err;
-}
-
-/**
- * @brief This function checks the arm status register to make sure
- * the trigger has been armed to ddr_reset_n
- *
- * @param[in] i_nvdimm - nvdimm target with NV controller
- * @param[in] i_arm_timeout - nvdimm local timeout status
- *
- * @return errlHndl_t - Null if successful, otherwise a pointer to
- * the error log.
- */
-errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm, bool i_arm_timeout)
-{
- TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmCheckArmSuccess() nvdimm[%X]",
- get_huid(i_nvdimm));
-
- errlHndl_t l_err = nullptr;
- uint8_t l_data = 0;
-
- l_err = nvdimmReadReg(i_nvdimm, ARM_STATUS, l_data);
-
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]"
- "failed to read arm status reg!",get_huid(i_nvdimm));
- }
- else if (((l_data & ARM_ERROR) == ARM_ERROR) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED) || i_arm_timeout)
- {
-
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmCheckArmSuccess() nvdimm[%X]"
- "failed to arm!",get_huid(i_nvdimm));
- /*@
- *@errortype
- *@reasoncode NVDIMM_ARM_FAILED
- *@severity ERRORLOG_SEV_PREDICTIVE
- *@moduleid NVDIMM_SET_ARM
- *@userdata1[0:31] Related ops (0xff = NA)
- *@userdata1[32:63] Target Huid
- *@userdata2 <UNUSED>
- *@devdesc Encountered error arming the catastrophic save
- * trigger on NVDIMM. Make sure an energy source
- * is connected to the NVDIMM and the ES policy
- * is set properly
- *@custdesc NVDIMM encountered error arming save trigger
- */
- l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
- NVDIMM_SET_ARM,
- NVDIMM_ARM_FAILED,
- TWO_UINT32_TO_UINT64(ARM, get_huid(i_nvdimm)),
- 0x0,
- ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
-
- l_err->collectTrace(NVDIMM_COMP_NAME, 256 );
- nvdimmAddVendorLog(i_nvdimm, l_err);
-
- // Failure to arm could mean internal NV controller error or
- // even error on the battery pack. NVDIMM will lose persistency
- // if failed to arm trigger
- l_err->addHwCallout( i_nvdimm,
- HWAS::SRCI_PRIORITY_HIGH,
- HWAS::NO_DECONFIG,
- HWAS::GARD_Fatal);
- }
-
- TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmCheckArmSuccess() nvdimm[%X] ret[%X]",
- get_huid(i_nvdimm), l_data);
-
- return l_err;
-}
-
-/**
- * @brief This function performs arm precheck.
- *
- * @param[in] i_nvdimm - nvdimm target with NV controller
- *
- * @return errlHndl_t - Null if successful, otherwise a pointer to
- * the error log.
- */
-errlHndl_t nvdimmArmPreCheck(Target* i_nvdimm)
-{
- TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArmPreCheck() nvdimm[%X]",
- get_huid(i_nvdimm));
-
- errlHndl_t l_err = nullptr;
- uint8_t l_ready = 0;
- uint8_t l_fwupdate = 0;
- uint8_t l_module_health = 0;
- uint8_t l_continue = true;
- auto l_RegInfo = nvdimm_reg_t();
-
- do
- {
- // Read out the Module Health status register
- l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS0, l_module_health);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read Module Health Status",
- get_huid(i_nvdimm));
- errlCommit( l_err, NVDIMM_COMP_ID );
- l_continue = false;
- break;
- }
-
- // Read out the NVDimm Ready register
- l_err = nvdimmReadReg(i_nvdimm, NVDIMM_READY, l_ready);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read NVDimm Ready register",
- get_huid(i_nvdimm));
- errlCommit( l_err, NVDIMM_COMP_ID );
- l_continue = false;
- break;
- }
-
- // Read out the FW OPs Status register
- l_err = nvdimmReadReg(i_nvdimm, FIRMWARE_OPS_STATUS, l_fwupdate);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read Firmware OPs Status register",
- get_huid(i_nvdimm));
- errlCommit( l_err, NVDIMM_COMP_ID );
- l_continue = false;
- }
-
- }while(0);
-
- // Check ARM pre-requisites
- // All nvdimms in i_nvdimmTargetList must pass the pre-req checks
- // before continuing with arm.
- if ((!l_continue) || (l_module_health & NVM_LIFETIME_ERROR)
- || (l_ready != NV_READY)
- || (l_fwupdate & FW_OPS_UPDATE))
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed NVDimm Arm prechecks",
- get_huid(i_nvdimm));
- /*@
- *@errortype
- *@reasoncode NVDIMM_ARM_PRE_CHECK_FAILED
- *@severity ERRORLOG_SEV_PREDICTIVE
- *@moduleid NVDIMM_ARM_PRE_CHECK
- *@userdata1[0:31] Target Huid
- *@userdata1[32:39] l_continue
- *@userdata1[40:47] l_module_health
- *@userdata1[48:56] l_ready
- *@userdata1[57:63] l_fwupdate
- *@userdata2 <UNUSED>
- *@devdesc NVDIMM failed arm precheck. Refer to FFDC for exact reason
- *@custdesc NVDIMM failed the arm precheck and is unable to arm
- */
- l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
- NVDIMM_ARM_PRE_CHECK,
- NVDIMM_ARM_PRE_CHECK_FAILED,
- NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(i_nvdimm),
- FOUR_UINT8_TO_UINT32(l_continue, l_module_health, l_ready, l_fwupdate)),
- 0x0,
- ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
-
- l_err->collectTrace( NVDIMM_COMP_NAME );
-
- // Callout the dimm
- l_err->addHwCallout( i_nvdimm,
- HWAS::SRCI_PRIORITY_LOW,
- HWAS::NO_DECONFIG,
- HWAS::GARD_NULL);
-
- // Read relevant regs for trace data
- nvdimmTraceRegs(i_nvdimm, l_RegInfo);
- nvdimmAddPage4Regs(i_nvdimm,l_err);
- nvdimmAddVendorLog(i_nvdimm, l_err);
-
- // Add reg traces to the error log
- NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
-
- }
-
- TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArmPreCheck() nvdimm[%X]",
- get_huid(i_nvdimm));
-
- return l_err;
-}
-
-bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
-{
- bool o_arm_successful = true;
- bool l_continue = true;
- bool l_arm_timeout = false;
- uint8_t l_data;
- auto l_RegInfo = nvdimm_reg_t();
- uint64_t l_writeData;
- uint32_t l_writeAddress;
- size_t l_writeSize = sizeof(l_writeData);
-
- TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() numNvdimm[%d]",
- i_nvdimmTargetList.size());
-
- errlHndl_t l_err = nullptr;
- errlHndl_t l_err_t = nullptr;
-
- // Prerequisite Arm Checks
- for (auto const l_nvdimm : i_nvdimmTargetList)
- {
- l_err = nvdimmArmPreCheck(l_nvdimm);
-
- // If we are failing the precheck, commit the error then exit
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() failed arm precheck, exiting");
- errlCommit(l_err, NVDIMM_COMP_ID);
- return false;
- }
- }
-
- // Encryption unlocked check
- // Check one nvdimm at a time
- for (auto const l_nvdimm : i_nvdimmTargetList)
- {
- // Unlock function will create an error log
- // Create another here to make it clear that the arm failed
- TargetHandleList l_nvdimmTargetList;
- l_nvdimmTargetList.push_back(l_nvdimm);
- if (!nvdimm_encrypt_unlock(l_nvdimmTargetList))
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed NVDimm Arm encryption unlock",
- get_huid(l_nvdimm));
- /*@
- *@errortype
- *@reasoncode NVDIMM_ARM_ENCRYPTION_UNLOCK_FAILED
- *@severity ERRORLOG_SEV_PREDICTIVE
- *@moduleid NVDIMM_ARM
- *@userdata1 Target Huid
- *@userdata2 <UNUSED>
- *@devdesc NVDIMM failed to unlock encryption during arming
- *@custdesc NVDIMM failed to ARM
- */
- l_err = new ERRORLOG::ErrlEntry(
- ERRORLOG::ERRL_SEV_PREDICTIVE,
- NVDIMM_ARM,
- NVDIMM_ARM_ENCRYPTION_UNLOCK_FAILED,
- get_huid(l_nvdimm),
- 0x0,
- ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
-
- l_err->collectTrace( NVDIMM_COMP_NAME );
-
- // Callout the dimm
- l_err->addHwCallout( l_nvdimm,
- HWAS::SRCI_PRIORITY_MED,
- HWAS::DELAYED_DECONFIG,
- HWAS::GARD_NULL);
-
- // Read relevant regs for trace data
- nvdimmTraceRegs(l_nvdimm, l_RegInfo);
- nvdimmAddPage4Regs(l_nvdimm,l_err);
- nvdimmAddVendorLog(l_nvdimm, l_err);
-
- // Add reg traces to the error log
- NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
-
- // Commit the error then exit
- errlCommit(l_err, NVDIMM_COMP_ID);
- return false;
- }
- }
-
- // Mask MBACALFIR EventN to separate ARM handling
- for (TargetHandleList::iterator it = i_nvdimmTargetList.begin();
- it != i_nvdimmTargetList.end();)
- {
- TargetHandleList l_mcaList;
- getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA);
- assert(l_mcaList.size(), "nvdimmArm() failed to find parent MCA.");
-
- l_writeAddress = MBACALFIR_OR_MASK_REG;
- l_writeData = MBACALFIR_EVENTN_OR_BIT;
- l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
- DEVICE_SCOM_ADDRESS(l_writeAddress));
- if(l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
- l_writeAddress);
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
- it++;
- }
-
- for (auto const l_nvdimm : i_nvdimmTargetList)
- {
- l_arm_timeout = false;
-
- // skip if the nvdimm is already armed
- ATTR_NVDIMM_ARMED_type l_armed_state = {};
- l_armed_state = l_nvdimm->getAttr<ATTR_NVDIMM_ARMED>();
- if (l_armed_state.armed)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] called when already armed", get_huid(l_nvdimm));
- continue;
- }
-
- // Set ES Policy, contains all of its status checks
- l_err = nvdimmSetESPolicy(l_nvdimm);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set ES Policy", get_huid(l_nvdimm));
- o_arm_successful = false;
-
- nvdimmDisarm(i_nvdimmTargetList);
-
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
-
- // Callout the nvdimm on high and gard
- l_err->addHwCallout( l_nvdimm,
- HWAS::SRCI_PRIORITY_HIGH,
- HWAS::NO_DECONFIG,
- HWAS::GARD_Fatal);
-
- errlCommit( l_err, NVDIMM_COMP_ID );
-
- break;
- }
-
- // Clear all status registers in case of leftover bits
- l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_MGT_CMD0, CLEAR_ALL_STATUS);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - error clearing all status registers",
- get_huid(l_nvdimm));
- break;
- }
-
- bool l_is_retryable = true;
- //continue flag set by the retry loop to continue on the outer loop
- bool l_continue_arm = false;
- //break flag set by the retry loop to break on the outer loop
- bool l_break = false;
- errlHndl_t l_err_retry = nullptr;
-
- // Attempt arm multiple times in case of glitches
- for (size_t l_retry = 0; l_retry <= ARM_MAX_RETRY_COUNT; l_retry++)
- {
-
- l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, ARM_TRIGGER);
- // If we run into any error here we will just
- // commit the error log and move on. Let the
- // system continue to boot and let the user
- // salvage the data
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm));
-
- nvdimmDisarm(i_nvdimmTargetList);
-
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
- errlCommit( l_err, NVDIMM_COMP_ID );
- o_arm_successful = false;
-
- // Cause the main loop to skip the rest of the arm procedure
- // and move to the next target
- l_continue_arm = true;
- break;
- }
-
- // Arm happens one module at a time. No need to set any offset on the counter
- uint32_t l_poll = 0;
- l_err = nvdimmPollArmDone(l_nvdimm, l_poll);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] arm command timed out", get_huid(l_nvdimm));
- l_arm_timeout = true;
-
- l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
- if (l_err_t)
- {
- errlCommit( l_err_t, NVDIMM_COMP_ID );
- }
-
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
-
- errlCommit( l_err, NVDIMM_COMP_ID );
- o_arm_successful = false;
- }
-
- // Pass l_arm_timeout value in for health status check
- l_continue = l_arm_timeout;
-
- // Sleep for 1 second before checking the health status
- // to let the glitches settle in case there were any
- nanosleep(1, 0);
-
- // Check health status registers and exit if required
- l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue );
-
- // Check for health status failure
- // Any fail picked up by the health check is a legit fail
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm));
-
- // The arm timeout variable is used here as the continue variable for the
- // health status check. This was done to include the timeout for use in the check
- // If true either the arm timed out with a health status fail or the
- // health status check failed with another disarm and exit condition
- if (!l_continue)
- {
- errlCommit( l_err, NVDIMM_COMP_ID );
-
- // Disarming all dimms due to error
- nvdimmDisarm(i_nvdimmTargetList);
- o_arm_successful = false;
-
- // Cause the main loop to exit out of the main arm procedure
- l_break = true;
- break;
- }
- else
- {
- errlCommit( l_err, NVDIMM_COMP_ID );
-
- // Cause the main loop to skip the rest of the arm procedure
- // and move to the next target
- l_continue_arm = true;
- break;
- }
- }
-
- l_err = nvdimmCheckArmSuccess(l_nvdimm, l_arm_timeout);
-
- // At this point we have passed the health check. If the arm were
- // to fail now, it is likely it was due to some glitch. Let's retry
- // the arm again as long as the fail is not due to timeout.
- // A timeout would mean a charging issue, it would have been caught
- // by the health check.
- l_is_retryable = !l_arm_timeout && l_retry < ARM_MAX_RETRY_COUNT;
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to succesfully arm. %s retryable.",
- get_huid(l_nvdimm), l_is_retryable? "IS" : "NOT");
-
- if (l_is_retryable)
- {
- // Save the original error
- l_err_retry = l_err;
-
- /*@
- *@errortype
- *@reasoncode NVDIMM_ARM_RETRY
- *@severity ERRORLOG_SEV_INFORMATIONAL
- *@moduleid NVDIMM_ARM_ERASE
- *@userdata1[0:31] Target Huid
- *@userdata1[32:39] l_is_retryable
- *@userdata1[40:47] MAX arm retry count
- *@userdata2[0:31] Original errlog plid
- *@userdata2[32:63] Original errlog reason code
- *@devdesc NVDIMM encountered a glitch causing the initial
- * arm to fail. System firmware will retry the arm
- *@custdesc NVDIMM requires an arm retry
- */
- l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_INFORMATIONAL,
- NVDIMM_ARM_ERASE,
- NVDIMM_ARM_RETRY,
- NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(l_nvdimm),
- FOUR_UINT8_TO_UINT32(l_is_retryable, ARM_MAX_RETRY_COUNT,0,0)),
- TWO_UINT32_TO_UINT64(l_err_retry->plid(), l_err_retry->reasonCode()),
- ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
-
- l_err->collectTrace( NVDIMM_COMP_NAME );
-
- // Callout the dimm
- l_err->addHwCallout( l_nvdimm,
- HWAS::SRCI_PRIORITY_LOW,
- HWAS::NO_DECONFIG,
- HWAS::GARD_NULL);
-
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
- else
- {
- // Handle retryable error
- if (l_err_retry)
- {
- ERRORLOG::ErrlUserDetailsString("Arm RETRY failed").addToLog(l_err_retry);
-
- // Delete the current errlog and use the original errlog for callout
- delete l_err;
- l_err = l_err_retry;
- l_err_retry = nullptr;
- }
-
- // Disarming all dimms due to error
- nvdimmDisarm(i_nvdimmTargetList);
-
- l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
- if (l_err_t)
- {
- errlCommit( l_err_t, NVDIMM_COMP_ID );
- }
-
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
-
- // Dump Traces for error logs
- nvdimmTraceRegs( l_nvdimm, l_RegInfo );
- nvdimmAddPage4Regs(l_nvdimm,l_err);
-
- // Add reg traces to the error log
- NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
-
- errlCommit(l_err, NVDIMM_COMP_ID);
- o_arm_successful = false;
-
- // Cause the main loop to exit out of the main arm procedure
- l_break = true;
- break;
- }
- }
- else
- {
- // Arm worked. Exit the retry loop
- break;
- } // close nvdimmCheckArmSuccess check
- } // close arm retry loop
-
- if (l_continue_arm)
- {
- continue;
- }
- else if (l_break)
- {
- break;
- }
-
- // After arming the trigger, erase the image to prevent the possible
- // stale image getting the restored on the next boot in case of failed
- // save.
- l_err = nvdimmEraseNF(l_nvdimm);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to erase post arm", get_huid(l_nvdimm));
-
- // Disarming all dimms due to error
- nvdimmDisarm(i_nvdimmTargetList);
-
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
- errlCommit( l_err, NVDIMM_COMP_ID );
- o_arm_successful = false;
- break;
- }
-
- // Arm successful, update armed status
- l_err = NVDIMM::notifyNvdimmProtectionChange(l_nvdimm,
- NVDIMM::NVDIMM_ARMED);
- if (l_err)
- {
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
- errlCommit(l_err, NVDIMM_COMP_ID);
- }
-
- // Enable Persistency and Warning Threshold notifications
- l_err = nvdimmWriteReg(l_nvdimm, SET_EVENT_NOTIFICATION_CMD, ENABLE_NOTIFICATIONS);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"NDVIMM HUID[%X] setting persistency notification",
- TARGETING::get_huid(l_nvdimm));
- break;
- }
-
- // Check notification status and errors
- l_err = nvdimmReadReg(l_nvdimm, SET_EVENT_NOTIFICATION_STATUS, l_data);
- if (l_err)
- {
- break;
- }
- else if (((l_data & SET_EVENT_NOTIFICATION_ERROR) == SET_EVENT_NOTIFICATION_ERROR)
- || ((l_data & NOTIFICATIONS_ENABLED) != NOTIFICATIONS_ENABLED))
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to set event notification",
- get_huid(l_nvdimm));
-
- // Set NVDIMM Status flag to partial working, as error detected but data might persist
- notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_RISKY_HW_ERROR);
-
- /*@
- *@errortype
- *@reasoncode NVDIMM_SET_EVENT_NOTIFICATION_ERROR
- *@severity ERRORLOG_SEV_PREDICTIVE
- *@moduleid NVDIMM_SET_EVENT_NOTIFICATION
- *@userdata1[0:31] Target Huid
- *@userdata2 <UNUSED>
- *@devdesc NVDIMM threw an error or failed to set event
- * notifications during arming
- *@custdesc NVDIMM failed to enable event notificaitons
- */
- l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
- NVDIMM_SET_EVENT_NOTIFICATION,
- NVDIMM_SET_EVENT_NOTIFICATION_ERROR,
- TARGETING::get_huid(l_nvdimm),
- 0x0,
- ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
-
- l_err->collectTrace( NVDIMM_COMP_NAME );
-
- // Callout the dimm
- l_err->addHwCallout( l_nvdimm,
- HWAS::SRCI_PRIORITY_LOW,
- HWAS::NO_DECONFIG,
- HWAS::GARD_NULL);
-
- // Read relevant regs for trace data
- nvdimmTraceRegs(l_nvdimm, l_RegInfo);
- nvdimmAddPage4Regs(l_nvdimm,l_err);
- nvdimmAddVendorLog(l_nvdimm, l_err);
-
- // Add reg traces to the error log
- NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
-
- errlCommit( l_err, NVDIMM_COMP_ID );
-
- // We are after the arm step now, so on any error cases let's log it
- // then move to the next nvdimm
- continue;
- }
-
- // Re-check health status registers
- l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_POST_ARM, l_continue );
-
- // Check for health status failure
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed final health status check", get_huid(l_nvdimm));
-
- errlCommit( l_err, NVDIMM_COMP_ID );
- continue;
- }
-
- }
-
- // Check for uncommited i2c fail error logs
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "nvdimmArm() failed an i2c read/write");
- errlCommit( l_err, NVDIMM_COMP_ID );
- nvdimmDisarm(i_nvdimmTargetList);
- return false;
- }
-
- // Unmask firs if the arm completed successfully
- if (o_arm_successful)
- {
- // Unmask MBACALFIR EventN and set to recoverable
- for (TargetHandleList::iterator it = i_nvdimmTargetList.begin();
- it != i_nvdimmTargetList.end();)
- {
- TargetHandleList l_mcaList;
- getParentAffinityTargets(l_mcaList, *it, CLASS_UNIT, TYPE_MCA);
- assert(l_mcaList.size(), "nvdimmArm() failed to find parent MCA.");
-
- // Set MBACALFIR_ACTION0 to recoverable
- l_writeAddress = MBACALFIR_ACTION0_REG;
- l_writeData = 0;
- l_err = deviceRead(l_mcaList[0], &l_writeData, l_writeSize,
- DEVICE_SCOM_ADDRESS(l_writeAddress));
- if(l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
- l_writeAddress);
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
-
-
- l_writeData &= MBACALFIR_EVENTN_AND_BIT;
- l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
- DEVICE_SCOM_ADDRESS(l_writeAddress));
- if(l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
- l_writeAddress);
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
-
- // Set MBACALFIR_ACTION1 to recoverable
- l_writeAddress = MBACALFIR_ACTION1_REG;
- l_writeData = 0;
- l_err = deviceRead(l_mcaList[0], &l_writeData, l_writeSize,
- DEVICE_SCOM_ADDRESS(l_writeAddress));
- if(l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
- l_writeAddress);
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
-
- l_writeData |= MBACALFIR_EVENTN_OR_BIT;
- l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
- DEVICE_SCOM_ADDRESS(l_writeAddress));
- if(l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
- l_writeAddress);
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
-
- // Unmask MBACALFIR[8]
- l_writeAddress = MBACALFIR_AND_MASK_REG;
- l_writeData = MBACALFIR_UNMASK_BIT;
- l_err = deviceWrite(l_mcaList[0], &l_writeData, l_writeSize,
- DEVICE_SCOM_ADDRESS(l_writeAddress));
- if(l_err)
- {
- TRACFCOMP(g_trac_nvdimm, "SCOM to address 0x%08x failed",
- l_writeAddress);
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
-
- it++;
- }
-
- }
-
- TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArm() returning %d",
- o_arm_successful);
- return o_arm_successful;
-}
-
-bool nvdimmDisarm(TargetHandleList &i_nvdimmTargetList)
-{
- bool o_disarm_successful = true;
-
- TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmDisarm() %d",
- i_nvdimmTargetList.size());
-
- errlHndl_t l_err = nullptr;
-
- for (auto const l_nvdimm : i_nvdimmTargetList)
- {
- l_err = NVDIMM::nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER);
- // If we run into any error here we will just
- // commit the error log and move on. Let the
- // system continue to boot and let the user
- // salvage the data
- if (l_err)
- {
- // Committing the error as we don't want this to interrupt
- // the boot. This will notify the user that action is needed
- // on this module
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
- errlCommit( l_err, NVDIMM_COMP_ID );
- o_disarm_successful = false;
- continue;
- }
-
- // Disarm successful, update armed status
- l_err = NVDIMM::notifyNvdimmProtectionChange(l_nvdimm,
- NVDIMM::NVDIMM_DISARMED);
- if (l_err)
- {
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
- errlCommit(l_err, NVDIMM_COMP_ID);
- }
- }
-
- TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmDisarm() returning %d",
- o_disarm_successful);
- return o_disarm_successful;
-}
/**
* @brief Check nvdimm error state
OpenPOWER on IntegriCloud