summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorMATTHEW I. HICKMAN <matthew.hickman@ibm.com>2019-09-20 14:30:55 -0500
committerDaniel M Crowell <dcrowell@us.ibm.com>2019-09-27 13:50:01 -0500
commit7b87226522ddaf755b241c8352f5c7e9ed273741 (patch)
tree079b1d7e3d31ee97f7ad7bc14538e7fc9690a786 /src
parent5db287b6e70d353b32c90ff0209be10ba5491f0d (diff)
downloadtalos-hostboot-7b87226522ddaf755b241c8352f5c7e9ed273741.tar.gz
talos-hostboot-7b87226522ddaf755b241c8352f5c7e9ed273741.zip
Fixed error handling issues in arm path
Addresses the following: 1. nvdimms not getting disarmed in error cases 2. nvdimms getting garded on BPM-specific errors Change-Id: Ibf9b391cb94c1dd247406f960298a1e55ebf186f CQ:SW475860 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84061 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r--src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H2
-rw-r--r--src/usr/isteps/nvdimm/nvdimm.C6
-rw-r--r--src/usr/isteps/nvdimm/nvdimm.H8
-rw-r--r--src/usr/isteps/nvdimm/nvdimmErrorLog.C40
-rw-r--r--src/usr/isteps/nvdimm/runtime/nvdimm_rt.C220
5 files changed, 155 insertions, 121 deletions
diff --git a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H
index 720c6dc91..6b2909899 100644
--- a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H
+++ b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H
@@ -107,6 +107,7 @@ enum nvdimmModuleId
NVDIMM_WAIT_OPER_OPS_COMPLETE = 0x39,
NVDIMM_COMPARE_CKSUM = 0x3A,
NVDIMM_CHECK_FW_SLOT = 0x3B,
+ NVDIMM_ARM_PRE_CHECK = 0x3C,
};
/**
@@ -199,6 +200,7 @@ enum nvdimmReasonCode
NVDIMM_VENDOR_LOG_CKSUM_FAILED = NVDIMM_COMP_ID | 0x4F, // Vendor log for FFDC checksum fail
NVDIMM_INVALID_FW_SLOT = NVDIMM_COMP_ID | 0x50,
NVDIMM_ERASE_ERROR = NVDIMM_COMP_ID | 0x51,
+ NVDIMM_ARM_PRE_CHECK_FAILED = NVDIMM_COMP_ID | 0x52,
};
enum UserDetailsTypes
diff --git a/src/usr/isteps/nvdimm/nvdimm.C b/src/usr/isteps/nvdimm/nvdimm.C
index 3f1962e77..bf2af1cde 100644
--- a/src/usr/isteps/nvdimm/nvdimm.C
+++ b/src/usr/isteps/nvdimm/nvdimm.C
@@ -71,12 +71,6 @@ TRAC_INIT(&g_trac_nvdimm, NVDIMM_COMP_NAME, 2*KILOBYTE);
namespace NVDIMM
{
#define NUM_OFFSET 2
-#define NVDIMM_SET_USER_DATA_1(left_32_ops_id, right_32_huid) \
- TWO_UINT32_TO_UINT64(left_32_ops_id, right_32_huid)
-
-#define NVDIMM_SET_USER_DATA_2_TIMEOUT(left_32_polled, right_32_timeout) \
- NVDIMM_SET_USER_DATA_1(left_32_polled, right_32_timeout)
-
typedef struct ops_timeoutInfo{
const char * desc;
diff --git a/src/usr/isteps/nvdimm/nvdimm.H b/src/usr/isteps/nvdimm/nvdimm.H
index 58bb3c542..8645d6353 100644
--- a/src/usr/isteps/nvdimm/nvdimm.H
+++ b/src/usr/isteps/nvdimm/nvdimm.H
@@ -41,6 +41,12 @@ extern trace_desc_t* g_trac_nvdimm;
namespace NVDIMM
{
+#define NVDIMM_SET_USER_DATA_1(left_32_ops_id, right_32_huid) \
+ TWO_UINT32_TO_UINT64(left_32_ops_id, right_32_huid)
+
+#define NVDIMM_SET_USER_DATA_2_TIMEOUT(left_32_polled, right_32_timeout) \
+ NVDIMM_SET_USER_DATA_1(left_32_polled, right_32_timeout)
+
// I2C registers for page 0-3, extracted from JEDEC BAEBI spec
// Refer to BAEBI spec for details
@@ -369,7 +375,7 @@ enum i2c_out_values : uint8_t
RSTR_ERROR = 0x02,
SAVE_ERROR = 0x02,
ERASE_ERROR = 0x02,
- ARM_CLEAR_ALL = 0x3C,
+ CLEAR_ALL_STATUS = 0x3C, //Clears CAVE, RESTORE, ERASE, and ARM status regs
};
// Timeout-related enum
diff --git a/src/usr/isteps/nvdimm/nvdimmErrorLog.C b/src/usr/isteps/nvdimm/nvdimmErrorLog.C
index 305d61a6e..9fbd27d14 100644
--- a/src/usr/isteps/nvdimm/nvdimmErrorLog.C
+++ b/src/usr/isteps/nvdimm/nvdimmErrorLog.C
@@ -384,11 +384,11 @@ bool nvdimmCalloutDimm(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
}
else if (((l_data & ARM_SUCCESS) != ARM_SUCCESS) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED))
{
- l_continue = true;
+ l_continue = false;
}
// Check arm status and set dimm status accordingly
- if(!l_continue)
+ if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
notifyNvdimmProtectionChange(i_nvdimm,NVDIMM_RISKY_HW_ERROR);
@@ -549,7 +549,7 @@ bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
}
else if (((l_data & ARM_SUCCESS) != ARM_SUCCESS) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED))
{
- l_continue = true;
+ l_continue = false;
}
// Callout BPM and Cable but cannot deconfig or gard
@@ -561,31 +561,17 @@ bool nvdimmBPMCableCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::SRCI_PRIORITY_HIGH);
// Check arm status and set dimm status accordingly
- if(!l_continue)
+ if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
notifyNvdimmProtectionChange(i_nvdimm,NVDIMM_RISKY_HW_ERROR);
-
- // Callout dimm but do not deconfig or gard
- o_err->addHwCallout( i_nvdimm,
- HWAS::SRCI_PRIORITY_LOW,
- HWAS::NO_DECONFIG,
- HWAS::GARD_NULL);
- }
- else
- {
- // Set ATTR_NV_STATUS_FLAG to dimm diarmed
- l_err = notifyNvdimmProtectionChange(i_nvdimm, NVDIMM_DISARMED);
- if (l_err)
- {
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
- // Callout dimm, deconfig and gard
- o_err->addHwCallout( i_nvdimm,
- HWAS::SRCI_PRIORITY_HIGH,
- HWAS::DECONFIG,
- HWAS::GARD_Fatal);
}
+
+ // Callout dimm but do not deconfig or gard
+ o_err->addHwCallout( i_nvdimm,
+ HWAS::SRCI_PRIORITY_LOW,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL);
break;
}
@@ -687,7 +673,7 @@ bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
}
else if (((l_data & ARM_SUCCESS) != ARM_SUCCESS) || ((l_data & RESET_N_ARMED) != RESET_N_ARMED))
{
- l_continue = true;
+ l_continue = false;
}
// Callout BPM on high
@@ -702,7 +688,7 @@ bool nvdimmBPMCallout(Target *i_nvdimm, uint8_t i_step, errlHndl_t& o_err)
HWAS::GARD_NULL);
// Check arm status and set dimm status accordingly
- if(!l_continue)
+ if(l_continue)
{
// Set ATTR_NV_STATUS_FLAG to partially working as data may still persist
notifyNvdimmProtectionChange(i_nvdimm,NVDIMM_RISKY_HW_ERROR);
@@ -1303,7 +1289,7 @@ errlHndl_t nvdimmHealthStatusCheck(Target *i_nvdimm, uint8_t i_step, bool& o_con
TARGETING::get_huid(i_nvdimm),
0x0,
ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
- o_continue = true;
+ o_continue = false;
// Callout dimm but no deconfig and gard
l_err_t->addHwCallout( i_nvdimm,
HWAS::SRCI_PRIORITY_LOW,
diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
index 6ff35cf6b..dfe84e3ff 100644
--- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
+++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C
@@ -158,20 +158,130 @@ errlHndl_t nvdimmCheckArmSuccess(Target *i_nvdimm, bool i_arm_timeout)
return l_err;
}
+/**
+ * @brief This function performs arm precheck.
+ *
+ * @param[in] i_nvdimm - nvdimm target with NV controller
+ *
+ * @return errlHndl_t - Null if successful, otherwise a pointer to
+ * the error log.
+ */
+errlHndl_t nvdimmArmPreCheck(Target* i_nvdimm)
+{
+ TRACUCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArmPreCheck() nvdimm[%X]",
+ get_huid(i_nvdimm));
+
+ errlHndl_t l_err = nullptr;
+ uint8_t l_ready = 0;
+ uint8_t l_fwupdate = 0;
+ uint8_t l_module_health = 0;
+ uint8_t l_continue = true;
+ auto l_RegInfo = nvdimm_reg_t();
+
+ do
+ {
+ // Read out the Module Health status register
+ l_err = nvdimmReadReg(i_nvdimm, MODULE_HEALTH_STATUS0, l_module_health);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read Module Health Status",
+ get_huid(i_nvdimm));
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ l_continue = false;
+ break;
+ }
+
+ // Read out the NVDimm Ready register
+ l_err = nvdimmReadReg(i_nvdimm, NVDIMM_READY, l_ready);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read NVDimm Ready register",
+ get_huid(i_nvdimm));
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ l_continue = false;
+ break;
+ }
+
+ // Read out the FW OPs Status register
+ l_err = nvdimmReadReg(i_nvdimm, FIRMWARE_OPS_STATUS, l_fwupdate);
+ if (l_err)
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed to read Firmware OPs Status register",
+ get_huid(i_nvdimm));
+ errlCommit( l_err, NVDIMM_COMP_ID );
+ l_continue = false;
+ }
+
+ }while(0);
+
+ // Check ARM pre-requisites
+ // All nvdimms in i_nvdimmTargetList must pass the pre-req checks
+ // before continuing with arm.
+ if ((!l_continue) || (l_module_health & NVM_LIFETIME_ERROR)
+ || (l_ready != NV_READY)
+ || (l_fwupdate & FW_OPS_UPDATE))
+ {
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArmPreCheck() nvdimm[%X] - failed NVDimm Arm prechecks",
+ get_huid(i_nvdimm));
+ /*@
+ *@errortype
+ *@reasoncode NVDIMM_ARM_PRE_CHECK_FAILED
+ *@severity ERRORLOG_SEV_PREDICTIVE
+ *@moduleid NVDIMM_ARM_PRE_CHECK
+ *@userdata1[0:31] Target Huid
+ *@userdata1[32:39] l_continue
+ *@userdata1[40:47] l_module_health
+ *@userdata1[48:56] l_ready
+ *@userdata1[57:63] l_fwuupdate
+ *@userdata2 <UNUSED>
+ *@devdesc NVDIMM threw an error or failed to set event
+ * notifications during arming
+ *@custdesc NVDIMM failed to enable event notificaitons
+ */
+ l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_PREDICTIVE,
+ NVDIMM_ARM_PRE_CHECK,
+ NVDIMM_ARM_PRE_CHECK_FAILED,
+ NVDIMM_SET_USER_DATA_1(TARGETING::get_huid(i_nvdimm),
+ FOUR_UINT8_TO_UINT32(l_continue, l_module_health, l_ready, l_fwupdate)),
+ 0x0,
+ ERRORLOG::ErrlEntry::NO_SW_CALLOUT );
+
+ l_err->collectTrace( NVDIMM_COMP_NAME );
+
+ // Callout the dimm
+ l_err->addHwCallout( i_nvdimm,
+ HWAS::SRCI_PRIORITY_LOW,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL);
+
+ // Read relevant regs for trace data
+ nvdimmTraceRegs(i_nvdimm, l_RegInfo);
+ nvdimmAddPage4Regs(i_nvdimm,l_err);
+ nvdimmAddVendorLog(i_nvdimm, l_err);
+
+ // Add reg traces to the error log
+ NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
+
+ }
+
+ TRACUCOMP(g_trac_nvdimm, EXIT_MRK"nvdimmArmPreCheck() nvdimm[%X]",
+ get_huid(i_nvdimm));
+
+ return l_err;
+}
+
bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
{
bool o_arm_successful = true;
bool l_continue = true;
bool l_arm_timeout = false;
uint8_t l_data;
- uint8_t l_ready;
- uint8_t l_fwupdate;
auto l_RegInfo = nvdimm_reg_t();
uint64_t l_writeData;
uint32_t l_writeAddress;
size_t l_writeSize = sizeof(l_writeData);
- TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() %d",
+ TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvdimmArm() numNvdimm[%d]",
i_nvdimmTargetList.size());
errlHndl_t l_err = nullptr;
@@ -180,57 +290,13 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
// Prerequisite Arm Checks
for (auto const l_nvdimm : i_nvdimmTargetList)
{
- do
- {
- // Read out the Module Health status register
- l_err = nvdimmReadReg(l_nvdimm, MODULE_HEALTH_STATUS0, l_data);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed to read Module Health Status",
- get_huid(l_nvdimm));
- errlCommit( l_err, NVDIMM_COMP_ID );
- l_continue = false;
- break;
- }
- // Read out the NVDimm Ready register
- l_err = nvdimmReadReg(l_nvdimm, NVDIMM_READY, l_ready);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed to read NVDimm Ready register",
- get_huid(l_nvdimm));
- errlCommit( l_err, NVDIMM_COMP_ID );
- l_continue = false;
- break;
- }
- // Read out the FW OPs Status register
- l_err = nvdimmReadReg(l_nvdimm, FIRMWARE_OPS_STATUS, l_fwupdate);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed to read Firmware OPs Status register",
- get_huid(l_nvdimm));
- errlCommit( l_err, NVDIMM_COMP_ID );
- l_continue = false;
- }
-
- }while(0);
+ l_err = nvdimmArmPreCheck(l_nvdimm);
- // Check ARM pre-requisites
- if ((!l_continue) || (l_data & NVM_LIFETIME_ERROR)
- || (l_ready != NV_READY)
- || (l_fwupdate & FW_OPS_UPDATE))
+ // If we are failing the precheck, commit the error then exit
+ if (l_err)
{
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - failed NVDimm Arm prechecks",
- get_huid(l_nvdimm));
-
- // Disarming all dimms due to error
- nvdimmDisarm(i_nvdimmTargetList);
-
- l_err = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
- if (l_err)
- {
- errlCommit( l_err, NVDIMM_COMP_ID );
- }
-
+ TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() failed arm precheck, exiting");
+ errlCommit(l_err, NVDIMM_COMP_ID);
return false;
}
}
@@ -277,11 +343,6 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
o_arm_successful = false;
nvdimmDisarm(i_nvdimmTargetList);
- l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
- if (l_err_t)
- {
- errlCommit( l_err_t, NVDIMM_COMP_ID );
- }
// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
@@ -296,11 +357,12 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
HWAS::GARD_Fatal);
errlCommit( l_err, NVDIMM_COMP_ID );
+
break;
}
- // Clear all CMD error status registers
- l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_MGT_CMD0, ARM_CLEAR_ALL);
+ // Clear all status registers in case of leftover bits
+ l_err = nvdimmWriteReg(l_nvdimm, NVDIMM_MGT_CMD0, CLEAR_ALL_STATUS);
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X] - error clearing all status registers",
@@ -317,11 +379,7 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed to trigger arm", get_huid(l_nvdimm));
- l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
- if (l_err_t)
- {
- errlCommit( l_err_t, NVDIMM_COMP_ID );
- }
+ nvdimmDisarm(i_nvdimmTargetList);
// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
@@ -357,19 +415,22 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
o_arm_successful = false;
}
+ // Pass l_arm_timeout value in for health status check
+ l_continue = l_arm_timeout;
+
// Check health status registers and exit if required
- l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_arm_timeout );
+ l_err = nvdimmHealthStatusCheck( l_nvdimm, HEALTH_PRE_ARM, l_continue );
// Check for health status failure
if (l_err)
{
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed first health status check", get_huid(l_nvdimm));
-
+
// The arm timeout variable is used here as the continue variable for the
// health status check. This was done to include the timeout for use in the check
// If true either the arm timed out with a health status fail or the
// health status check failed with another disarm and exit condition
- if (l_arm_timeout)
+ if (!l_continue)
{
errlCommit( l_err, NVDIMM_COMP_ID );
@@ -429,12 +490,6 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
// Disarming all dimms due to error
nvdimmDisarm(i_nvdimmTargetList);
- l_err_t = notifyNvdimmProtectionChange(l_nvdimm, NVDIMM_DISARMED);
- if (l_err_t)
- {
- errlCommit( l_err_t, NVDIMM_COMP_ID );
- }
-
// Committing the error as we don't want this to interrupt
// the boot. This will notify the user that action is needed
// on this module
@@ -442,17 +497,6 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
l_err->collectTrace(NVDIMM_COMP_NAME);
errlCommit( l_err, NVDIMM_COMP_ID );
o_arm_successful = false;
-
- // If the erase failed let's disarm the trigger
- l_err = nvdimmChangeArmState(l_nvdimm, DISARM_TRIGGER);
- if (l_err)
- {
- TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvdimmArm() nvdimm[%X], error disarming the nvdimm!",
- get_huid(l_nvdimm));
- l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE);
- l_err->collectTrace(NVDIMM_COMP_NAME);
- errlCommit(l_err, NVDIMM_COMP_ID);
- }
break;
}
@@ -525,7 +569,10 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
NVDIMM::UdNvdimmOPParms( l_RegInfo ).addToLog(l_err);
errlCommit( l_err, NVDIMM_COMP_ID );
- break;
+
+ // We are after the arm step now, so on any error cases let's log it
+ // then move to the next nvdimm
+ continue;
}
// Re-check health status registers
@@ -537,8 +584,7 @@ bool nvdimmArm(TargetHandleList &i_nvdimmTargetList)
TRACFCOMP(g_trac_nvdimm, "nvdimmArm() nvdimm[%X] failed final health status check", get_huid(l_nvdimm));
errlCommit( l_err, NVDIMM_COMP_ID );
- o_arm_successful = false;
- break;
+ continue;
}
}
OpenPOWER on IntegriCloud