diff options
-rw-r--r-- | src/include/runtime/interface.h | 8 | ||||
-rw-r--r-- | src/include/usr/isteps/nvdimm/nvdimm.H | 51 | ||||
-rw-r--r-- | src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H | 158 | ||||
-rw-r--r-- | src/usr/isteps/nvdimm/nvdimm.H | 5 | ||||
-rw-r--r-- | src/usr/isteps/nvdimm/runtime/nvdimm_rt.C | 708 | ||||
-rw-r--r-- | src/usr/util/runtime/rt_cmds.C | 74 | ||||
-rw-r--r-- | src/usr/util/runtime/rt_fwnotify.C | 26 |
7 files changed, 807 insertions, 223 deletions
diff --git a/src/include/runtime/interface.h b/src/include/runtime/interface.h index 152226d54..638bf8b89 100644 --- a/src/include/runtime/interface.h +++ b/src/include/runtime/interface.h @@ -593,17 +593,19 @@ typedef struct hostInterfaces // Arm the NV logic HBRT_FW_NVDIMM_ARM = 0x0010, - /// The following operation pertains to the Health of the NVDIMM + /// The following operations pertain to the Health of the NVDIMM /// This operation can be performed with the arming/disarming /// operation, these operation types are orthogonal to each other - // Manufacturing energy source(ES) health check request + // Manufacturing(MNFG) energy source(ES) health check request HBRT_FW_MNFG_ES_HEALTH_CHECK = 0x0020, + // Manufacturing(MNFG) non-volatile memory(NVM) health check request + HBRT_FW_MNFG_NVM_HEALTH_CHECK = 0x0040 }; // NVDIMM (PHYP -> HBRT) message to request NVDIMM operation(s) struct nvdimm_operation_t { - uint64_t procId; // Retrieve all NVDIMMs under the processor ID, all + uint64_t procId; // Retrieve all NVDIMMs under the processor ID; all // FFs (HBRT_NVDIMM_OPERATION_APPLY_TO_ALL_NVDIMMS) // means operate on all NVDIMMs in the system uint32_t rsvd1; // reserved diff --git a/src/include/usr/isteps/nvdimm/nvdimm.H b/src/include/usr/isteps/nvdimm/nvdimm.H index 4f7804f3e..9d5e3c0e0 100644 --- a/src/include/usr/isteps/nvdimm/nvdimm.H +++ b/src/include/usr/isteps/nvdimm/nvdimm.H @@ -206,7 +206,8 @@ bool nvdimmArm(TARGETING::TargetHandleList &i_nvdimmTargetList); bool nvdimmDisarm(TARGETING::TargetHandleList &i_nvdimmTargetList); /** - * @brief Check the health status of the individual NVDIMMs supplied in list + * @brief Check the ES (energy source)/backup power module(BPM) health status of + * the individual NVDIMMs supplied in list * * @details The BPM will trigger the health check when power is applied at the * beginning of the IPL, with results ready to check about 20 mins @@ -219,25 +220,57 @@ bool nvdimmDisarm(TARGETING::TargetHandleList
&i_nvdimmTargetList); * Bit 1 : Health Check Succeeded * Bit 2 : Health Check Failed * - * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of * - * @return false if one or more NVDIMMs fail health check, else true + * @return false if one or more NVDIMMs fail ES health check, else true */ -bool nvDimmCheckHealthStatus(TARGETING::TargetHandleList &i_nvdimmTargetList); +bool nvDimmEsCheckHealthStatus(const TARGETING::TargetHandleList + &i_nvdimmTargetList); /** - * @brief A wrapper around the call to nvDimmCheckHealthStatus + * @brief A wrapper around the call to nvDimmEsCheckHealthStatus * * @details This will aggregate all the NVDIMMs of the system and pass - * them to the call nvDimmCheckHealthStatus + * them to the call nvDimmEsCheckHealthStatus * - * @see nvDimmCheckHealthStatus for more details + * @see nvDimmEsCheckHealthStatus for more details * - * @return false if one or more NVDIMMs fail health check, else true + * @return false if one or more NVDIMMs fail an ES health check, else true */ -bool nvDimmCheckHealthStatusOnSystem(); +bool nvDimmEsCheckHealthStatusOnSystem(); +/** + * @brief Check the NVM (non-volatile memory)/flash health status of the + * individual NVDIMMs supplied in list. + * + * @details This method will check the flash error count registers + * (FLASH_ERROR_COUNT0 to FLASH_ERROR_COUNT2) to determine if the + * number of flash errors exceeds the maximum allowed. Will also check + * the flash bad block percentage register (FLASH_BAD_BLK_PCT) to + * determine if the percentage exceeds the maximum allowed. + * If any one of these or both of these fail their respective + * maximums then a callout will be made with either or both failures.
+ * + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the NVM health of + * + * @return false if one or more NVDIMMs fail NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatus(const TARGETING::TargetHandleList + &i_nvdimmTargetList); + +/** + * @brief A wrapper around the call to nvDimmNvmCheckHealthStatus + * + * @details This will aggregate all the NVDIMMs of the system and pass + * them to the call nvDimmNvmCheckHealthStatus + * + * @see nvDimmNvmCheckHealthStatus for more details + * + * @return false if one or more NVDIMMs fail an NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatusOnSystem(); + #endif /** diff --git a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H index da69fb86e..ad5c6be50 100644 --- a/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H +++ b/src/include/usr/isteps/nvdimm/nvdimmreasoncodes.H @@ -98,11 +98,12 @@ enum nvdimmModuleId SET_ATTR_NVDIMM_ENCRYPTION_KEYS_FW = 0x30, SEND_ATTR_NVDIMM_ARMED = 0x31, NVDIMM_FACTORY_RESET = 0x32, - NVDIMM_HEALTH_CHECK = 0x33, + NVDIMM_ES_HEALTH_CHECK = 0x33, // Health check on the ES (energy source)/backup power module NVDIMM_CHECK_RESETN = 0x34, NVDIMM_CHECK_CSAVE = 0x35, NVDIMM_MODULE_HEALTH_STATUS_CHECK = 0x36, NVDIMM_SET_EVENT_NOTIFICATION = 0x37, + NVDIMM_NVM_HEALTH_CHECK = 0x38, // Health check on the NVM (non-volatile memory)/flash }; /** @@ -113,83 +114,84 @@ enum nvdimmModuleId */ enum nvdimmReasonCode { - NVDIMM_INVALID_REASONCODE = NVDIMM_COMP_ID | 0x00, // Invalid Reasoncode - NVDIMM_INVALID_OPERATION = NVDIMM_COMP_ID | 0x01, - NVDIMM_INVALID_DEVICE_TYPE = NVDIMM_COMP_ID | 0x02, - NVDIMM_ATTR_INFO_NOT_FOUND = NVDIMM_COMP_ID | 0x03, - NVDIMM_INVALID_CHIP = NVDIMM_COMP_ID | 0x04, - NVDIMM_I2C_MASTER_PATH_ERROR = NVDIMM_COMP_ID | 0x05, - NVDIMM_TARGET_NULL = NVDIMM_COMP_ID | 0x06, - NVDIMM_INVALID_ADDR_OFFSET_SIZE = NVDIMM_COMP_ID | 0x07, - NVDIMM_OVERFLOW_ERROR = NVDIMM_COMP_ID | 0x08, - 
NVDIMM_I2C_WRITE_PAGE_SIZE_ZERO = NVDIMM_COMP_ID | 0x09, - NVDIMM_INVALID_OFFSET = NVDIMM_COMP_ID | 0x0A, - NVDIMM_READ_FAILURE = NVDIMM_COMP_ID | 0x0B, // NV Controller read failure - NVDIMM_WRITE_FAILURE = NVDIMM_COMP_ID | 0x0C, // NV Controller write failure - NVDIMM_BACKUP_TIMEOUT = NVDIMM_COMP_ID | 0x0D, // Backup/save timeout - NVDIMM_RESTORE_TIMEOUT = NVDIMM_COMP_ID | 0x0E, // Restore timeout - NVDIMM_ERASE_TIMEOUT = NVDIMM_COMP_ID | 0x0F, // Erase timeout - NVDIMM_CHARGE_TIMEOUT = NVDIMM_COMP_ID | 0x10, // Battery charging timeout - NVDIMM_ARM_TIMEOUT = NVDIMM_COMP_ID | 0x11, // Arming timeout - NVDIMM_SET_ES_ERROR = NVDIMM_COMP_ID | 0x12, // Failure to set the ES policy - NVDIMM_MSS_STR_ENTRY_ERROR = NVDIMM_COMP_ID | 0x13, // Failure to enter STR - NVDIMM_MSS_STR_EXIT_ERROR = NVDIMM_COMP_ID | 0x14, // Failure to exit STR - NVDIMM_MSS_POST_RSTR_ERROR = NVDIMM_COMP_ID | 0x15, // Failure to perform post restore work - NVDIMM_OPEN_PAGE_TIMEOUT = NVDIMM_COMP_ID | 0x16, // Open page timeout - NVDIMM_STATUS_TIMEOUT = NVDIMM_COMP_ID | 0x17, // Status timeout - NVDIMM_ARM_FAILED = NVDIMM_COMP_ID | 0x18, // Failure to arm reset_n - NVDIMM_ERASE_FAILED = NVDIMM_COMP_ID | 0x19, // Failure to erase - NVDIMM_RESTORE_FAILED = NVDIMM_COMP_ID | 0x1A, // Failure to restore - NVDIMM_NOT_READY = NVDIMM_COMP_ID | 0x1B, // NVDIMM not ready for host to access - NVDIMM_NULL_FIRMWARE_REQUEST_PTR = NVDIMM_COMP_ID | 0x1C, // Firmware request is NULL - NVDIMM_UNSUPPORTED_NVDIMM_TYPE = NVDIMM_COMP_ID | 0x1D, // Unsupported NVDIMM type for update - NVDIMM_OPERATION_IN_PROGRESS = NVDIMM_COMP_ID | 0x1E, // NV controller is busy - NVDIMM_CHECKSUM_ERROR = NVDIMM_COMP_ID | 0x1F, // Checksum error between host and nv calculated - NVDIMM_ZERO_TOTAL_REGIONS = NVDIMM_COMP_ID | 0x20, // Zero write regions calculated - NVDIMM_UPDATE_MODE_UNCHANGED = NVDIMM_COMP_ID | 0x21, // Unable to change update mode - NVDIMM_FW_OPS_IN_PROGRESS_TIMEOUT = NVDIMM_COMP_ID | 0x22, // Operations In Progress timeout 
- NVDIMM_DATA_SIZE_TOO_LARGE = NVDIMM_COMP_ID | 0x23, // Trying to write too much data - NVDIMM_DATA_SIZE_INVALID = NVDIMM_COMP_ID | 0x24, // Data size is invalid - NVDIMM_BLOCK_NOT_RECEIVED = NVDIMM_COMP_ID | 0x25, // Block data not received - NVDIMM_FW_OPS_NOT_SUCCESSFUL = NVDIMM_COMP_ID | 0x26, // Unsuccessful Firmware Operation - NVDIMM_UPDATE_NOT_SUPPORTED = NVDIMM_COMP_ID | 0x27, // NV controller cannot be updated - NVDIMM_START_UPDATE = NVDIMM_COMP_ID | 0x28, // start update - NVDIMM_UPDATE_COMPLETE = NVDIMM_COMP_ID | 0x29, // update completed - NVDIMM_TPM_NOT_FOUND = NVDIMM_COMP_ID | 0x2A, // TPM not found - NVDIMM_POWER_SAVE_FAILURE = NVDIMM_COMP_ID | 0x2B, // Save failed due to power loss - NVDIMM_CSAVE_ERROR = NVDIMM_COMP_ID | 0x2C, // CSave failed due to error - NVDIMM_VOLTAGE_REGULATOR_FAILED = NVDIMM_COMP_ID | 0x2D, - NVDIMM_VDD_LOST = NVDIMM_COMP_ID | 0x2E, - NVDIMM_VPP_LOST = NVDIMM_COMP_ID | 0x2F, - NVDIMM_VTT_LOST = NVDIMM_COMP_ID | 0x30, - NVDIMM_DRAM_NOT_SELF_REFRESH = NVDIMM_COMP_ID | 0x31, - NVDIMM_CONTROLLER_HARDWARE_ERROR = NVDIMM_COMP_ID | 0x32, - NVDIMM_NVM_CONTROLLER_ERROR = NVDIMM_COMP_ID | 0x33, - NVDIMM_NVM_LIFETIME_ERROR = NVDIMM_COMP_ID | 0x34, - NVDIMM_NOT_ENOUGH_ENERGY_FOR_CSAVE = NVDIMM_COMP_ID | 0x35, - NVDIMM_INVALID_FIRMWARE_ERROR = NVDIMM_COMP_ID | 0x36, // Module Health Status Registers - NVDIMM_CONFIG_DATA_ERROR = NVDIMM_COMP_ID | 0x37, - NVDIMM_NO_ES_PRESENT = NVDIMM_COMP_ID | 0x38, - NVDIMM_ES_POLICY_NOT_SET = NVDIMM_COMP_ID | 0x39, - NVDIMM_ES_HARDWARE_FAILURE = NVDIMM_COMP_ID | 0x3A, - NVDIMM_ES_HEALTH_ASSESSMENT_ERROR = NVDIMM_COMP_ID | 0x3B, - NVDIMM_ES_LIFETIME_ERROR = NVDIMM_COMP_ID | 0x3C, - NVDIMM_ES_TEMP_ERROR = NVDIMM_COMP_ID | 0x3D, - NVDIMM_SET_EVENT_NOTIFICATION_ERROR = NVDIMM_COMP_ID | 0x3E, - NVDIMM_VERIF_BYTE_CHECK_FAILED = NVDIMM_COMP_ID | 0x3F, // Encryption key reg verif failed - NVDIMM_ENCRYPTION_ENABLE_FAILED = NVDIMM_COMP_ID | 0x40, // Encryption enable failed - NVDIMM_ENCRYPTION_ERASE_PENDING_FAILED 
= NVDIMM_COMP_ID | 0x41, // Encryption crypto erase pending failed - NVDIMM_ENCRYPTION_ERASE_FAILED = NVDIMM_COMP_ID | 0x42, // Encryption crypto erase failed - NVDIMM_ENCRYPTION_UNLOCK_FAILED = NVDIMM_COMP_ID | 0x43, // Encryption unlock failed - NVDIMM_ENCRYPTION_INVALID_ATTRIBUTE = NVDIMM_COMP_ID | 0x44, // Encryption attribute key data invalid - NVDIMM_ENCRYPTION_KEY_ATTRS_INVALID = NVDIMM_COMP_ID | 0x45, // Encryption key attributes are both invalid - NVDIMM_ENCRYPTION_MAX_DARN_ERRORS = NVDIMM_COMP_ID | 0x46, // Darn random key gen reached max errors - NVDIMM_ENCRYPTION_BAD_RANDOM_DATA = NVDIMM_COMP_ID | 0x47, // Generated key data not valid - NVDIMM_CANNOT_MAKE_ATTRIBUTE = NVDIMM_COMP_ID | 0x48, // Cannot make Attribute - NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE = NVDIMM_COMP_ID | 0x49, // !< pertains to ES_CMD_STATUS0[0]; the health check in progress flag - NVDIMM_HEALTH_CHECK_REPORTED_FAILURE = NVDIMM_COMP_ID | 0x4A, // !< pertains to ES_CMD_STATUS0[2]; the health check reported a failure flag - NVDIMM_LIFETIME_MIN_REQ_NOT_MET = NVDIMM_COMP_ID | 0x4B, // !< pertains to ES_LIFETIME; BPM does not meet minimum requirement for a new BPM - NVDIMM_HEALTH_CHECK_NEVER_INITIATED = NVDIMM_COMP_ID | 0x4C, // !< A health check was never initiated at start of IPL + NVDIMM_INVALID_REASONCODE = NVDIMM_COMP_ID | 0x00, // Invalid Reasoncode + NVDIMM_INVALID_OPERATION = NVDIMM_COMP_ID | 0x01, + NVDIMM_INVALID_DEVICE_TYPE = NVDIMM_COMP_ID | 0x02, + NVDIMM_ATTR_INFO_NOT_FOUND = NVDIMM_COMP_ID | 0x03, + NVDIMM_INVALID_CHIP = NVDIMM_COMP_ID | 0x04, + NVDIMM_I2C_MASTER_PATH_ERROR = NVDIMM_COMP_ID | 0x05, + NVDIMM_TARGET_NULL = NVDIMM_COMP_ID | 0x06, + NVDIMM_INVALID_ADDR_OFFSET_SIZE = NVDIMM_COMP_ID | 0x07, + NVDIMM_OVERFLOW_ERROR = NVDIMM_COMP_ID | 0x08, + NVDIMM_I2C_WRITE_PAGE_SIZE_ZERO = NVDIMM_COMP_ID | 0x09, + NVDIMM_INVALID_OFFSET = NVDIMM_COMP_ID | 0x0A, + NVDIMM_READ_FAILURE = NVDIMM_COMP_ID | 0x0B, // NV Controller read failure + NVDIMM_WRITE_FAILURE = NVDIMM_COMP_ID | 
0x0C, // NV Controller write failure + NVDIMM_BACKUP_TIMEOUT = NVDIMM_COMP_ID | 0x0D, // Backup/save timeout + NVDIMM_RESTORE_TIMEOUT = NVDIMM_COMP_ID | 0x0E, // Restore timeout + NVDIMM_ERASE_TIMEOUT = NVDIMM_COMP_ID | 0x0F, // Erase timeout + NVDIMM_CHARGE_TIMEOUT = NVDIMM_COMP_ID | 0x10, // Battery charging timeout + NVDIMM_ARM_TIMEOUT = NVDIMM_COMP_ID | 0x11, // Arming timeout + NVDIMM_SET_ES_ERROR = NVDIMM_COMP_ID | 0x12, // Failure to set the ES policy + NVDIMM_MSS_STR_ENTRY_ERROR = NVDIMM_COMP_ID | 0x13, // Failure to enter STR + NVDIMM_MSS_STR_EXIT_ERROR = NVDIMM_COMP_ID | 0x14, // Failure to exit STR + NVDIMM_MSS_POST_RSTR_ERROR = NVDIMM_COMP_ID | 0x15, // Failure to perform post restore work + NVDIMM_OPEN_PAGE_TIMEOUT = NVDIMM_COMP_ID | 0x16, // Open page timeout + NVDIMM_STATUS_TIMEOUT = NVDIMM_COMP_ID | 0x17, // Status timeout + NVDIMM_ARM_FAILED = NVDIMM_COMP_ID | 0x18, // Failure to arm reset_n + NVDIMM_ERASE_FAILED = NVDIMM_COMP_ID | 0x19, // Failure to erase + NVDIMM_RESTORE_FAILED = NVDIMM_COMP_ID | 0x1A, // Failure to restore + NVDIMM_NOT_READY = NVDIMM_COMP_ID | 0x1B, // NVDIMM not ready for host to access + NVDIMM_NULL_FIRMWARE_REQUEST_PTR = NVDIMM_COMP_ID | 0x1C, // Firmware request is NULL + NVDIMM_UNSUPPORTED_NVDIMM_TYPE = NVDIMM_COMP_ID | 0x1D, // Unsupported NVDIMM type for update + NVDIMM_OPERATION_IN_PROGRESS = NVDIMM_COMP_ID | 0x1E, // NV controller is busy + NVDIMM_CHECKSUM_ERROR = NVDIMM_COMP_ID | 0x1F, // Checksum error between host and nv calculated + NVDIMM_ZERO_TOTAL_REGIONS = NVDIMM_COMP_ID | 0x20, // Zero write regions calculated + NVDIMM_UPDATE_MODE_UNCHANGED = NVDIMM_COMP_ID | 0x21, // Unable to change update mode + NVDIMM_FW_OPS_IN_PROGRESS_TIMEOUT = NVDIMM_COMP_ID | 0x22, // Operations In Progress timeout + NVDIMM_DATA_SIZE_TOO_LARGE = NVDIMM_COMP_ID | 0x23, // Trying to write too much data + NVDIMM_DATA_SIZE_INVALID = NVDIMM_COMP_ID | 0x24, // Data size is invalid + NVDIMM_BLOCK_NOT_RECEIVED = NVDIMM_COMP_ID | 0x25, // Block 
data not received + NVDIMM_FW_OPS_NOT_SUCCESSFUL = NVDIMM_COMP_ID | 0x26, // Unsuccessful Firmware Operation + NVDIMM_UPDATE_NOT_SUPPORTED = NVDIMM_COMP_ID | 0x27, // NV controller cannot be updated + NVDIMM_START_UPDATE = NVDIMM_COMP_ID | 0x28, // start update + NVDIMM_UPDATE_COMPLETE = NVDIMM_COMP_ID | 0x29, // update completed + NVDIMM_TPM_NOT_FOUND = NVDIMM_COMP_ID | 0x2A, // TPM not found + NVDIMM_POWER_SAVE_FAILURE = NVDIMM_COMP_ID | 0x2B, // Save failed due to power loss + NVDIMM_CSAVE_ERROR = NVDIMM_COMP_ID | 0x2C, // CSave failed due to error + NVDIMM_VOLTAGE_REGULATOR_FAILED = NVDIMM_COMP_ID | 0x2D, + NVDIMM_VDD_LOST = NVDIMM_COMP_ID | 0x2E, + NVDIMM_VPP_LOST = NVDIMM_COMP_ID | 0x2F, + NVDIMM_VTT_LOST = NVDIMM_COMP_ID | 0x30, + NVDIMM_DRAM_NOT_SELF_REFRESH = NVDIMM_COMP_ID | 0x31, + NVDIMM_CONTROLLER_HARDWARE_ERROR = NVDIMM_COMP_ID | 0x32, + NVDIMM_NVM_CONTROLLER_ERROR = NVDIMM_COMP_ID | 0x33, + NVDIMM_NVM_LIFETIME_ERROR = NVDIMM_COMP_ID | 0x34, + NVDIMM_NOT_ENOUGH_ENERGY_FOR_CSAVE = NVDIMM_COMP_ID | 0x35, + NVDIMM_INVALID_FIRMWARE_ERROR = NVDIMM_COMP_ID | 0x36, // Module Health Status Registers + NVDIMM_CONFIG_DATA_ERROR = NVDIMM_COMP_ID | 0x37, + NVDIMM_NO_ES_PRESENT = NVDIMM_COMP_ID | 0x38, + NVDIMM_ES_POLICY_NOT_SET = NVDIMM_COMP_ID | 0x39, + NVDIMM_ES_HARDWARE_FAILURE = NVDIMM_COMP_ID | 0x3A, + NVDIMM_ES_HEALTH_ASSESSMENT_ERROR = NVDIMM_COMP_ID | 0x3B, + NVDIMM_ES_LIFETIME_ERROR = NVDIMM_COMP_ID | 0x3C, + NVDIMM_ES_TEMP_ERROR = NVDIMM_COMP_ID | 0x3D, + NVDIMM_SET_EVENT_NOTIFICATION_ERROR = NVDIMM_COMP_ID | 0x3E, + NVDIMM_VERIF_BYTE_CHECK_FAILED = NVDIMM_COMP_ID | 0x3F, // Encryption key reg verif failed + NVDIMM_ENCRYPTION_ENABLE_FAILED = NVDIMM_COMP_ID | 0x40, // Encryption enable failed + NVDIMM_ENCRYPTION_ERASE_PENDING_FAILED = NVDIMM_COMP_ID | 0x41, // Encryption crypto erase pending failed + NVDIMM_ENCRYPTION_ERASE_FAILED = NVDIMM_COMP_ID | 0x42, // Encryption crypto erase failed + NVDIMM_ENCRYPTION_UNLOCK_FAILED = NVDIMM_COMP_ID | 0x43, // 
Encryption unlock failed + NVDIMM_ENCRYPTION_INVALID_ATTRIBUTE = NVDIMM_COMP_ID | 0x44, // Encryption attribute key data invalid + NVDIMM_ENCRYPTION_KEY_ATTRS_INVALID = NVDIMM_COMP_ID | 0x45, // Encryption key attributes are both invalid + NVDIMM_ENCRYPTION_MAX_DARN_ERRORS = NVDIMM_COMP_ID | 0x46, // Darn random key gen reached max errors + NVDIMM_ENCRYPTION_BAD_RANDOM_DATA = NVDIMM_COMP_ID | 0x47, // Generated key data not valid + NVDIMM_CANNOT_MAKE_ATTRIBUTE = NVDIMM_COMP_ID | 0x48, // Cannot make Attribute + NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE = NVDIMM_COMP_ID | 0x49, // !< pertains to ES_CMD_STATUS0[0]; the ES health check in progress flag + NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE = NVDIMM_COMP_ID | 0x4A, // !< pertains to ES_CMD_STATUS0[2]; the ES health check reported a failure flag + NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET = NVDIMM_COMP_ID | 0x4B, // !< pertains to ES_LIFETIME; BPM does not meet minimum requirement for a new BPM + NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED = NVDIMM_COMP_ID | 0x4C, // !< An ES health check was never initiated at start of IPL + NVDIMM_NVM_HEALTH_CHECK_FAILED = NVDIMM_COMP_ID | 0x4D, // !< An NVM health check on the NVDIMM failed }; enum UserDetailsTypes diff --git a/src/usr/isteps/nvdimm/nvdimm.H b/src/usr/isteps/nvdimm/nvdimm.H index a99f1180a..d2d2985b6 100644 --- a/src/usr/isteps/nvdimm/nvdimm.H +++ b/src/usr/isteps/nvdimm/nvdimm.H @@ -275,6 +275,11 @@ enum i2cReg : uint16_t TYPED_BLOCK_DATA_BYTE30 = 0x39E, TYPED_BLOCK_DATA_BYTE31 = 0x39F, TYPED_BLOCK_DATA_OFFSET = 0x3E0, + FLASH_BAD_BLK_PCT = 0x41D, // Read only; Percentage of flash blocks + // in the flash array marked as bad blocks + FLASH_ERROR_COUNT0 = 0x428, // Read only; LSB[7:0] Flash error count + FLASH_ERROR_COUNT1 = 0x429, // Read only; [15:8] + FLASH_ERROR_COUNT2 = 0x42A, // Read only; MSB[23:16] BPM_MAGIC_REG1 = 0x430, BPM_MAGIC_REG2 = 0x431, SCAP_STATUS = 0x432, diff --git a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C 
index d615aa546..b38dd394d 100644 --- a/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C +++ b/src/usr/isteps/nvdimm/runtime/nvdimm_rt.C @@ -25,7 +25,11 @@ /** * @file nvdimm_rt.C * - * @brief NVDIMM functions only needed for runtime + * @brief NVDIMM functions only needed for runtime. These functions include + * but are not limited to arming/disarming the NVDIMM along with methods + * to poll the arming and check the status of the arming. Checking the + * error state of the NVDIMM, getting a random number with the darn + * instruction and checking the ES or NVM health status. */ /// BPM - Backup Power Module @@ -734,65 +738,68 @@ errlHndl_t nvdimm_getRandom(uint8_t* o_genData) } /* - * @brief Check the health status of the individual NVDIMMs supplied in list + * @brief Check the ES (energy source)/backup power module(BPM) health status of + * the individual NVDIMMs supplied in list * - * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the ES health of * - * @return false if one or more NVDIMMs fail health check, else true + * @return false if one or more NVDIMMs fail ES health check, else true */ -bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) +bool nvDimmEsCheckHealthStatus(const TargetHandleList &i_nvdimmTargetList) { - TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatus(): " "Target list size(%d)", i_nvdimmTargetList.size()); - // The minimum lifetime value - const uint8_t LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97% + // The minimum ES lifetime value + const uint8_t ES_LIFETIME_MINIMUM_REQUIREMENT = 0x62; // > 97% - // The health check status flags for the different states of a health check - const uint8_t HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0 - const uint8_t HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1 - const uint8_t HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2 + // The ES health
check status flags for the different states of an + // ES health check + const uint8_t ES_HEALTH_CHECK_IN_PROGRESS_FLAG = 0x01; // bit 0 + const uint8_t ES_HEALTH_CHECK_SUCCEEDED_FLAG = 0x02; // bit 1 + const uint8_t ES_HEALTH_CHECK_FAILED_FLAG = 0x04; // bit 2 // Handle to catch any errors errlHndl_t l_err(nullptr); - // The health check status from a health check call - uint8_t l_healthCheck(0); + // The ES health check status from an ES health check call + uint8_t l_esHealthCheck(0); - // Status of the accumulation of all calls related to the health check. + // Status of the accumulation of all calls related to the ES health check. // If any one call is bad/fails, then this will be false, else it stays true - bool l_didHealthCheckPass(true); + bool l_didEsHealthCheckPass(true); - // Iterate thru the NVDIMMs checking the health status of each one. + // Iterate thru the NVDIMMs checking the ES health status of each one. // Going with the assumption that the caller waited the allotted time, // roughly 20 to 30 minutes, after the start of an IPL. // Success case: - // * Health check initiated at start of the IPL, caller waited the + // * ES health check initiated at start of the IPL, caller waited the // allotted time (20 to 30 mins) before doing a health check, health // check returned success and the lifetime meets the minimum threshold // for a new BPM. 
// Error cases are: - // * Health check is in progress, will assume BPM is hung - // * Health check failed - // * Health check succeeded but lifetime does not meet a certain threshold + // * ES health check is in progress, will assume BPM is hung + // * ES health check failed + // * ES health check succeeded but lifetime does not meet a + // certain threshold // * If none of the above apply (success case and other error cases), - // then assume the health check was never initiated at the start of the - // IPL + // then assume the ES health check was never initiated at the start + // of the IPL // For each of these error cases do a predictive callout for (auto const l_nvdimm : i_nvdimmTargetList) { // Retrieve the Health Check status from the BPM - TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): " - "Reading NVDIMM(0x%.8X) health check data, " + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "Reading NVDIMM(0x%.8X) ES health check data, " "register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), ES_CMD_STATUS0); - l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_healthCheck); + l_err = nvdimmReadReg(l_nvdimm, ES_CMD_STATUS0, l_esHealthCheck); if (l_err) { - TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " - "NVDIMM(0x%X) failed to read the health check " + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%X) failed to read the ES health check " "data, register ES_CMD_STATUS0(0x%.2X)", get_huid(l_nvdimm), ES_CMD_STATUS0); @@ -801,43 +808,43 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; // Proceed to next NVDIMM, better luck next time continue; } // Trace out the returned data for inspection - TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): " - "NVDIMM(0x%X) returned value(0x%.2X) from health check " - "data, register 
ES_CMD_STATUS0(0x%.2X)", - get_huid(l_nvdimm), l_healthCheck, ES_CMD_STATUS0) + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "NVDIMM(0x%X) returned value(0x%.2X) from the ES health " + "check data, register ES_CMD_STATUS0(0x%.2X)", + get_huid(l_nvdimm), l_esHealthCheck, ES_CMD_STATUS0); - if (l_healthCheck & HEALTH_CHECK_IN_PROGRESS_FLAG) + if (l_esHealthCheck & ES_HEALTH_CHECK_IN_PROGRESS_FLAG) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Assuming caller waited the allotted time before " - "doing a health check on NVDIMM(0x%.8X), the BPM " - "is hung doing the health check.", + "doing an ES health check on NVDIMM(0x%.8X), the BPM " + "is hung doing the ES health check.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE * @userdata1 HUID of NVDIMM target - * @userdata2 Health check status + * @userdata2 ES health check status * @devdesc Assuming caller waited the allotted time before - * doing a health check, then the BPM is hung doing - * the health check. - * @custdesc NVDIMM Health Check failed. + * doing an ES health check, then the BPM is hung doing + * the ES health check. + * @custdesc NVDIMM ES health check failed. 
*/ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_HEALTH_CHECK_IN_PROGRESS_FAILURE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_IN_PROGRESS_FAILURE, get_huid(l_nvdimm), - l_healthCheck, + l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -849,34 +856,33 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } - else if (l_healthCheck & HEALTH_CHECK_FAILED_FLAG) + else if (l_esHealthCheck & ES_HEALTH_CHECK_FAILED_FLAG) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "Assuming caller waited the allotted time before " - "doing a health check on NVDIMM(0x%.8X), the BPM " + "doing an ES health check on NVDIMM(0x%.8X), the BPM " "reported a failure.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_HEALTH_CHECK_REPORTED_FAILURE + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE * @userdata1 HUID of NVDIMM target - * @userdata2 Health check status - * @devdesc NVDIMM Health Check failed + * @userdata2 ES health check status * @devdesc Assuming caller waited the allotted time before - * doing a health check, the BPM reported a failure - * while doing a health check. - * @custdesc NVDIMM Health Check failed. + * doing an ES health check, the BPM reported a failure + * while doing an ES health check. + * @custdesc NVDIMM ES health check failed. 
*/ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_HEALTH_CHECK_REPORTED_FAILURE, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_REPORTED_FAILURE, get_huid(l_nvdimm), - l_healthCheck, + l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -888,12 +894,12 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } - else if (l_healthCheck & HEALTH_CHECK_SUCCEEDED_FLAG) + else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) { - TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckHealthStatus(): " - "Reading NVDIMM(0x%.8X) es lifetime data, " + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmEsCheckHealthStatus(): " + "Reading NVDIMM(0x%.8X) ES lifetime data, " "register ES_LIFETIME(0x%.2X)", get_huid(l_nvdimm), ES_LIFETIME); @@ -905,7 +911,7 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) if (l_err) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " "NVDIMM(0x%.8X) failed to read the " "ES_LIFETIME(0x%.2X) data", get_huid(l_nvdimm), @@ -916,42 +922,42 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } - else if (l_lifetimePercentage < LIFETIME_MINIMUM_REQUIREMENT) + else if (l_lifetimePercentage < ES_LIFETIME_MINIMUM_REQUIREMENT) { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " - "Health check on NVDIMM(0x%.8X) succeeded but the " - "BPM's lifetime(%d) does not meet the minimum " + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "ES health check on NVDIMM(0x%.8X) succeeded but " + "the BPM's lifetime(%d) does not meet the minimum " "requirement(%d) needed 
to qualify as a new BPM.", get_huid(l_nvdimm), l_lifetimePercentage, - LIFETIME_MINIMUM_REQUIREMENT ); + ES_LIFETIME_MINIMUM_REQUIREMENT ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_LIFETIME_MIN_REQ_NOT_MET + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET * @userdata1[00:31] HUID of NVDIMM target - * @userdata1[32:63] Health check status + * @userdata1[32:63] ES health check status * @userdata2[00:31] Retrieved lifetime percentage * @userdata2[32:63] lifetime minimum requirement - * @devdesc Health check succeeded but the BPM's + * @devdesc ES health check succeeded but the BPM's * lifetime does not meet the minimum * requirement needed to qualify as a * new BPM. - * @custdesc NVDIMM Health Check failed + * @custdesc NVDIMM ES health check failed */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_LIFETIME_MIN_REQ_NOT_MET, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_LIFETIME_MIN_REQ_NOT_MET, TWO_UINT32_TO_UINT64( get_huid(l_nvdimm), - l_healthCheck), + l_esHealthCheck), TWO_UINT32_TO_UINT64( l_lifetimePercentage, - LIFETIME_MINIMUM_REQUIREMENT), + ES_LIFETIME_MINIMUM_REQUIREMENT), ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -963,45 +969,46 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } // end else if (l_lifetimePercentage ... 
else { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvdimmCheckHealthStatus(): " - "Success: Health check on NVDIMM(0x%.8X) succeeded " - "and the BPM's lifetime(%d) meet's the minimum " - "requirement(%d) needed to qualify as a new BPM.", + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "Success: ES health check on NVDIMM(0x%.8X) " + "succeeded and the BPM's lifetime(%d) meet's the " + "minimum requirement(%d) needed to qualify as " + "a new BPM.", get_huid(l_nvdimm), l_lifetimePercentage, - LIFETIME_MINIMUM_REQUIREMENT ); + ES_LIFETIME_MINIMUM_REQUIREMENT ); } - } // end else if (l_healthCheck & HEALTH_CHECK_SUCCEEDED_FLAG) - else // Assume the health check was never initiated at + } // end else if (l_esHealthCheck & ES_HEALTH_CHECK_SUCCEEDED_FLAG) + else // Assume the ES health check was never initiated at // the start of the IPL. { - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmCheckHealthStatus(): " - "The health check on NVDIMM(0x%.8X) shows no status (in " - "progress, fail or succeed) so assuming it was never " - "initiated at the start of the IPL.", + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvDimmEsCheckHealthStatus(): " + "The ES health check on NVDIMM(0x%.8X) shows no status " + "(in progress, fail or succeed) so assuming it was " + "never initiated at the start of the IPL.", get_huid(l_nvdimm) ); /*@ * @errortype * @severity ERRL_SEV_PREDICTIVE - * @moduleid NVDIMM_HEALTH_CHECK - * @reasoncode NVDIMM_HEALTH_CHECK_NEVER_INITIATED + * @moduleid NVDIMM_ES_HEALTH_CHECK + * @reasoncode NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED * @userdata1 HUID of NVDIMM target - * @userdata2 Health check status - * @devdesc The health check shows no status (in progress, fail - * or succeed) so assuming it was never initiated + * @userdata2 ES health check status + * @devdesc The ES health check shows no status (in progress, + * fail or succeed) so assuming it was never initiated * at the start of the IPL. - * @custdesc NVDIMM Health Check failed. 
+ * @custdesc NVDIMM ES health check failed. */ l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, - NVDIMM_HEALTH_CHECK, - NVDIMM_HEALTH_CHECK_NEVER_INITIATED, + NVDIMM_ES_HEALTH_CHECK, + NVDIMM_ES_HEALTH_CHECK_NEVER_INITIATED, get_huid(l_nvdimm), - l_healthCheck, + l_esHealthCheck, ErrlEntry::NO_SW_CALLOUT ); l_err->collectTrace(NVDIMM_COMP_NAME); @@ -1013,42 +1020,509 @@ bool nvDimmCheckHealthStatus(TargetHandleList &i_nvdimmTargetList) errlCommit(l_err, NVDIMM_COMP_ID); // Let the caller know something went amiss - l_didHealthCheckPass = false; + l_didEsHealthCheckPass = false; } } // end for (auto const l_nvdimm : i_nvdimmTargetList) // Should not have any uncommitted errors - assert(l_err == NULL, "nvDimmCheckHealthStatus() - unexpected uncommitted" - "error found" ); + assert(l_err == NULL, "nvDimmEsCheckHealthStatus() - unexpected " + "uncommitted error found" ); - TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckHealthStatus(): " - "Returning %s", l_didHealthCheckPass == true ? "true" : "false" ); + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatus(): " + "Returning %s", l_didEsHealthCheckPass == true ? 
"true" : "false"); - return l_didHealthCheckPass; -} // end nvDimmCheckHealthStatus + return l_didEsHealthCheckPass; +} // end nvDimmEsCheckHealthStatus /** - * @brief A wrapper around the call to nvDimmCheckHealthStatus + * @brief A wrapper around the call to nvDimmEsCheckHealthStatus * - * @see nvDimmCheckHealthStatus for more details + * @see nvDimmEsCheckHealthStatus for more details * - * @return false if one or more NVDIMMs fail health check, else true + * @return false if one or more NVDIMMs fail an ES health check, else true */ -bool nvDimmCheckHealthStatusOnSystem() +bool nvDimmEsCheckHealthStatusOnSystem() { - TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmCheckHealthStatusOnSystem()"); + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmEsCheckHealthStatusOnSystem()"); // Get the list of NVDIMM Targets from the system TargetHandleList l_nvDimmTargetList; nvdimm_getNvdimmList(l_nvDimmTargetList); // Return status of doing a check health status - bool l_didHealthCheckPass = nvDimmCheckHealthStatus(l_nvDimmTargetList); + bool l_didEsHealthCheckPass = nvDimmEsCheckHealthStatus(l_nvDimmTargetList); - TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmCheckHealthStatusOnSystem(): " - "Returning %s", l_didHealthCheckPass == true ? "true" : "false" ); + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmEsCheckHealthStatusOnSystem(): " + "Returning %s", l_didEsHealthCheckPass == true ? "true" : "false" ); - return l_didHealthCheckPass; + return l_didEsHealthCheckPass; } // end nvDimmCheckHealthStatusOnSystem +/* + * @brief Check the bad flash block percentage against a given maximum allowed. + * + * @details This returns a tristate - 1 pass, 2 different fails + * If true is returned, then the check passed and + * o_badFlashBlockPercentage will contain what the retrieved + * flash block percentage is. 
+ * If false is returned and the o_badFlashBlockPercentage is zero, then + * the check failed because of a register read fail + * If false is returned and the o_badFlashBlockPercentage is not zero, + * then the check failed because the retrieved bad flash block + * percentage exceeds the given maximum allowed + * + * @param[in] i_nvDimm - The NVDIMM to check + * @param[in] i_maxPercentageAllowed - The maximum percentage of bad flash + * block allowed + * @param[out] o_badFlashBlockPercentage - The retrieved bad flash block + * percentage from i_nvDimm, if no + * register read error. + * + * @return false if check failed or register read failed, else true + */ +bool nvDimmCheckBadFlashBlockPercentage(TargetHandle_t i_nvDimm, + const uint8_t i_maxPercentageAllowed, + uint8_t &o_badFlashBlockPercentage) +{ + // The status of the check on the bad block percentage + bool l_didBadFlashBlockPercentageCheckPass(false); + + // The retrieved flash block percentage from register, initialize to zero + o_badFlashBlockPercentage = 0; + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); + + // Retrieve the percentage of bad blocks and validate + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "Reading NVDIMM(0x%.8X) percentage of bad blocks from " + "register FLASH_BAD_BLK_PCT(0x%.4X)", + l_nvDimmHuid, FLASH_BAD_BLK_PCT); + + l_err = nvdimmReadReg(i_nvDimm, + FLASH_BAD_BLK_PCT, + o_badFlashBlockPercentage); + + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "FAIL: NVDIMM(0x%.8X) failed to read the percentage of " + "bad blocks from register FLASH_BAD_BLK_PCT(0x%.4X), " + "marking as a fail", + l_nvDimmHuid, FLASH_BAD_BLK_PCT); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Set up the fail state, so caller can determine that the 
fail was + // due to a register read error + l_didBadFlashBlockPercentageCheckPass = false; + o_badFlashBlockPercentage = 0; + } + else + { + // Trace out the returned data for inspection + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "NVDIMM(0x%.8X) returned value (%d) from the " + "percentage of bad blocks, register " + "FLASH_BAD_BLK_PCT(0x%.4X)", + l_nvDimmHuid, + o_badFlashBlockPercentage, + FLASH_BAD_BLK_PCT); + + // Check to see if the bad flash block percentage + // exceeds maximum allowed. + if (o_badFlashBlockPercentage > i_maxPercentageAllowed) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "FAIL: For NVDIMM (0x%.8X), the percentage of bad " + "flash blocks (%d) exceeds the maximum percentage " + "of bad flash blocks allowed (%d), marking this " + "as a fail", + l_nvDimmHuid, + o_badFlashBlockPercentage, + i_maxPercentageAllowed); + + // Set up the fail state, so caller can determine that the fail was + // due to percentage exceeding the max percentage allowed. + // Note: Leave the value in o_badFlashBlockPercentage so caller + // can inspect, if they wish + l_didBadFlashBlockPercentageCheckPass = false; + } + else + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckBadFlashBlockPercentage(): " + "SUCCESS: For NVDIMM (0x%.8X), the percentage of bad " + "flash blocks (%d) is less than or meets the maximum " + "percentage of bad flash blocks allowed (%d), " + "marking this as a pass", + l_nvDimmHuid, + o_badFlashBlockPercentage, + i_maxPercentageAllowed); + + // Set up the pass state + // Note: Leave the value in o_badFlashBlockPercentage so caller + // can inspect, if they wish + l_didBadFlashBlockPercentageCheckPass = true; + } // end if (l_badFlashBlockPercentage > i_maxPercentageAllowed) + } // end if (l_err) ... else + + return l_didBadFlashBlockPercentageCheckPass; +} + +/* + * @brief Check the flash error count against a given maximum allowed. 
+ * + * @details This returns a tristate - 1 pass, 2 different fails + * If true is returned, then the check passed and + * o_readFlashErrorCount will contain what the retrieved + * flash error count is. + * If false is returned and the o_readFlashErrorCount is zero, then + * the check failed because of a register read fail + * If false is returned and the o_readFlashErrorCount is not zero, + * then the check failed because the retrieved flash error + * count exceeds the given maximum allowed + * + * @param[in] i_nvDimm - The NVDIMM to check + * @param[in] i_maxFlashErrorsAllowed - The maximum number of flash errors + * allowed + * @param[out] o_readFlashErrorCount - The retrieved bad flash error + * count from i_nvDimm, if no + * register read error. + * + * @return false if check failed or register read failed, else true + */ +bool nvDimmCheckFlashErrorCount(TargetHandle_t i_nvDimm, + const uint32_t i_maxFlashErrorsAllowed, + uint32_t &o_readFlashErrorCount) +{ + // The status of the check on the flash error count + bool l_didFlashErrorCountCheckPass(false); + + // The retrieved flash error count from register, initialize to zero + o_readFlashErrorCount = 0; + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( i_nvDimm ); + + // The retrieved flash error count from a register + uint8_t l_readFlashErrorCountByte(0); + + // Read the flash error count registers starting from MSB to LSB + for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; + l_flashErrorRegister >= FLASH_ERROR_COUNT0; + --l_flashErrorRegister) + { + // Reset this for every iteration, may be redundant + l_readFlashErrorCountByte = 0; + + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "Reading NVDIMM(0x%.8X) flash error count from " + "register FLASH_ERROR_COUNT(0x%.4X)", + l_nvDimmHuid, l_flashErrorRegister); + + l_err = nvdimmReadReg(i_nvDimm, + static_cast<i2cReg >(l_flashErrorRegister), + 
l_readFlashErrorCountByte); + + if (l_err) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " + "FAIL: NVDIMM(0x%.8X) failed to read flash error " + "count from register FLASH_ERROR_COUNT(0x%.4X) " + "marking as a fail", + l_nvDimmHuid, l_flashErrorRegister); + + l_err->setSev(ERRORLOG::ERRL_SEV_PREDICTIVE); + l_err->collectTrace(NVDIMM_COMP_NAME); + errlCommit(l_err, NVDIMM_COMP_ID); + + // Set up the fail state, so caller can determine that the fail was + // due to a register read error + l_didFlashErrorCountCheckPass = false; + o_readFlashErrorCount = 0; + + break; + } + + // If we get here, then the read was successful + // Append the read flash error count byte to the LSB of the + // aggregated flash error count bytes. + o_readFlashErrorCount = (o_readFlashErrorCount << 8) | + l_readFlashErrorCountByte; + + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.8X) returned value (0x%.2X) from the " + "partial flash error count, register " + "FLASH_ERROR_COUNT(0x%.4X)", + l_nvDimmHuid, + l_readFlashErrorCountByte, + l_flashErrorRegister); + + } // end for (int16_t l_flashErrorRegister = FLASH_ERROR_COUNT2; ... + + // If o_readFlashErrorCount is not zero, then register read was successful + if (o_readFlashErrorCount) + { + TRACDCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "NVDIMM(0x%.8X) flash error count = %d ", + l_nvDimmHuid, o_readFlashErrorCount); + + // Check the validity of the flash error count + if (o_readFlashErrorCount > i_maxFlashErrorsAllowed) + { + TRACFCOMP(g_trac_nvdimm, ERR_MRK"nvDimmCheckFlashErrorCount(): " + "FAIL: For NVDIMM (0x%.8X), the flash error " + "count (%d) exceeds the maximum number of flash " + "errors allowed (%d), marking this as a fail", + l_nvDimmHuid, + o_readFlashErrorCount, + i_maxFlashErrorsAllowed); + + // Set up the fail state, so caller can determine that the fail was + // due to error count exceeding the max errors allowed. 
+ // Note: Leave the value in o_readFlashErrorCount so caller + // can inspect, if they wish + l_didFlashErrorCountCheckPass = false; + } + else + { + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmCheckFlashErrorCount(): " + "SUCCESS: For NVDIMM(0x%.8X), the flash error counts " + "(%d) is less than or meets the maximum number of " + "errors allowed (%d), marking this as a pass", + l_nvDimmHuid, + o_readFlashErrorCount, + i_maxFlashErrorsAllowed); + + // Set up the pass state + // Note: Leave the value in o_readFlashErrorCount so caller + // can inspect, if they wish + l_didFlashErrorCountCheckPass = true; + } + } // end if (o_readFlashErrorCount) + + return l_didFlashErrorCountCheckPass; +} + +/* + * @brief Check the NVM (non-volatile memory)/flash health of the individual + * NVDIMMs supplied in list. + * + * @param[in] i_nvdimmTargetList - list of NVDIMMs to check the health of flash + * + * @return false if one or more NVDIMMs fail NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatus(const TargetHandleList &i_nvDimmTargetList) +{ + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatus(): " + "Target list size(%d)", i_nvDimmTargetList.size()); + + // The following maximums are the same values used by SMART's + // manufacturing and recommended that we use. + // The maximum percentage of bad flash blocks + // Fail if over 19% of bad flash blocks is encountered + const uint8_t MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED = 19; + // The maximum number of flash memory errors allowed + // Fail if over 300 flash memory errors is encountered + const uint32_t MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED = 300; + + // Status of the accumulation of all calls related to the NVM health check. 
+ // If any one call is bad/fails, then this will be false, else it stays true + bool l_didNvmHealthCheckPass(true); + + // Handle to catch any errors + errlHndl_t l_err(nullptr); + + // The retrieved flash block percentage from register + uint8_t l_badFlashBlockPercentage(0); + // The retrieved flash error count from register + uint32_t l_flashErrorCount(0); + + // The status of the checks on the percentage of bad blocks and + // flash error count + // Default to true + bool l_badFlashBlockPercentageCheckPassed(true); + bool l_flashErrorCountCheckPassed(true); + + // Iterate thru the supplied NVDIMMs checking the health of the NVM + for (auto const l_nvDimm : i_nvDimmTargetList) + { + // Cache the HUID of the NVDIMM + uint32_t l_nvDimmHuid = get_huid( l_nvDimm ); + + // Reset these for every NVDIMM that is checked + l_badFlashBlockPercentage = 0; + l_flashErrorCount = 0; + l_badFlashBlockPercentageCheckPassed = true; + l_flashErrorCountCheckPassed = true; + + // Check the validity of bad flash block percentage + if (!nvDimmCheckBadFlashBlockPercentage( + l_nvDimm, + MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED, + l_badFlashBlockPercentage)) + { + // Set this to false to indicate that the overall check on the + // NVDIMMs had at least one failure + l_didNvmHealthCheckPass = false; + + // If no data in the variable l_badFlashBlockPercentage, then + // this is a read register fail. 
Move onto the next NVDIMM + // this is a dud + if (!l_badFlashBlockPercentage) + { + continue; + } + + // Set the check to false, to facilitate error reporting + l_badFlashBlockPercentageCheckPassed = false; + } + + // Check the validity of the flash error count + if (!nvDimmCheckFlashErrorCount( + l_nvDimm, + MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED, + l_flashErrorCount)) + { + // Set this to false to indicate that the overall check on the + // NVDIMMs had at least one failure + l_didNvmHealthCheckPass = false; + + // If no data in the variable l_flashErrorCount, then + // this is a read register fail. Move onto the next NVDIMM + // this is a dud + if (!l_flashErrorCount) + { + continue; + } + + // Set the check to false, to facilitate error reporting + l_flashErrorCountCheckPassed = false; + } + + /// Now we assess the health of the flash based on data gathered above + if ( !l_badFlashBlockPercentageCheckPassed || + !l_flashErrorCountCheckPassed ) + { + // First set the NVDIMM HUID to the first 32 bits of user data 1 + uint64_t l_badFlashBlockPercentageUserData1 = + TWO_UINT32_TO_UINT64(l_nvDimmHuid, 0); + + // If an issue with the bad flash block percentage, then append + // data to user data 1 + if (!l_badFlashBlockPercentageCheckPassed && + l_badFlashBlockPercentage) + { + // Setting the HUID here is redundant but easier than trying to + // do some clever code that will set the HUID for user data 1 + // when this path is not taken, but the next check on the flash + // error count is taken + l_badFlashBlockPercentageUserData1 = + TWO_UINT32_TO_UINT64(l_nvDimmHuid, + TWO_UINT16_TO_UINT32( + l_badFlashBlockPercentage, + MAXIMUM_PERCENTAGE_OF_BAD_FLASH_BLOCKS_ALLOWED)); + } + + // If an issue with the flash error count, then set user + // data 2 to contain the flash error count value + uint64_t l_flashErrorCountUserData2(0); + if (!l_flashErrorCountCheckPassed && + l_flashErrorCount) + { + l_flashErrorCountUserData2 = + TWO_UINT32_TO_UINT64(l_flashErrorCount, 
+ MAXIMUM_NUMBER_OF_FLASH_MEMORY_ERRORS_ALLOWED); + } + + /*@ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid NVDIMM_NVM_HEALTH_CHECK + * @reasoncode NVDIMM_NVM_HEALTH_CHECK_FAILED + * @userdata1[0:31] HUID of NVDIMM target + * @userdata1[32:47] The retrieved bad flash block percentage, + * if error with, else 0 + * @userdata1[48:63] The maximum percentage of bad flash blocks + * allowed, if bad flash block percentage + * exceeds this maximum, else 0 + * @userdata2[0:31] The retrieved flash error count, + * if error with, else 0 + * @userdata2[32:63] The maximum number of flash errors + * allowed, if flash error exceeds this + * maximum, else 0 + * @devdesc Either the NVDIMM NVM bad flash block + * percentage exceeded the maximum percentage + * allowed or the NVDIMM NVM number of flash + * error exceeds the maximum count allowed + * or both. + * @custdesc NVDIMM NVM health check failed. + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + NVDIMM_NVM_HEALTH_CHECK, + NVDIMM_NVM_HEALTH_CHECK_FAILED, + l_badFlashBlockPercentageUserData1, + l_flashErrorCountUserData2, + ErrlEntry::NO_SW_CALLOUT ); + + l_err->collectTrace(NVDIMM_COMP_NAME); + + // Collect the error + errlCommit(l_err, NVDIMM_COMP_ID); + + // Let the caller know something went amiss + l_didNvmHealthCheckPass = false; + } + else + { + // This NVDIMM passed the NVM health check + TRACFCOMP(g_trac_nvdimm, INFO_MRK"nvDimmNvmCheckHealthStatus(): " + "Success: NVDIMM (0x%.8X) passed the NVM health check.", + l_nvDimmHuid); + } // end if ( !l_badFlashBlockPercentageCheckPassed .. else + } // end for (auto const l_nvdimm : i_nvdimmTargetList) + + // Should not have any uncommitted errors + assert(l_err == NULL, "nvDimmNvmCheckHealthStatus() - unexpected " + "uncommitted error found"); + + TRACFCOMP(g_trac_nvdimm,EXIT_MRK"nvDimmNvmCheckHealthStatus(): Returning %s", + l_didNvmHealthCheckPass == true ? 
"true" : "false" ); + + return l_didNvmHealthCheckPass; +} // end nvDimmNvmCheckHealthStatus + +/** + * @brief A wrapper around the call to nvDimmNvmCheckHealthStatus + * + * @see nvDimmNvmCheckHealthStatus for more details + * + * @return false if one or more NVDIMMs fail an NVM health check, else true + */ +bool nvDimmNvmCheckHealthStatusOnSystem() +{ + TRACFCOMP(g_trac_nvdimm, ENTER_MRK"nvDimmNvmCheckHealthStatusOnSystem()"); + + // Get the list of NVDIMM Targets from the system + TargetHandleList l_nvDimmTargetList; + nvdimm_getNvdimmList(l_nvDimmTargetList); + + // Return status of doing a check health status + bool l_didNvmHealthCheckPass = nvDimmNvmCheckHealthStatus(l_nvDimmTargetList); + + TRACFCOMP(g_trac_nvdimm, EXIT_MRK"nvDimmNvmCheckHealthStatusOnSystem(): " + "Returning %s", l_didNvmHealthCheckPass == true ? "true" : "false" ); + + return l_didNvmHealthCheckPass; +} // end nvDimmCheckHealthStatusOnSystem + + } // end NVDIMM namespace diff --git a/src/usr/util/runtime/rt_cmds.C b/src/usr/util/runtime/rt_cmds.C index c669aae4f..bf0c51749 100644 --- a/src/usr/util/runtime/rt_cmds.C +++ b/src/usr/util/runtime/rt_cmds.C @@ -1179,25 +1179,59 @@ void cmd_nvdimm_protection_msg( char* &o_output, uint32_t i_huid, } } -void cmd_nvdimmCheckHealthStatus( char* &o_output) +/** + * @brief Check the ES (energy source) health status of all NVDIMMs in the + * system. If check fails, see HBRT traces for further details. + * @param[out] o_output Output display buffer, memory allocated here. + * Will inform caller if ES health check passes or fails. + */ +void cmd_nvDimmEsCheckHealthStatus( char* &o_output) +{ + o_output = new char[500]; + if (NVDIMM::nvDimmEsCheckHealthStatusOnSystem()) + { + sprintf( o_output, "cmd_nvDimmEsCheckHealthStatus: " + "ES (energy source) health status check passed."); + + } + else + { + sprintf( o_output, "cmd_nvDimmEsCheckHealthStatus: " + "ES (energy source) health status check failed. 
" + "Inspect HBRT traces for further details."); + + } + + return; +} // end cmd_nvDimmEsCheckHealthStatus + +/** + * @brief Check the NVM (non-volatile memory) health status of all NVDIMMs in + * the system. If check fails, see HBRT traces for further details. + * @param[out] o_output Output display buffer, memory allocated here. + * Will inform caller if NVM health check passes or fails. + */ + +void cmd_nvdDmmNvmCheckHealthStatus( char* &o_output) { o_output = new char[500]; - if (NVDIMM::nvDimmCheckHealthStatusOnSystem()) + if (NVDIMM::nvDimmNvmCheckHealthStatusOnSystem()) { - sprintf( o_output, "cmd_doNvDimmCheckHealthStatus: " - "health status check passed."); + sprintf( o_output, "cmd_nvdDmmNvmCheckHealthStatus: " + "NVM (non-volatile memory) health status check passed."); } else { - sprintf( o_output, "cmd_doNvDimmCheckHealthStatus: " - "health status check failed. Inspect HBRT traces " - "for further details."); + sprintf( o_output, "cmd_nvdDmmNvmCheckHealthStatus: " + "NVM (non-volatile memory) health status check failed. 
" + "Inspect HBRT traces for further details."); } return; -} // end cmd_nvdimmCheckHealthStatus +} // end cmd_nvdDmmNvmCheckHealthStatus + #endif @@ -1535,18 +1569,31 @@ int hbrtCommand( int argc, sprintf(*l_output, "ERROR: nvdimm_protection <huid> <0 or 1>"); } } - else if( !strcmp( argv[0], "nvdimm_check_status" ) ) + else if( !strcmp( argv[0], "nvdimm_es_check_status" ) ) { if (argc == 1) { - cmd_nvdimmCheckHealthStatus( *l_output ); + cmd_nvDimmEsCheckHealthStatus( *l_output ); } else { *l_output = new char[100]; - sprintf(*l_output, "Usage: nvdimm_check_status"); + sprintf(*l_output, "Usage: nvdimm_es_check_status"); } } + else if( !strcmp( argv[0], "nvdimm_nvm_check_status" ) ) + { + if (argc == 1) + { + cmd_nvdDmmNvmCheckHealthStatus( *l_output ); + } + else + { + *l_output = new char[100]; + sprintf(*l_output, "Usage: nvdimm_nvm_check_status"); + } + } + #endif else { @@ -1587,8 +1634,11 @@ int hbrtCommand( int argc, #ifdef CONFIG_NVDIMM sprintf( l_tmpstr, "nvdimm_protection <huid> <0 or 1>\n"); strcat( *l_output, l_tmpstr ); - sprintf( l_tmpstr, "nvdimm_check_status\n"); + sprintf( l_tmpstr, "nvdimm_es_check_status\n"); + strcat( *l_output, l_tmpstr ); + sprintf( l_tmpstr, "nvdimm_nvm_check_status\n"); strcat( *l_output, l_tmpstr ); + #endif } diff --git a/src/usr/util/runtime/rt_fwnotify.C b/src/usr/util/runtime/rt_fwnotify.C index e9ebabe6d..350f4d1da 100644 --- a/src/usr/util/runtime/rt_fwnotify.C +++ b/src/usr/util/runtime/rt_fwnotify.C @@ -622,22 +622,40 @@ int doNvDimmOperation(const hostInterfaces::nvdimm_operation_t& i_nvDimmOp) } // end if (nvDimmOp.opType & hostInterfaces::HBRT_FW_NVDIMM_ARM) } while (0); // end Perform the arming/disarming operations. 
- // Perform the health check operation + // Perform the ES (energy source) health check operation if (i_nvDimmOp.opType & hostInterfaces::HBRT_FW_MNFG_ES_HEALTH_CHECK) { - if (!nvDimmCheckHealthStatus(l_nvDimmTargetList)) + if (!nvDimmEsCheckHealthStatus(l_nvDimmTargetList)) { TRACFCOMP(g_trac_runtime, "doNvDimmOperation: " - "Call to do a health check failed."); + "Call to do an ES (energy source) health check failed."); rc = -1; break; } else { TRACFCOMP(g_trac_runtime, "doNvDimmOperation: " - "Call to do a health check succeeded."); + "Call to do an ES (energy source) health check succeeded."); } } + + // Perform the NVM (non-volatile memory) health check operation + if (i_nvDimmOp.opType & hostInterfaces::HBRT_FW_MNFG_NVM_HEALTH_CHECK) + { + if (!nvDimmNvmCheckHealthStatus(l_nvDimmTargetList)) + { + TRACFCOMP(g_trac_runtime, "doNvDimmOperation: " + "Call to do a NVM (non-volatile memory) health check failed."); + rc = -1; + break; + } + else + { + TRACFCOMP(g_trac_runtime, "doNvDimmOperation: " + "Call to do a NVM (non-volatile memory) health check succeeded."); + } + } + } while(0); // end Perform the operations requested if (l_err) |