From 153dcaa15050071e25b0177618ad0c9d829606e2 Mon Sep 17 00:00:00 2001 From: Roland Veloz Date: Tue, 13 Aug 2019 19:47:39 -0500 Subject: Implement an HBRT interface to log a gard event from PHYP/OPAL Added a firmware notify interface, gard_event_t, to accept a gard event message, from PHYP/OPAL, to log that event. Change-Id: I9bcf684f0850c9a07ab7d46635aa07a2c1e9917c RTC: 210201 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/82199 Tested-by: Jenkins Server Reviewed-by: Matt Derksen Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins Reviewed-by: Christian R Geddes Reviewed-by: Daniel M Crowell --- src/include/runtime/README.md | 9 +- src/include/runtime/interface.h | 58 +++++++++-- src/include/usr/runtime/runtime_reasoncodes.H | 2 + src/usr/util/runtime/rt_fwnotify.C | 136 ++++++++++++++++++++++++++ 4 files changed, 191 insertions(+), 14 deletions(-) diff --git a/src/include/runtime/README.md b/src/include/runtime/README.md index 8a4e7298a..68b7382be 100755 --- a/src/include/runtime/README.md +++ b/src/include/runtime/README.md @@ -2,7 +2,7 @@ How to create an HBRT to FW request message interface 0) If passing an HBRT to FSP via MBOX or receiving a firmware notify message, then use instruction 'generic_hbrt_fsp_message.H::GenericFspMboxMessage_t' - below. + and/or 'How to create an HBRT Firmware Notify message' below. 1) The biggest part will be defining the interface. Inspect the current interfaces (req_hcode_update, error_log, etc) for inspiration. 2) Once an interface has been designed, add it to the anonymous @@ -35,13 +35,14 @@ How to create an HBRT to FW request message interface 1) The biggest part will be defining the interface. Inspect the current interfaces (AttributeSetter_t, SingleScomOpHbrtFspData_t, TargetDeconfigHbrtFspData_t, etc) for inspiration. - 2) Once an interface has been designed, add the structure to this file - with the other interfaces. 
+ 2) Once an interface has been designed, add the structure to the file, + generic_hbrt_fsp_message.H, among the other interfaces. 3) Create an MBOX message queue enum for the interface and add to: /hostboot/src/include/usr/mbox/mbox_queues.H::queue_id_t see current message queues for example 4) Add a new message type for the interface to: - enum generic_hbrt_fsp_message.H::GENERIC_FSP_MBOX_MESSAGE_MSG_TYPE. + enum GenericFspMboxMessage_t::GENERIC_FSP_MBOX_MESSAGE_MSG_TYPE in + file generic_hbrt_fsp_message.H. 5) How to use the new interface to pass a message a) Make sure g_hostInterfaces and g_hostInterfaces->firmware_request are not NULL. diff --git a/src/include/runtime/interface.h b/src/include/runtime/interface.h index 9a0cb478e..152226d54 100644 --- a/src/include/runtime/interface.h +++ b/src/include/runtime/interface.h @@ -552,17 +552,18 @@ typedef struct hostInterfaces enum // hbrt_fw_msg::io_type the struct associated with io_type { - HBRT_FW_MSG_TYPE_REQ_NOP = 0, - HBRT_FW_MSG_TYPE_RESP_NOP = 1, // struct resp_generic - HBRT_FW_MSG_TYPE_RESP_GENERIC = 2, // struct resp_generic - HBRT_FW_MSG_TYPE_REQ_HCODE_UPDATE = 3, // struct req_hcode_update - HBRT_FW_MSG_HBRT_FSP_REQ = 4, // struct GenericFspMboxMessage_t - HBRT_FW_MSG_TYPE_ERROR_LOG = 5, // struct error_log - HBRT_FW_MSG_HBRT_FSP_RESP = 6, // struct GenericFspMboxMessage_t - HBRT_FW_MSG_TYPE_I2C_LOCK = 7, // struct req_i2c_lock - HBRT_FW_MSG_TYPE_SBE_STATE = 8, // struct sbe_state - HBRT_FW_MSG_TYPE_NVDIMM_PROTECTION = 9, // struct nvdimm_protection_state + HBRT_FW_MSG_TYPE_REQ_NOP = 0, + HBRT_FW_MSG_TYPE_RESP_NOP = 1, // struct resp_generic + HBRT_FW_MSG_TYPE_RESP_GENERIC = 2, // struct resp_generic + HBRT_FW_MSG_TYPE_REQ_HCODE_UPDATE = 3, // struct req_hcode_update + HBRT_FW_MSG_HBRT_FSP_REQ = 4, // struct GenericFspMboxMessage_t + HBRT_FW_MSG_TYPE_ERROR_LOG = 5, // struct error_log + HBRT_FW_MSG_HBRT_FSP_RESP = 6, // struct GenericFspMboxMessage_t + HBRT_FW_MSG_TYPE_I2C_LOCK = 7, // struct 
req_i2c_lock + HBRT_FW_MSG_TYPE_SBE_STATE = 8, // struct sbe_state + HBRT_FW_MSG_TYPE_NVDIMM_PROTECTION = 9, // struct nvdimm_protection_state HBRT_FW_MSG_TYPE_NVDIMM_OPERATION = 10, // struct nvdimm_operation_t + HBRT_FW_MSG_TYPE_GARD_EVENT = 11, // struct gard_event_t }; // NVDIMM protection state enum @@ -611,6 +612,39 @@ typedef struct hostInterfaces // see @note associated with NVDIMM_Op_t above } __attribute__ ((packed)); + // Gard event error type + // @note This needs to stay in sync with the FSP Mailbox specification for + // command : Gard-able Error Detected - cmd 0xCE, s/c 0x63, mod 01 + enum GARD_ERROR_t: uint32_t + { + HBRT_GARD_ERROR_UNKNOWN = 0x0000, + HBRT_GARD_ERROR_COMPUTATION_TEST_FAILURE = 0x0001, + HBRT_GARD_ERROR_SLB = 0x0002, + HBRT_GARD_ERROR_CHIP_TOD_FAILURE = 0x0003, + HBRT_GARD_ERROR_TIMEFAC_FAILURE = 0x0004, + HBRT_GARD_ERROR_PROC_RECOVERY_THRESHOLD = 0x0005, + HBRT_GARD_ERROR_NX = 0x0008, + HBRT_GARD_ERROR_SLW = 0x0009, + HBRT_GARD_ERROR_CAPP_UNIT = 0x000A, + + // Mark the end of the gard error types. 
+ // This is not valid, just a marker + HBRT_GARD_ERROR_LAST, + }; + + // Gard event (PHYP/OPAL -> HBRT) + struct gard_event_t + { + GARD_ERROR_t i_error_type; // Gard event error type enum + uint32_t i_procId; // Processor ID for + // error types 0x0001 to 0x0005 + // Chip ID for + // error types 0x0008 to 0x000A + uint32_t i_plid; // Platform log identifier + uint16_t i_sub_unit_mask; // Currently not being used + uint16_t i_recovery_level; // Currently not being used + } __attribute__ ((packed)); + struct hbrt_fw_msg // define struct hbrt_fw_msg { hbrt_fw_msg() { req_hcode_update = { 0 }; }; // ctor @@ -685,6 +719,10 @@ typedef struct hostInterfaces // io_type set to HBRT_FW_MSG_TYPE_NVDIMM_OPERATION struct nvdimm_operation_t nvdimm_operation; + // This struct is sent from PHYP/OPAL to HBRT with + // io_type set to HBRT_FW_MSG_TYPE_GARD_EVENT + struct gard_event_t gard_event; + // This struct is sent from HBRT with // io_type set to HBRT_FW_MSG_HBRT_FSP_REQ or // HBRT_FW_MSG_HBRT_FSP_RESP diff --git a/src/include/usr/runtime/runtime_reasoncodes.H b/src/include/usr/runtime/runtime_reasoncodes.H index 5cfdce5c2..b54f29751 100644 --- a/src/include/usr/runtime/runtime_reasoncodes.H +++ b/src/include/usr/runtime/runtime_reasoncodes.H @@ -147,6 +147,8 @@ namespace RUNTIME RC_NO_SPACE_FOR_ATTRIBUTE_SERIALIZATION = RUNTIME_COMP_ID | 0x47, RC_CANNOT_MAKE_ATTRIBUTE = RUNTIME_COMP_ID | 0x48, RT_NO_OMI_TARGET_FOUND = RUNTIME_COMP_ID | 0x49, + RC_LOG_GARD_EVENT_UNKNOWN_ERROR_TYPE = RUNTIME_COMP_ID | 0x4A, + RC_LOG_GARD_EVENT = RUNTIME_COMP_ID | 0x4B, }; enum UserDetailsTypes diff --git a/src/usr/util/runtime/rt_fwnotify.C b/src/usr/util/runtime/rt_fwnotify.C index a96b24bba..e9ebabe6d 100644 --- a/src/usr/util/runtime/rt_fwnotify.C +++ b/src/usr/util/runtime/rt_fwnotify.C @@ -652,6 +652,117 @@ int doNvDimmOperation(const hostInterfaces::nvdimm_operation_t& i_nvDimmOp) return rc; } +/** + * @brief Log the gard event from PHYP/OPAL + * + * @param[in] i_gardEvent - The details 
of the gard event + * @see hostInterfaces::gard_event_t for more info + * + **/ +void logGardEvent(const hostInterfaces::gard_event_t& i_gardEvent) +{ + // Trace input components + TRACFCOMP(g_trac_runtime, + ENTER_MRK"logGardEvent: Gard Event Data: " + "error type(0x%.8X), processor ID(0x%.8X), " + "PLID(0x%.8X), sub unit mask(0x%.4X), " + "recovery level(0x%.4X)", + i_gardEvent.i_error_type, + i_gardEvent.i_procId, + i_gardEvent.i_plid, + i_gardEvent.i_sub_unit_mask, + i_gardEvent.i_recovery_level); + + errlHndl_t l_err{nullptr}; + + do + { + // Make sure the error type is valid, if not, log it + if ((i_gardEvent.i_error_type == hostInterfaces::HBRT_GARD_ERROR_UNKNOWN ) || + (i_gardEvent.i_error_type >= hostInterfaces::HBRT_GARD_ERROR_LAST) ) + { + TRACFCOMP(g_trac_runtime, "logGardEvent: ERROR: unknown/invalid " + "error type 0x%.8X", + i_gardEvent.i_error_type); + + /* @ + * @errortype + * @severity ERRL_SEV_PREDICTIVE + * @moduleid MOD_RT_FIRMWARE_NOTIFY + * @reasoncode RC_LOG_GARD_EVENT_UNKNOWN_ERROR_TYPE + * @userdata1[0:31] GARD error type + * @userdata1[32:63] Processor ID + * @userdata2[0:31] Sub unit mask + * @userdata2[32:63] Recovery level + * @devdesc Unknown/invalid error type + * @custdesc Internal firmware error + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + MOD_RT_FIRMWARE_NOTIFY, + RC_LOG_GARD_EVENT_UNKNOWN_ERROR_TYPE, + TWO_UINT32_TO_UINT64( + i_gardEvent.i_error_type, + i_gardEvent.i_procId), + TWO_UINT32_TO_UINT64( + i_gardEvent.i_sub_unit_mask, + i_gardEvent.i_recovery_level), + ErrlEntry::ADD_SW_CALLOUT); + break; + } + + + // Get the Target associated with processor ID + TARGETING::TargetHandle_t l_procTarget{nullptr}; + l_err = RT_TARG::getHbTarget(i_gardEvent.i_procId, l_procTarget); + if (l_err) + { + TRACFCOMP(g_trac_runtime, "logGardEvent: Error getting " + "HB Target from processor ID 0x%0X, " + "exiting ...", + i_gardEvent.i_procId); + break; + } + + // Log the GARD event + /* @ + * @errortype + * @severity ERRL_SEV_PREDICTIVE 
+ * @moduleid MOD_RT_FIRMWARE_NOTIFY + * @reasoncode RC_LOG_GARD_EVENT + * @userdata1[0:31] GARD error type + * @userdata1[32:63] Processor ID + * @userdata2[0:31] Sub unit mask + * @userdata2[32:63] Recovery level + * @devdesc Gard event from Opal/Phyp + * @custdesc Hardware error detected at runtime + */ + l_err = new ErrlEntry( ERRL_SEV_PREDICTIVE, + MOD_RT_FIRMWARE_NOTIFY, + RC_LOG_GARD_EVENT, + TWO_UINT32_TO_UINT64( + i_gardEvent.i_error_type, + i_gardEvent.i_procId), + TWO_UINT32_TO_UINT64( + i_gardEvent.i_sub_unit_mask, + i_gardEvent.i_recovery_level)); + + // Set the PLID to the given gard event PLID if it exists + if (i_gardEvent.i_plid) + { + l_err->plid(i_gardEvent.i_plid); + } + + // Do the actual gard + l_err->addHwCallout( l_procTarget, HWAS::SRCI_PRIORITY_MED, + HWAS::NO_DECONFIG, HWAS::GARD_PHYP); + } while(0); + + // Commit any error log that occurred. + errlCommit(l_err, RUNTIME_COMP_ID); + + TRACFCOMP(g_trac_runtime, EXIT_MRK"logGardEvent") +} + /** * @see src/include/runtime/interface.h for definition of call * @@ -764,6 +875,31 @@ void firmware_notify( uint64_t i_len, void *i_data ) } // END case hostInterfaces::HBRT_FW_MSG_TYPE_NVDIMM_OPERATION: break; + case hostInterfaces::HBRT_FW_MSG_TYPE_GARD_EVENT: + { + uint64_t l_minMsgSize = hostInterfaces::HBRT_FW_MSG_BASE_SIZE + + sizeof(hostInterfaces::hbrt_fw_msg::gard_event); + if (i_len < l_minMsgSize) + { + l_badMessage = true; + + TRACFCOMP(g_trac_runtime, ERR_MRK"firmware_notify: " + "Received message HBRT_FW_MSG_TYPE_GARD_EVENT, " + "but size of message data(%d) is not adequate for a " + "complete message of this type, with size requirement of " + "%d", i_len, l_minMsgSize ); + + // Pack user data 1 with the message input type, the only + // data that can be safely retrieved + l_userData1 = l_hbrt_fw_msg->io_type; + + break; + } + + logGardEvent(l_hbrt_fw_msg->gard_event); + } // END case hostInterfaces::HBRT_FW_MSG_TYPE_GARD_EVENT: + break; + default: { l_badMessage = true; -- cgit v1.2.1