diff options
author | Chris Cain <cjcain@us.ibm.com> | 2015-01-30 12:10:28 -0600 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2015-02-09 17:58:57 -0600 |
commit | 42c08eb3adaef09645324fb96358c80a18a4b96a (patch) | |
tree | c092bfbbeb6ffabcfbd87f1badc009aadc87a010 | |
parent | eb5642b95b3ba4e2968b727ba616ad6da24ff057 (diff) | |
download | talos-hostboot-42c08eb3adaef09645324fb96358c80a18a4b96a.tar.gz talos-hostboot-42c08eb3adaef09645324fb96358c80a18a4b96a.zip |
OCC Poll Validation and startup checkpoint monitoring
Change-Id: I2f6e6d31ccd10bb6add9d608363db3e5048975a8
RTC: 117248
Reviewed-on: http://gfw160.aus.stglabs.ibm.com:8080/gerrit/15483
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Tested-by: Jenkins Server
-rw-r--r-- | src/include/runtime/interface.h | 50 | ||||
-rw-r--r-- | src/include/usr/htmgt/htmgt_reasoncodes.H | 5 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt.C | 14 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_occ.C | 110 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_occ.H | 34 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_poll.C | 54 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_utility.C | 6 | ||||
-rw-r--r-- | src/usr/htmgt/htmgt_utility.H | 5 | ||||
-rw-r--r-- | src/usr/hwpf/hwp/occ/runtime/rt_occ.C | 84 |
9 files changed, 321 insertions, 41 deletions
diff --git a/src/include/runtime/interface.h b/src/include/runtime/interface.h index da321e231..9f52c09c1 100644 --- a/src/include/runtime/interface.h +++ b/src/include/runtime/interface.h @@ -241,11 +241,16 @@ typedef struct runtimeInterfaces int (*occ_stop)(uint64_t* i_chip, size_t i_num_chips); - /** Reset OCC upon failure - * @param [in]: i_chipId: Id of processor with failing OCC - * @return NONE + /** + * @brief Notify HTMGT that an OCC has an error to report + * + * @details When an OCC has encountered an error that it wants to + * be reported, this interface will be called to trigger + * HTMGT to collect and commit the error. + * + * @param[in] i_chipId ChipID which identifies the OCC reporting an error */ - void (*occ_error) (uint64_t i_chipId); + void (*process_occ_error)(uint64_t i_chipId); /** Enable chip attentions * @@ -271,6 +276,43 @@ typedef struct runtimeInterfaces uint64_t i_ipollStatus, uint64_t i_ipollMask); + /** + * @brief Notify HTMGT that an OCC has failed and needs to be reset + * + * @details When BMC detects an OCC failure that requires a reset, + * this interface will be called to trigger the OCC reset. + * HTMGT maintains a reset count and if there are additional + * resets available, the OCCs get reset/reloaded. + * If the recovery attempts have been exhausted or the OCC + * fails to go active, an unrecoverable error will be logged + * and the system will remain in safe mode. + * + * @param[in] i_chipId ChipID which identifies the failing OCC + */ + void (*process_occ_reset)(uint64_t i_chipId); + + /** + * @brief Change the OCC state + * + * @details This is a blocking call that will change the OCC state. + * The OCCs will only actuate (update processor frequency/ + * voltages) when in Active state. The OCC will only be + * monitoring/observing when in Observation state. + * + * @note When the OCCs are initially started, the state will default + * to Active. 
If the state is changed to Observation, that + * state will be retained until the next IPL. (If the OCC would + * get reset, it would return to the last requested state) + * + * + * @param[in] i_occ_activation set to 0 to move OCC to Observation state + * or any other value to move OCC to Active state + * + * @returns 0 on success, or return code if the state did not + * change. + */ + int (*enable_occ_actuation)(int i_occ_activation); + // Reserve some space for future growth. void (*reserved[32])(void); diff --git a/src/include/usr/htmgt/htmgt_reasoncodes.H b/src/include/usr/htmgt/htmgt_reasoncodes.H index 1c374e3e1..98cdb5ecb 100644 --- a/src/include/usr/htmgt/htmgt_reasoncodes.H +++ b/src/include/usr/htmgt/htmgt_reasoncodes.H @@ -1,11 +1,11 @@ /* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ -/* $Source: src/include/usr/htmgt/htmgtreasoncodes.H $ */ +/* $Source: src/include/usr/htmgt/htmgt_reasoncodes.H $ */ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2014 */ +/* Contributors Listed Below - COPYRIGHT 2014,2015 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -63,6 +63,7 @@ namespace HTMGT HTMGT_RC_OCC_CMD_FAIL = HTMGT_COMP_ID | 0x1B, HTMGT_RC_INVALID_LENGTH = HTMGT_COMP_ID | 0x1C, HTMGT_RC_OCC_ERROR_LOG = HTMGT_COMP_ID | 0x1D, + HTMGT_RC_INVALID_ROLE = HTMGT_COMP_ID | 0x22, HTMGT_RC_INVALID_DATA = HTMGT_COMP_ID | 0x23, HTMGT_RC_OCC_UNAVAILABLE = HTMGT_COMP_ID | 0x24, HTMGT_RC_OCC_START_FAIL = HTMGT_COMP_ID | 0x25, diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C index d38c1df27..1c8cef57e 100644 --- a/src/usr/htmgt/htmgt.C +++ b/src/usr/htmgt/htmgt.C @@ -49,9 +49,6 @@ namespace HTMGT { - const uint32_t HTMGT_DELAY_BEFORE_COMM = 5; - - // Move the OCCs to active state or log unrecoverable error and // stay in safe mode void processOccStartStatus(const bool i_startCompleted, @@ -75,11 +72,6 @@ namespace HTMGT { do { - // TODO: RTC 119831 - remove hardcoded delay - // Delay before communication with OCCs to make sure - // they are ready (since there is no initial attention) - nanosleep(HTMGT_DELAY_BEFORE_COMM, 0); - #ifndef __HOSTBOOT_RUNTIME if (false == occMgr::instance().iv_configDataBuilt) { @@ -97,6 +89,9 @@ namespace HTMGT } #endif + // Make sure OCCs are ready for communication + occMgr::instance().waitForOccCheckpoint(); + // Send poll to establish comm TMGT_INF("Send initial poll to all OCCs to" " establish comm"); @@ -105,7 +100,6 @@ namespace HTMGT { // Continue even if failed (poll will be retried) ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); - l_err = NULL; } // Send ALL config data @@ -125,7 +119,6 @@ namespace HTMGT { // Continue even if failed to update sensor ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); - l_err = NULL; } // @TODO RTC 120059 remove after elog alerts supported @@ -141,7 +134,6 @@ namespace HTMGT if (l_err) { ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); - l_err = NULL; } #endif diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C index 1571076e5..d31694c6d 100644 --- a/src/usr/htmgt/htmgt_occ.C +++ b/src/usr/htmgt/htmgt_occ.C @@ -36,6 +36,9 @@ #include 
<targeting/common/attributes.H> #include <targeting/common/targetservice.H> #include <console/consoleif.H> +#include <sys/time.h> +#include <ecmdDataBufferBase.H> +#include <hwpf/hwp/occ/occAccess.H> namespace HTMGT @@ -57,6 +60,7 @@ namespace HTMGT iv_homer(i_homer), iv_target(i_target), iv_lastPollValid(false), + iv_occsPresent(1 << i_instance), iv_version(0x01) { } @@ -151,6 +155,22 @@ namespace HTMGT } // end Occ::setState() + // Update master occsPresent bits for poll rsp validation + void Occ::updateOccPresentBits(uint8_t i_slavePresent) + { + if (iv_occsPresent & i_slavePresent) + { + // Flag error because multiple OCCs have same chip ID + TMGT_ERR("updateOccPreset: slave 0x%02X already exists (0x%02X)", + i_slavePresent, iv_occsPresent); + iv_needsReset = true; + } + else + { + iv_occsPresent |= i_slavePresent; + } + }; + ///////////////////////////////////////////////////////////////// @@ -355,7 +375,6 @@ namespace HTMGT if (false == needsRetry) { ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); - l_err = NULL; needsRetry = true; } else @@ -390,7 +409,6 @@ namespace HTMGT { TMGT_ERR("_setOccState: Poll all OCCs failed"); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); - l_err = NULL; } // Make sure all OCCs went to active state @@ -463,6 +481,88 @@ namespace HTMGT } // end OccManager::_setOccState() + // Wait for all OCCs to reach communications checkpoint + void OccManager::_waitForOccCheckpoint() + { +#ifdef CONFIG_HTMGT + // Wait up to 10 seconds for all OCCs to be ready (100 * 100ms = 10s) + const size_t NS_BETWEEN_READ = 100 * NS_PER_MSEC; + const size_t READ_RETRY_LIMIT = 100; + + if (iv_occArray.size() > 0) + { + uint8_t retryCount = 0; + bool throttleErrors = false; + + for (std::vector<Occ*>::iterator pOcc = iv_occArray.begin(); + pOcc < iv_occArray.end(); + pOcc++) + { + bool occReady = false; + + while ((!occReady) && (retryCount++ < READ_RETRY_LIMIT)) + { + // Read SRAM response buffer to check for OCC checkpoint + errlHndl_t l_err = NULL; + const 
uint16_t l_length = 8; + ecmdDataBufferBase l_buffer(l_length*8); // convert to bits + l_err = HBOCC::readSRAM((*pOcc)->getTarget(), + OCC_RSP_SRAM_ADDR, + l_buffer); + if (NULL == l_err) + { + // Check response status for checkpoint + if ((0x0E == l_buffer.getByte(6)) && + (0xFF == l_buffer.getByte(7))) + { + TMGT_INF("waitForOccCheckpoint OCC%d ready!", + (*pOcc)->getInstance()); + + occReady = true; + break; + } + } + else + { + if (false == throttleErrors) + { + throttleErrors = true; + TMGT_ERR("waitForOccCheckpoint: error trying to " + "read OCC%d SRAM (rc=0x%04X)", + (*pOcc)->getInstance(), + l_err->reasonCode()); + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); + } + else + { + delete l_err; + l_err = NULL; + } + } + + nanosleep(0, NS_BETWEEN_READ); + } + + if (!occReady) + { + TMGT_ERR("waitForOccCheckpoint OCC%d still NOT ready!", + (*pOcc)->getInstance()); + } + + if ((OCC_ROLE_MASTER != (*pOcc)->getRole()) && + (NULL != iv_occMaster)) + { + // update master occsPresent bit for each slave OCC + iv_occMaster-> + updateOccPresentBits((*pOcc)->getPresentBits()); + } + } + } +#endif + } + + + uint8_t OccManager::getNumOccs() { return Singleton<OccManager>::instance()._getNumOccs(); @@ -499,6 +599,12 @@ namespace HTMGT } + void OccManager::waitForOccCheckpoint() + { + return Singleton<OccManager>::instance()._waitForOccCheckpoint(); + } + + #if 0 // TODO: RTC 115296 void update_occ_data() diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H index 398cf928e..72834346d 100644 --- a/src/usr/htmgt/htmgt_occ.H +++ b/src/usr/htmgt/htmgt_occ.H @@ -176,6 +176,26 @@ namespace HTMGT occStateId getState() { return iv_state; }; + /** + * @brief Return OCCs present bits + * + * @return bitmask representing this OCC position + */ + uint8_t getPresentBits() { return iv_occsPresent; }; + + + /** + * @brief Update OCCs present bits in the master OCC + * + * @note Should only be called for Master OCC. 
This is + * used to ensure the master can see all Slave OCCs + * and that no two slaves have same chip id. + * + * @param[in] i_slavePresent Bitmask for slave OCC to add + */ + void updateOccPresentBits(uint8_t i_slavePresent); + + protected: // Instance number of this OCC: 0 = first physical OCC uint8_t iv_instance; @@ -201,6 +221,8 @@ namespace HTMGT uint8_t iv_lastPollResponse[OCC_POLL_DATA_MIN_SIZE]; // true if lastPollResponse contains valid data bool iv_lastPollValid; + // expected occsPresent byte in POLL response + uint8_t iv_occsPresent; private: @@ -304,6 +326,16 @@ namespace HTMGT + /** + * @brief Wait for all of the OCCs to reach their checkpoint + * state. That indicates that the OCCs are ready to + * communicate and start handling commands. This + * function will wait up to 10 seconds for all OCCs + * before returning to the caller. + */ + void waitForOccCheckpoint(); + + private: Occ * iv_occMaster; std::vector<Occ*> iv_occArray; @@ -354,6 +386,8 @@ namespace HTMGT /* See setOccState() above */ errlHndl_t _setOccState(const occStateId i_state); + void _waitForOccCheckpoint(); + }; typedef Singleton<OccManager> occMgr; diff --git a/src/usr/htmgt/htmgt_poll.C b/src/usr/htmgt/htmgt_poll.C index bd78a0772..29ca416cc 100644 --- a/src/usr/htmgt/htmgt_poll.C +++ b/src/usr/htmgt/htmgt_poll.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2014 */ +/* Contributors Listed Below - COPYRIGHT 2014,2015 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -48,6 +48,8 @@ namespace HTMGT errlHndl_t l_err = NULL; uint8_t * l_poll_rsp = NULL; + TMGT_INF("sendOccPoll(flush=%c)", i_flushAllErrors?'y':'n'); + // Loop through all functional OCCs std::vector<Occ*> occList = occMgr::instance().getOccArray(); for (std::vector<Occ*>::iterator itr = occList.begin(); @@ -57,6 +59,7 @@ namespace HTMGT Occ * occ = (*itr); const uint8_t occInstance = occ->getInstance(); + TMGT_INF("sendOccPoll: Polling OCC%d", occInstance); bool continuePolling = false; size_t elogCount = 10; do @@ -217,6 +220,8 @@ namespace HTMGT if ((OCC_STATE_ACTIVE == pollRsp->state) || (OCC_STATE_OBSERVATION == pollRsp->state)) { + errlHndl_t l_err = NULL; + // Check role status if (((OCC_ROLE_SLAVE == iv_role) && ((pollRsp->status & OCC_STATUS_MASTER) != 0)) || @@ -228,18 +233,59 @@ namespace HTMGT iv_instance, iv_role, pollRsp->status, pollRsp->extStatus); iv_needsReset = true; + /*@ + * @errortype + * @reasoncode HTMGT_RC_INVALID_ROLE + * @moduleid HTMGT_MOD_OCC_POLL + * @userdata1[0-15] OCC instance + * @userdata1[16-31] response state + * @userdata2[0-15] expected role + * @userdata2[16-31] response status byte + * @devdesc Invalid role in POLL response + */ + bldErrLog(l_err, HTMGT_MOD_OCC_POLL, + HTMGT_RC_INVALID_ROLE, + iv_instance, pollRsp->state, + iv_role, pollRsp->status, + ERRORLOG::ERRL_SEV_INFORMATIONAL); + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); // TODO RTC 109224 //iv_resetReason = OCC_RESET_REASON_ERROR; break; } + + if (pollRsp->occsPresent != iv_occsPresent) + { + TMGT_ERR("pollRspHandler: OCC%d present mismatch" + " (expected 0x%02X, but received 0x%02X)", + iv_instance, iv_occsPresent, + pollRsp->occsPresent); + iv_needsReset = true; + /*@ + * @errortype + * @reasoncode HTMGT_RC_INVALID_DATA + * @moduleid HTMGT_MOD_OCC_POLL + * @userdata1[0-15] OCC instance + * @userdata1[16-31] response OCC present + * @userdata2[0-15] expected OCC present + * @userdata2[16-31] response status byte + * @devdesc Invalid OCC present data 
in POLL response + */ + bldErrLog(l_err, HTMGT_MOD_OCC_POLL, + HTMGT_RC_INVALID_DATA, + iv_instance, pollRsp->occsPresent, + iv_occsPresent, pollRsp->status, + ERRORLOG::ERRL_SEV_INFORMATIONAL); + ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); + // TODO RTC 109224 + //iv_resetReason = OCC_RESET_REASON_ERROR; + } } - //iv_requestedFormat = (occCfgDataFormat)pollRsp->requestedCfg; if (pollRsp->requestedCfg != 0x00) { TMGT_INF("pollRspHandler: OCC%d is requesting cfg format" - " 0x%02X", iv_instance, - pollRsp->requestedCfg); + " 0x%02X", iv_instance, pollRsp->requestedCfg); } // Check for state change diff --git a/src/usr/htmgt/htmgt_utility.C b/src/usr/htmgt/htmgt_utility.C index 8a2d61813..5cb689121 100644 --- a/src/usr/htmgt/htmgt_utility.C +++ b/src/usr/htmgt/htmgt_utility.C @@ -1,11 +1,11 @@ /* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ -/* $Source: src/usr/htmgt/tmgtutility.C $ */ +/* $Source: src/usr/htmgt/htmgt_utility.C $ */ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2014 */ +/* Contributors Listed Below - COPYRIGHT 2014,2015 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -36,7 +36,7 @@ namespace HTMGT // Debug flags uint32_t G_debug_data = 0; - uint32_t G_debug_trace = DEBUG_TRACE_FULL_NONVERBOSE; // TODO RTC 117248 + uint32_t G_debug_trace = DEBUG_TRACE_FULL_NONVERBOSE; // Create/Build an Error log and add HTMGT component trace diff --git a/src/usr/htmgt/htmgt_utility.H b/src/usr/htmgt/htmgt_utility.H index d8ef14830..3cb772f7b 100644 --- a/src/usr/htmgt/htmgt_utility.H +++ b/src/usr/htmgt/htmgt_utility.H @@ -1,11 +1,11 @@ /* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. 
*/ /* */ -/* $Source: src/usr/htmgt/tmgtutility.H $ */ +/* $Source: src/usr/htmgt/htmgt_utility.H $ */ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2014 */ +/* Contributors Listed Below - COPYRIGHT 2014,2015 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -49,6 +49,7 @@ #define TMGT_BIN( _fmt_, _args_...) \ TRACFBIN( g_trac_htmgt, _fmt_, ##_args_ ) + inline uint16_t UINT16_GET(const uint8_t * i_ptr) { return (*i_ptr) << 8 | *(i_ptr+1); diff --git a/src/usr/hwpf/hwp/occ/runtime/rt_occ.C b/src/usr/hwpf/hwp/occ/runtime/rt_occ.C index 0d5dc806c..29509a1b9 100644 --- a/src/usr/hwpf/hwp/occ/runtime/rt_occ.C +++ b/src/usr/hwpf/hwp/occ/runtime/rt_occ.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2014 */ +/* Contributors Listed Below - COPYRIGHT 2014,2015 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -33,6 +33,7 @@ #include <errl/errlentry.H> #include <errl/errlmanager.H> #include <util/utillidmgr.H> +#include <htmgt/htmgt.H> // targeting support #include <targeting/common/commontargeting.H> @@ -65,21 +66,76 @@ namespace RT_OCC //------------------------------------------------------------------------ - void occ_error (uint64_t i_chipId) + void process_occ_error (uint64_t i_chipId) { - do +#ifdef CONFIG_HTMGT + TARGETING::Target* l_reportingOccTarget = NULL; + errlHndl_t err = RT_TARG::getHbTarget(i_chipId,l_reportingOccTarget); + if (err) + { + TRACFCOMP (g_fapiTd, ERR_MRK"process_occ_error: getHbTarget" + " failed at %d chipId", i_chipId); + errlCommit (err, HWPF_COMP_ID); + } + else + { + HTMGT::processOccError(l_reportingOccTarget); + } +#else + TRACFCOMP(g_fapiTd, ERR_MRK"Unexpected call to process_occ_error(%d)" + " when HTMGT is not enabled", i_chipId); +#endif + } + + //------------------------------------------------------------------------ + + void process_occ_reset (uint64_t i_chipId) + { +#ifdef CONFIG_HTMGT + 
TARGETING::Target* l_failedOccTarget = NULL; + errlHndl_t err = RT_TARG::getHbTarget(i_chipId,l_failedOccTarget); + if (err) + { + TRACFCOMP (g_fapiTd, ERR_MRK"process_occ_reset: getHbTarget" + " failed at %d chipId", i_chipId); + errlCommit (err, HWPF_COMP_ID); + } + else { - TARGETING::Target* l_failedOccTarget = NULL; - errlHndl_t l_errl =RT_TARG::getHbTarget(i_chipId,l_failedOccTarget); - if (l_errl) + HTMGT::processOccReset(l_failedOccTarget); + } +#else + TRACFCOMP(g_fapiTd, ERR_MRK"Unexpected call to process_occ_reset(%d)" + " when HTMGT is not enabled", i_chipId); +#endif + } + + //------------------------------------------------------------------------ + + int enable_occ_actuation (int i_occ_activation) + { + int rc = 0; +#ifdef CONFIG_HTMGT + errlHndl_t err = HTMGT::enableOccActuation(0 != i_occ_activation); + if (err) + { + rc = err->reasonCode(); + if (0 == rc) { - TRACFCOMP (g_fapiTd, "occ_error: getHbTarget failed at %d chipId", i_chipId); - errlCommit (l_errl, HWPF_COMP_ID); - break; + // If there was a failure, be sure to return non-zero status + rc = -1; } - //TODO RTC: 114906 - //HTMGT::htmgtProcessOccError(l_failedOccTarget); - } while (0); + TRACFCOMP (g_fapiTd,ERR_MRK"enable_occ_actuation: OCC state change" + " failed with rc=0x%04X (actuate=%d)", + err->reasonCode(), i_occ_activation); + errlCommit (err, HWPF_COMP_ID); + } +#else + rc = -1; + TRACFCOMP(g_fapiTd,ERR_MRK"Unexpected call to enable_occ_actuation(%d)" + " when HTMGT is not enabled", i_occ_activation); +#endif + return rc; } //------------------------------------------------------------------------ @@ -368,7 +424,9 @@ namespace RT_OCC rt_intf->occ_load = &executeLoadOCC; rt_intf->occ_start = &executeStartOCCs; rt_intf->occ_stop = &executeStopOCCs; - rt_intf->occ_error = &occ_error; + rt_intf->process_occ_error = &process_occ_error; + rt_intf->process_occ_reset = &process_occ_reset; + rt_intf->enable_occ_actuation = &enable_occ_actuation; // If we already loaded OCC during the IPL we 
need to fix up // the virtual address because we're now not using virtual |