/* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ /* $Source: src/usr/htmgt/htmgt_occ.C $ */ /* */ /* OpenPOWER HostBoot Project */ /* */ /* Contributors Listed Below - COPYRIGHT 2014,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ /* You may obtain a copy of the License at */ /* */ /* http://www.apache.org/licenses/LICENSE-2.0 */ /* */ /* Unless required by applicable law or agreed to in writing, software */ /* distributed under the License is distributed on an "AS IS" BASIS, */ /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ /* implied. See the License for the specific language governing */ /* permissions and limitations under the License. */ /* */ /* IBM_PROLOG_END_TAG */ #include #include #include "htmgt_utility.H" #include "htmgt_occcmd.H" #include "htmgt_cfgdata.H" #include "htmgt_occ.H" #include "htmgt_poll.H" #include #include #include #include #include #include #include #include #include #include #include namespace HTMGT { Occ::Occ(const uint8_t i_instance, const bool i_masterCapable, uint8_t * i_homer, TARGETING::TargetHandle_t i_target, const occRole i_role) :iv_instance(i_instance), iv_masterCapable(i_masterCapable), iv_role(i_role), iv_state(OCC_STATE_UNKNOWN), iv_commEstablished(false), iv_needsReset(false), iv_needsWofReset(false), iv_wofResetCount(0), iv_wofResetReasons(0), iv_failed(false), iv_seqNumber(0), iv_homer(i_homer), iv_target(i_target), iv_lastPollValid(false), iv_occsPresent(1 << i_instance), iv_gpuCfg(0), iv_resetReason(OCC_RESET_REASON_NONE), iv_exceptionLogged(0), iv_resetCount(0), iv_version(0x01) { } Occ::~Occ() { } // Return true if specified status bit is set in last poll response bool Occ::statusBitSet(const uint8_t i_statusBit) { bool isSet = false; if (iv_lastPollValid) { const occPollRspStruct_t *lastPoll = (occPollRspStruct_t*)iv_lastPollResponse; isSet = ((lastPoll->status & i_statusBit) == i_statusBit); } return isSet; } // Set state of the OCC errlHndl_t Occ::setState(const occStateId i_state) { errlHndl_t l_err = nullptr; if (OCC_ROLE_MASTER == iv_role) { const uint8_t l_cmdData[3] = { 0x00, // version i_state, 0x00 // reserved }; OccCmd cmd(this, OCC_CMD_SET_STATE, sizeof(l_cmdData), l_cmdData); l_err = cmd.sendOccCmd(); if (l_err != nullptr) { TMGT_ERR("setState: Failed to set OCC%d state, rc=0x%04X", iv_instance, l_err->reasonCode()); } else { if (OCC_RC_SUCCESS != cmd.getRspStatus()) { TMGT_ERR("setState: Set OCC%d state failed" " with OCC status 0x%02X", iv_instance, cmd.getRspStatus()); /*@ * @errortype * @moduleid HTMGT_MOD_OCC_SET_STATE * @reasoncode HTMGT_RC_OCC_CMD_FAIL * @userdata1[0-31] OCC instance * @userdata1[32-63] Requested state * @userdata2[0-31] OCC response status * @userdata2[32-63] current OCC state * @devdesc Set of OCC state failed */ bldErrLog(l_err, HTMGT_MOD_OCC_SET_STATE, HTMGT_RC_OCC_CMD_FAIL, iv_instance, i_state, cmd.getRspStatus(), iv_state, ERRORLOG::ERRL_SEV_INFORMATIONAL); } } } else { TMGT_ERR("setState: State only allowed to be set on master OCC"); /*@ * @errortype * @moduleid HTMGT_MOD_OCC_SET_STATE * @reasoncode HTMGT_RC_INTERNAL_ERROR * @userdata1 OCC instance * @userdata2 Requested state * @devdesc Set state only allowed on master OCC */ bldErrLog(l_err, HTMGT_MOD_OCC_SET_STATE, HTMGT_RC_INTERNAL_ERROR, 0, iv_instance, 0, i_state, ERRORLOG::ERRL_SEV_INFORMATIONAL); } return l_err; } // end Occ::setState() // Update master occsPresent bits for poll rsp validataion void Occ::updateOccPresentBits(uint8_t i_slavePresent) { if (iv_occsPresent & i_slavePresent) { // Flag error because multiple OCCs have same chip ID TMGT_ERR("updateOccPresentBits: slave 0x%02X already " "exists (0x%02X)", i_slavePresent, iv_occsPresent); iv_needsReset = true; } else { iv_occsPresent |= i_slavePresent; } }; // Reset OCC bool Occ::resetPrep() { errlHndl_t err = nullptr; bool atThreshold = false; // Send resetPrep command uint8_t cmdData[2]; cmdData[0] = OCC_RESET_CMD_VERSION; TMGT_INF("resetPrep: OCC%d (failed=%c, reset count=%d)" " reset reason=0x%x", iv_instance, iv_failed?'y':'n', iv_resetCount, iv_resetReason); if(iv_failed) { cmdData[1] = OCC_RESET_FAIL_THIS_OCC; ++iv_resetCount; TMGT_INF("resetPrep: OCC%d failed, incrementing reset count to %d", iv_instance, iv_resetCount); if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD) { atThreshold = true; } } else if( iv_needsWofReset ) //If WOF reset, increment count { iv_wofResetCount++; TMGT_INF("resetPrep(): WOF reset requested. Reset Count = %d", iv_wofResetCount ); } else { cmdData[1] = OCC_RESET_FAIL_THIS_OCC; } if (iv_commEstablished) { OccCmd cmd(this, OCC_CMD_RESET_PREP, sizeof(cmdData), cmdData); err = cmd.sendOccCmd(); if(err) { // log error and keep going TMGT_ERR("OCC::resetPrep: OCC%d resetPrep failed, rc=0x%04x", iv_instance, err->reasonCode()); ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } // poll and flush error logs from OCC - Check Ex return code err = pollForErrors(true); if(err) { ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } // else comm to OCC has not been established yet return atThreshold; } void Occ::postResetClear() { iv_state = OCC_STATE_UNKNOWN; iv_commEstablished = false; iv_needsReset = false; iv_needsWofReset = false; iv_failed = false; iv_lastPollValid = false; iv_resetReason = OCC_RESET_REASON_NONE; iv_exceptionLogged = 0; } // Add channel 1 (circular buffer) SCOM data to elog void Occ::collectCheckpointScomData(errlHndl_t i_err) { if (i_err) { TARGETING::ConstTargetHandle_t procTarget = TARGETING::getParentChip(iv_target); ERRORLOG::ErrlUserDetailsLogRegister l_scom_data(procTarget); // Grab circular buffer scom data: (channel 1) l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C020));//OCB_OCI_IOSR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C024));//OCB_OCI_OIMR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C210));//OCB_OCI_OCBSLBR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C211));//OCB_OCI_OCBSLCS1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C213));//OCB_OCI_OCBSHBR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C214));//OCB_OCI_OCBSHCS1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C216));//OCB_OCI_OCBSES1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C217));//OCB_OCI_OCBICR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C218));//OCB_OCI_OCBLWCR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C21A));//OCB_OCI_OCBLWSR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6C21C));//*_OCI_OCBLWSBR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6D010));//OCB_PIB_OCBAR0 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6D030));//OCB_PIB_OCBAR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6D031));//OCB_PIB_OCBCSR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6D034));//OCB_PIB_OCBESR1 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6D050));//OCB_PIB_OCBAR2 l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x6D070));//OCB_PIB_OCBAR3 l_scom_data.addToLog(i_err); } else { TMGT_ERR("collectCheckpointScomData: No error " "handle supplied for OCC%d", iv_instance); } } // end Occ::collectCheckpointScomData() // Utility function to actually add trace buffer data void addOccTraceBuffer( errlHndl_t & io_errl, TARGETING::Target * i_pOcc, uint32_t i_address ) { errlHndl_t l_errl = nullptr; uint8_t l_sramData[OCC_TRACE_BUFFER_SIZE]; //Initialize to 0; memset(l_sramData, 0, OCC_TRACE_BUFFER_SIZE); l_errl = HBOCC::readSRAM(i_pOcc, i_address, reinterpret_cast(l_sramData), OCC_TRACE_BUFFER_SIZE ); if ( ( l_errl == nullptr ) && ( l_sramData != 0 ) && ( io_errl != nullptr ) ) { // Strip off all but last 32 bytes of 00s uint32_t l_dataSize; // find first byte of data that is not 00s for( l_dataSize = OCC_TRACE_BUFFER_SIZE; ( l_dataSize > 32) && (l_sramData[l_dataSize-1] == 0x00); --l_dataSize); // pad 32 bytes l_dataSize += 32; if(l_dataSize > OCC_TRACE_BUFFER_SIZE) { l_dataSize = OCC_TRACE_BUFFER_SIZE; } // Add trace buffer to error log io_errl->addFFDC( HTMGT_COMP_ID, l_sramData, l_dataSize, 1, //version SUBSEC_ADDITIONAL_SRC ); #ifdef CONFIG_CONSOLE_OUTPUT_OCC_COMM char header[64]; sprintf(header, "OCC Trace 0x%08X: (0x%04X bytes)", i_address, l_dataSize); dumpToConsole(header, (const uint8_t *)l_sramData, l_dataSize); #endif } else { TMGT_ERR("addOccTraceBuffers: Unable to read OCC trace " "buffer from SRAM address (0x%08X)", i_address ); if( l_errl != nullptr ) { ERRORLOG::errlCommit(l_errl, HTMGT_COMP_ID); } } } void Occ::addOccTrace( errlHndl_t & io_errl ) { // Add ERR trace buffer addOccTraceBuffer( io_errl, iv_target, OCC_TRACE_ERR ); // Add IMP trace buffer addOccTraceBuffer( io_errl, iv_target, OCC_TRACE_IMP ); // Add INF trace buffer addOccTraceBuffer( io_errl, iv_target, OCC_TRACE_INF ); } // Notify HostBoot which GPUs are present (after OCC goes active) void Occ::updateGpuPresence() { TARGETING::ConstTargetHandle_t const_proc_target = TARGETING::getParentChip(iv_target); SENSOR::StatusSensor::statusEnum gpu_status[MAX_GPUS] = { SENSOR::StatusSensor::NOT_PRESENT, SENSOR::StatusSensor::NOT_PRESENT, SENSOR::StatusSensor::NOT_PRESENT }; if (iv_gpuCfg & GPUCFG_GPU0_PRESENT) gpu_status[0] = SENSOR::StatusSensor::PRESENT_FUNCTIONAL; if (iv_gpuCfg & GPUCFG_GPU1_PRESENT) gpu_status[1] = SENSOR::StatusSensor::PRESENT_FUNCTIONAL; if (iv_gpuCfg & GPUCFG_GPU2_PRESENT) gpu_status[2] = SENSOR::StatusSensor::PRESENT_FUNCTIONAL; TMGT_INF("updateGpuPresence: OCC%d - GPU0:%d, GPU1:%d, GPU2:%d", iv_instance, gpu_status[0], gpu_status[1], gpu_status[2]); SENSOR::updateGpuSensorStatus(const_cast (const_proc_target), gpu_status); } ///////////////////////////////////////////////////////////////// uint32_t OccManager::cv_safeReturnCode = 0; uint32_t OccManager::cv_safeOccInstance = 0; OccManager::OccManager() :iv_occMaster(nullptr), iv_state(OCC_STATE_UNKNOWN), iv_targetState(OCC_STATE_ACTIVE), iv_sysResetCount(0) { } OccManager::~OccManager() { _removeAllOccs(); } // Remove all OCC objects void OccManager::_removeAllOccs() { iv_occMaster = nullptr; if (iv_occArray.size() > 0) { for( const auto & occ : iv_occArray ) { TMGT_INF("removeAllOccs: Removing OCC%d", occ->getInstance()); delete occ; } iv_occArray.clear(); } } // Query the functional OCCs and build OCC objects errlHndl_t OccManager::_buildOccs(const bool i_occStart, const bool i_skipComm) { errlHndl_t err = nullptr; bool safeModeNeeded = false; TMGT_INF("_buildOccs called"); // Only build OCC objects once. if((iv_occArray.size() > 0) && (iv_occMaster != nullptr)) { TMGT_INF("_buildOccs: Existing OCC Targets kept = %d", iv_occArray.size()); return err; } // Remove existing OCC objects _removeAllOccs(); // Get all functional processors TARGETING::TargetHandleList pProcs; TARGETING::getChipResources(pProcs, TARGETING::TYPE_PROC, TARGETING::UTIL_FILTER_FUNCTIONAL); if (pProcs.size() > 0) { // for each functional processor for(const auto & proc : pProcs ) { // Instance number for this Processor/OCC const uint8_t instance = proc->getAttr(); TMGT_INF("_buildOccs: PROC%d is functional", instance); // Get HOMER virtual address uint8_t * homer = (uint8_t*) (proc->getAttr()); const uint8_t * homerPhys = (uint8_t*) (proc->getAttr()); TMGT_INF("_buildOccs: homer = 0x%08llX (virt) / 0x%08llX (phys)" " for Proc%d", homer, homerPhys, instance); //TODO: Remove once Boston Simics Model works with OCC #ifdef SIMICS_TESTING // Starting of OCCs is not supported in SIMICS, so fake out // HOMER memory area for testing if (nullptr == homer) { extern uint8_t * G_simicsHomerBuffer; if (nullptr == G_simicsHomerBuffer) { // Allocate a fake HOMER area G_simicsHomerBuffer = new uint8_t [OCC_CMD_ADDR+0x2000]; } homer = G_simicsHomerBuffer; TMGT_ERR("_buildOccs: Using hardcoded HOMER of 0x%08lX", homer); } #endif if ((nullptr != homer) && (nullptr != homerPhys)) { // Get functional OCC (one per proc) TARGETING::TargetHandleList occs; getChildChiplets(occs, proc, TARGETING::TYPE_OCC); if (occs.size() > 0) { const unsigned long huid = occs[0]->getAttr(); const bool masterCapable = occs[0]-> getAttr(); TMGT_INF("_buildOccs: Found OCC%d - HUID: 0x%0lX, " "masterCapable: %c, homer: 0x%0lX", instance, huid, masterCapable?'Y':'N', homer); _addOcc(instance, masterCapable, homer, occs[0]); } else { // OCC must not be functional TMGT_ERR("_buildOccs: OCC%d not functional", instance); } } else { // OCC will not be functional with no HOMER address TMGT_ERR("_buildOccs: HOMER address for OCC%d is nullptr!", instance); safeModeNeeded = true; if (nullptr == err) { /*@ * @errortype * @moduleid HTMGT_MOD_BUILD_OCCS * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE * @userdata1 OCC Instance * @userdata2 homer virtual address * @devdesc Homer pointer is nullptr, unable to * communicate with the OCCs. * Leaving system in safe mode. */ bldErrLog(err, HTMGT_MOD_BUILD_OCCS, HTMGT_RC_OCC_CRIT_FAILURE, 0, instance, (uint64_t)homer>>32, (uint64_t)homer&0xFFFFFFFF, ERRORLOG::ERRL_SEV_UNRECOVERABLE); } } } // for each processor if (nullptr != iv_occMaster) { // update master occsPresent bit for each slave OCC for( const auto & occ : iv_occArray ) { if(occ != iv_occMaster) { iv_occMaster-> updateOccPresentBits(occ->getPresentBits()); } } } } else { TMGT_ERR("_buildOccs: No functional processors found"); } if (0 == _getNumOccs()) { TMGT_ERR("_buildOccs: Unable to find any functional OCCs"); if (nullptr == err) { /*@ * @errortype * @reasoncode HTMGT_RC_OCC_UNAVAILABLE * @moduleid HTMGT_MOD_BUILD_OCCS * @userdata1 functional processor count * @devdesc No functional OCCs were found */ bldErrLog(err, HTMGT_MOD_BUILD_OCCS, HTMGT_RC_OCC_UNAVAILABLE, 0, pProcs.size(), 0, 0, ERRORLOG::ERRL_SEV_UNRECOVERABLE); } safeModeNeeded = true; } if ((false == i_occStart) && (nullptr == err) && (false == i_skipComm)) { // Send poll to query state of all OCCs // and flush any errors reported by the OCCs err = sendOccPoll(true); if (err) { TMGT_ERR("_buildOccs: Poll all OCCs failed"); ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } _syncOccStates(); } if (safeModeNeeded) { // Clear OCC active sensors errlHndl_t err2 = setOccActiveSensors(false); if (err2) { TMGT_ERR("_buildOccs: Set OCC active sensor to false failed."); ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); } // Reset all OCCs TMGT_INF("_buildOccs: Calling HBPM::resetPMAll"); err2 = HBPM::resetPMAll(); if (nullptr != err2) { TMGT_ERR("_buildOccs: HBPM::resetPMAll failed with rc 0x%04X", err2->reasonCode()); err2->collectTrace("HTMGT"); ERRORLOG::errlCommit(err2, HTMGT_COMP_ID); } updateForSafeMode(err); } TMGT_INF("_buildOccs: OCC Targets found = %d", _getNumOccs()); return err; } // end OccManager::_buildOccs() // Add a functional OCC to be monitored void OccManager::_addOcc(const uint8_t i_instance, const bool i_masterCapable, uint8_t * i_homer, TARGETING::TargetHandle_t i_target) { TMGT_INF("addOcc(%d, masterCapable=%c)", i_instance, i_masterCapable?'y':'n'); occRole role = OCC_ROLE_SLAVE; if (true == i_masterCapable) { if (nullptr == iv_occMaster) { // No master assigned yet, use this OCC TMGT_INF("addOcc: OCC%d will be the master", i_instance); role = OCC_ROLE_MASTER; } else { role = OCC_ROLE_BACKUP_MASTER; } } Occ * l_occ = new Occ(i_instance, i_masterCapable, i_homer, i_target, role); // Add OCC to the array iv_occArray.push_back(l_occ); if (OCC_ROLE_MASTER == role) { iv_occMaster = l_occ; } } // end OccManager::_addOcc() // Get pointer to specified OCC Occ * OccManager::_getOcc(const uint8_t i_instance) { Occ *targetOcc = nullptr; for( const auto & occ : iv_occArray ) { if (occ->getInstance() == i_instance) { targetOcc = occ; break; } } return targetOcc; } // eng OccManager::_getOcc() // Set the OCC state errlHndl_t OccManager::_setOccState(const occStateId i_state) { errlHndl_t l_err = nullptr; occStateId requestedState = i_state; if (OCC_STATE_NO_CHANGE == i_state) { // If no state was requested use the target state requestedState = iv_targetState; } if ((requestedState == OCC_STATE_ACTIVE) || (requestedState == OCC_STATE_OBSERVATION) || (requestedState == OCC_STATE_CHARACTERIZATION) ) { // Function is only called on initial IPL and when user/mfg // requests a new state, so we can update target here. iv_targetState = requestedState; l_err = _buildOccs(); // if not already built. if (nullptr == l_err) { // Send poll cmd to confirm comm has been established. // Flush old errors to ensure any new errors will be collected l_err = _sendOccPoll(true, nullptr); if (l_err) { TMGT_ERR("_setOccState: Poll OCCs failed."); // Proceed with reset even if failed ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } if (nullptr != iv_occMaster) { TMGT_INF("_setOccState(state=0x%02X)", requestedState); const uint8_t occInstance = iv_occMaster->getInstance(); bool needsRetry = false; do { l_err = iv_occMaster->setState(requestedState); if (nullptr == l_err) { needsRetry = false; } else { TMGT_ERR("_setOccState: Failed to set OCC%d state," " rc=0x%04X", occInstance, l_err->reasonCode()); if (false == needsRetry) { ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); needsRetry = true; } else { // Only one retry, return error handle needsRetry = false; } } } while (needsRetry); } else { /*@ * @errortype * @moduleid HTMGT_MOD_OCCMGR_SET_STATE * @reasoncode HTMGT_RC_INTERNAL_ERROR * @devdesc Unable to set state of master OCC */ bldErrLog(l_err, HTMGT_MOD_OCCMGR_SET_STATE, HTMGT_RC_INTERNAL_ERROR, 0, 0, 0, 0, ERRORLOG::ERRL_SEV_INFORMATIONAL); } if (nullptr == l_err) { // Send poll to query state of all OCCs // and flush any errors reported by the OCCs l_err = sendOccPoll(true); if (l_err) { TMGT_ERR("_setOccState: Poll all OCCs failed"); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } // Make sure all OCCs went to active state for( const auto & occ : iv_occArray ) { if (requestedState == occ->getState()) { // Update GPU present status occ->updateGpuPresence(); } else { TMGT_ERR("_setOccState: OCC%d is not in 0x%02X " "state", occ->getInstance(), requestedState); /*@ * @errortype * @moduleid HTMGT_MOD_OCCMGR_SET_STATE * @reasoncode HTMGT_RC_OCC_UNEXPECTED_STATE * @userdata1[0-31] requested state * @userdata1[32-63] OCC state * @userdata2 OCC instance * @devdesc OCC did not change to requested state */ bldErrLog(l_err, HTMGT_MOD_OCCMGR_SET_STATE, HTMGT_RC_OCC_UNEXPECTED_STATE, requestedState, occ->getState(), 0, occ->getInstance(), ERRORLOG::ERRL_SEV_INFORMATIONAL); break; } } if (nullptr == l_err) { // Clear safe mode reason since OCC is at target state if (cv_safeReturnCode != 0) { TMGT_INF("_setOccState: clearing safe mode reason " "(0x%04X)", cv_safeReturnCode); cv_safeReturnCode = 0; cv_safeOccInstance = 0; } TMGT_INF("_setOccState: All OCCs have reached state " "0x%02X", requestedState); iv_state = requestedState; if (OCC_STATE_ACTIVE == requestedState) { TMGT_CONSOLE("OCCs are now running in ACTIVE " "state"); } else if (OCC_STATE_OBSERVATION == requestedState) { TMGT_CONSOLE("OCCs are now running in OBSERVATION " "state"); } else if (OCC_STATE_CHARACTERIZATION == requestedState) { TMGT_CONSOLE("OCCs are now running in " "CHARACTERIZATION state"); } } } } } else { TMGT_ERR("_setOccState: Invalid state 0x%02X requested", requestedState); /*@ * @errortype * @moduleid HTMGT_MOD_OCCMGR_SET_STATE * @reasoncode HTMGT_RC_INVALID_DATA * @userdata1 requested state * @devdesc Invalid OCC state requested */ bldErrLog(l_err, HTMGT_MOD_OCCMGR_SET_STATE, HTMGT_RC_INVALID_DATA, 0, requestedState, 0, 0, ERRORLOG::ERRL_SEV_INFORMATIONAL); } return l_err; } // end OccManager::_setOccState() errlHndl_t OccManager::_resetOccs(TARGETING::Target * i_failedOccTarget, bool i_skipCountIncrement, bool i_skipComm, enum occResetReason i_reason) { errlHndl_t err = nullptr; bool atThreshold = false; err = _buildOccs(false, i_skipComm); // if not a already built. if (nullptr == err) { if (false == int_flags_set(FLAG_RESET_DISABLED)) { err = setOccActiveSensors(false); // Set OCC sensor to inactive if( err ) { TMGT_ERR("_resetOccs: Set OCC sensors to inactive failed."); // log and continue ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } if (false == i_skipComm) { // Send poll cmd to all OCCs err = _sendOccPoll(false, // don't flush errors nullptr, // send to all OCCs true); // only poll if communications // has been established if (err) { TMGT_ERR("_resetOccs: Poll OCCs failed."); // Proceed with reset even if failed ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } for(const auto & occ : iv_occArray ) { if (i_reason != OCC_RESET_REASON_NONE) { occ->iv_resetReason = i_reason; } if(occ->getTarget() == i_failedOccTarget) { occ->failed(true); } if (false == i_skipComm) { // Send reset prep cmd to all OCCs if(occ->resetPrep()) { atThreshold = true; } } // If we need a WOF reset, skip system count increment if( occ->needsWofReset() ) { i_skipCountIncrement = true; } } if ((false == i_skipCountIncrement) && (false == _occFailed())) { // No OCC has been marked failed, increment sys reset count ++iv_sysResetCount; TMGT_INF("_resetOCCs: Incrementing system OCC reset count" " to %d", iv_sysResetCount); if(iv_sysResetCount > OCC_RESET_COUNT_THRESHOLD) { atThreshold = true; } } // else failed OCC reset count will be incremented automatically // Update OCC states to RESET for( const auto & occ : iv_occArray ) { occ->iv_state = OCC_STATE_RESET; } uint64_t retryCount = OCC_RESET_COUNT_THRESHOLD; TARGETING::Target* sys = nullptr; TARGETING::targetService().getTopLevelTarget(sys); while(retryCount) { if (sys) { // Increment cumulative reset count since boot uint8_t count = sys->getAttr(); if (count < 0xFF) { ++count; sys->setAttr(count); } } // Reset all OCCs TMGT_INF("_resetOccs: Calling HBPM::resetPMAll"); err = HBPM::resetPMAll(); if(!err) { break; } --retryCount; if (int_flags_set(FLAG_HALT_ON_RESET_FAIL)) { TMGT_ERR("_resetOCCs: resetPMAll failed with 0x%04X " "and HALT_ON_RESET_FAIL is set. Resets will " "be disabled", err->reasonCode()); set_int_flags(get_int_flags() | FLAG_RESET_DISABLED); break; } if(retryCount) { // log if not last retry ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } else { TMGT_ERR("_resetOCCs: HBPM::resetPMAll failed. " "Leaving OCCs in reset state"); // pass err handle back err->collectTrace("HTMGT"); } } if(!atThreshold && !err) { for( const auto & occ : iv_occArray ) { // After OCC has been reset, clear internal flags occ->postResetClear(); } //get parent proc chip. TARGETING::Target* l_proc_target = NULL; //Reload OCC on this processor chip. TMGT_INF("_resetOccs: Calling loadAndStartPMAll"); err = HBPM::loadAndStartPMAll(HBPM::PM_RELOAD, l_proc_target); if(err) { TMGT_ERR("_resetOCCs: loadAndStartPMAll failed. "); err->collectTrace("HTMGT"); processOccStartStatus(false, l_proc_target); } else { processOccStartStatus(true, l_proc_target); } } else if (!err) // Reset Threshold reached and no other err { // Create threshold error TMGT_ERR("_resetOCCs: Retry Threshold reached. " "Leaving OCCs in reset state"); /*@ * @errortype * @moduleid HTMGT_MOD_OCC_RESET * @reasoncode HTMGT_RC_OCC_CRIT_FAILURE * @userdata1 return code triggering safe mode * @userdata2 OCC instance * @devdesc OCC reset threshold reached. * Leaving OCCs in reset state */ bldErrLog(err, HTMGT_MOD_OCC_RESET, HTMGT_RC_OCC_CRIT_FAILURE, 0, cv_safeReturnCode, 0, cv_safeOccInstance, ERRORLOG::ERRL_SEV_UNRECOVERABLE); TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)", cv_safeReturnCode, cv_safeOccInstance); // Check if OCC already logged reason for safe mode // (add proc callout if non-OCC safe mode reason or // the OCC hit an exception) if (((cv_safeReturnCode & OCCC_COMP_ID) != OCCC_COMP_ID) || ((cv_safeReturnCode & 0xE0) == 0xE0)) { // Add processor callout TARGETING::ConstTargetHandle_t procTarget = nullptr; Occ *occPtr = _getOcc(cv_safeOccInstance); if (occPtr != nullptr) { procTarget = TARGETING::getParentChip(occPtr->getTarget()); } else { TMGT_ERR("_resetOCCs: Unable to determine target, " "using first proc"); // Get all functional processors TARGETING::TargetHandleList pProcs; TARGETING::getChipResources(pProcs, TARGETING::TYPE_PROC, TARGETING::UTIL_FILTER_FUNCTIONAL); procTarget = pProcs[0]; } if (nullptr != procTarget) { const unsigned long huid = procTarget->getAttr(); TMGT_ERR("_resetOCCs: Adding processor callout " "(HUID=0x%0lX)", huid); err->addHwCallout(procTarget, HWAS::SRCI_PRIORITY_MED, HWAS::NO_DECONFIG, HWAS::GARD_NULL); } } else { TMGT_INF("_resetOccs: OCC should have already logged " "an error for safe mode"); } } // Any error at this point means OCCs were not reactivated if(err) { updateForSafeMode(err); } } else { TMGT_INF("_resetOccs: Skipping OCC reset due to " "internal flags 0x%08X", get_int_flags()); } } return err; } // end OccManager::_resetOccs() void OccManager::updateForSafeMode(errlHndl_t & io_err) { io_err->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE); // Add level 2 support callout io_err->addProcedureCallout(HWAS::EPUB_PRC_LVL_SUPP, HWAS::SRCI_PRIORITY_MED); // Add HB firmware callout io_err->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, HWAS::SRCI_PRIORITY_MED); TARGETING::Target* sys = nullptr; TARGETING::targetService().getTopLevelTarget(sys); const uint8_t safeMode = 1; // Mark system as being in safe mode if(sys) { sys->setAttr(safeMode); } iv_state = OCC_STATE_SAFE; _updateSafeModeReason(io_err->reasonCode(), 0); TMGT_ERR("updateForSafeMode: Safe Mode (RC: 0x%04X OCC%d)", cv_safeReturnCode, cv_safeOccInstance); TMGT_CONSOLE("OCCs are not active. The system will remain in " "safe mode (RC: 0x%04x for OCC%d)", cv_safeReturnCode, cv_safeOccInstance); } // end OccManager::updateForSafeMode() // Wait for all OCCs to reach communications checkpoint errlHndl_t OccManager::_waitForOccCheckpoint() { errlHndl_t checkpointElog = nullptr; #ifdef CONFIG_HTMGT // Wait up to 15 seconds for all OCCs to be ready (150 * 100ms = 15s) const size_t NS_BETWEEN_READ = 100 * NS_PER_MSEC; const size_t READ_RETRY_LIMIT = 150; if (iv_occArray.size() > 0) { uint8_t retryCount = 0; bool throttleErrors = false; for( const auto & occ : iv_occArray ) { bool occReady = false; uint16_t lastCheckpoint = 0x0000; while ((!occReady) && (retryCount++ < READ_RETRY_LIMIT)) { nanosleep(0, NS_BETWEEN_READ); TARGETING::ConstTargetHandle_t procTarget = TARGETING::getParentChip(occ->getTarget() ); // Read SRAM response buffer to check for OCC checkpoint errlHndl_t l_err = nullptr; const uint16_t l_length = 8; //Note: number of bytes uint8_t l_sram_data[l_length] = { 0x0 }; l_err = HBOCC::readSRAM(procTarget, OCC_RSP_SRAM_ADDR, (uint64_t*)(&(l_sram_data)), l_length); if (nullptr == l_err) { // Pull status from response (byte 2) uint8_t status = l_sram_data[2]; // Pull checkpoint from response (byte 6-7) uint16_t checkpoint= l_sram_data[6]<<8 | l_sram_data[7]; if (checkpoint != lastCheckpoint) { TMGT_INF("_waitForOccCheckpoint: OCC%d Checkpoint " "0x%04X", occ->getInstance(), checkpoint); lastCheckpoint = checkpoint; } if ( ( OCC_RC_OCC_INIT_CHECKPOINT == status ) && ( OCC_COMM_INIT_COMPLETE == checkpoint) ) { TMGT_INF("_waitForOccCheckpoint OCC%d ready!", occ->getInstance()); occReady = true; break; } if( ((checkpoint & OCC_INIT_FAILURE ) == OCC_INIT_FAILURE ) || ( status == OCC_RC_INIT_FAILURE ) ) { TMGT_ERR("_waitForOccCheckpoint: OCC%d failed " "during initialization (0x%02X, 0x%04X)", occ->getInstance(), status, checkpoint ); occReady = false; break; } } else { if (false == throttleErrors) { throttleErrors = true; TMGT_ERR("_waitForOccCheckpoint: error trying to " "read OCC%d SRAM (rc=0x%04X)", occ->getInstance(), l_err->reasonCode()); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } else { delete l_err; l_err = nullptr; } } } if (!occReady) { TMGT_CONSOLE("Final OCC%d Checkpoint NOT reached (0x%04X)", occ->getInstance(), lastCheckpoint); TMGT_ERR("_waitForOccCheckpoint OCC%d still NOT ready! " "(last checkpoint=0x%04X)", occ->getInstance(), lastCheckpoint); errlHndl_t l_err = nullptr; /*@ * @errortype * @moduleid HTMGT_MOD_WAIT_FOR_CHECKPOINT * @reasoncode HTMGT_RC_OCC_NOT_READY * @userdata1 OCC instance * @userdata2 last OCC checkpoint * @devdesc Set of OCC state failed */ bldErrLog(l_err, HTMGT_MOD_WAIT_FOR_CHECKPOINT, HTMGT_RC_OCC_NOT_READY, 0, occ->getInstance(), 0, lastCheckpoint, ERRORLOG::ERRL_SEV_PREDICTIVE); occ->collectCheckpointScomData( l_err ); occ->addOccTrace( l_err ); if (nullptr == checkpointElog) { // return the first elog checkpointElog = l_err; l_err = nullptr; } else { ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } TMGT_ERR("waitForOccCheckpoint OCC%d still NOT ready!", occ->getInstance()); break; } } } #endif return checkpointElog; } void OccManager::_updateSafeModeReason(uint32_t i_src, uint32_t i_instance) { if ((cv_safeReturnCode == 0) || ((i_src == 0) && (i_instance == 0))) { // Only update safe mode reason for the first failure, // or if trying to clear safe mode cv_safeReturnCode = i_src; cv_safeOccInstance = i_instance; } } uint32_t OccManager::_getSafeModeReason(uint32_t & o_instance) { o_instance = cv_safeOccInstance; return cv_safeReturnCode; } bool OccManager::_occNeedsReset() { bool needsReset = false; for( const auto & occ : iv_occArray ) { if (occ->needsReset() || occ->needsWofReset()) { needsReset = true; break; } } return needsReset; } // Return true if any OCC has been marked as failed bool OccManager::_occFailed() { bool failed = false; for( const auto & occ : iv_occArray ) { if (occ->iv_failed) { failed = true; break; } } return failed; } // Collect HTMGT Status Information for debug // NOTE: o_data is pointer to OCC_MAX_DATA_LENGTH byte buffer void OccManager::_getHtmgtData(uint16_t & o_length, uint8_t *o_data) { uint16_t index = 0; TARGETING::Target* sys = nullptr; TARGETING::targetService().getTopLevelTarget(sys); uint8_t safeMode = 0; uint8_t resets_since_boot = 0; if (sys) { sys->tryGetAttr(safeMode); sys->tryGetAttr (resets_since_boot); } // First add HTMGT specific data o_data[index++] = _getNumOccs(); o_data[index++] = (nullptr!=iv_occMaster)?iv_occMaster->getInstance():0xFF; o_data[index++] = iv_state; o_data[index++] = iv_targetState; o_data[index++] = iv_sysResetCount; o_data[index++] = resets_since_boot; o_data[index++] = 0x00; // STATUS VERSION (for future expansion) o_data[index++] = safeMode; UINT32_PUT(&o_data[index], cv_safeReturnCode); index += 4; UINT32_PUT(&o_data[index], cv_safeOccInstance); index += 4; // Now add OCC specific data (for each OCC) for( const auto & occ : iv_occArray ) { o_data[index++] = occ->getInstance(); o_data[index++] = occ->getState(); o_data[index++] = occ->getRole(); o_data[index++] = occ->iv_masterCapable; o_data[index++] = occ->iv_commEstablished; o_data[index++] = 0; // reserved for expansion o_data[index++] = 0; // reserved for expansion o_data[index++] = 0; // reserved for expansion o_data[index++] = occ->iv_failed; o_data[index++] = occ->needsReset(); o_data[index++] = occ->iv_resetReason; o_data[index++] = (occ->iv_wofResetCount<<4)|occ->iv_resetCount; if (occ->iv_lastPollValid) { memcpy(&o_data[index], occ->iv_lastPollResponse, 4); } else { memset(&o_data[index], 0xFF, 4); } index += 4; } o_length = index; } // Get WOF Reset reasons for all OCCs // NOTE: Data returned is of the form [ 1-byte ID | 4-byte bit vector] // per OCC instance found void OccManager::_getWOFResetReasons(uint16_t & o_length, uint8_t * o_data) { uint16_t index = 0; // Make sure OCCs were built first (so data is valid) errlHndl_t l_err = _buildOccs(); // if not already built. if( l_err ) { TMGT_ERR("_getWOFResetReasons: Failed to build OCC structures " "rc=0x%04X", l_err->reasonCode()); ERRORLOG::errlCommit(l_err, HTMGT_COMP_ID); } // Iterate through OCC objects to get bit vectors. for( const auto & occ : iv_occArray ) { o_data[index++] = occ->getInstance(); UINT32_PUT(&o_data[index], occ->getWofResetReasons()); index += 4; } o_length = index; } // Consolidate all OCC states void OccManager::_syncOccStates() { occStateId currentState = OCC_STATE_NO_CHANGE; for(const auto & occ : iv_occArray ) { if (OCC_STATE_NO_CHANGE == currentState) { currentState = occ->getState(); } else { if (currentState != occ->getState()) { // States do not match yet... currentState = OCC_STATE_NO_CHANGE; break; } } } if (OCC_STATE_NO_CHANGE != currentState) { if (iv_state != currentState) { TMGT_INF("_syncOccStates: All OCCs are in 0x%02X", currentState); iv_state = currentState; } } } // Clear all OCC reset counts // Should not be called if the system is in safe mode. void OccManager::_clearResetCounts() { for( const auto & occ : iv_occArray ) { if (occ->iv_resetCount != 0) { TMGT_INF("_clearResetCounts: Clearing OCC%d reset count " "(was %d)", occ->getInstance(), occ->iv_resetCount); occ->iv_resetCount = 0; } if (occ->iv_wofResetCount != 0) { TMGT_INF("_clearResetCounts: Clearing OCC%d WOF reset count " "(was %d) reason(s): 0x%08X", occ->getInstance(), occ->iv_wofResetCount, occ->iv_wofResetReasons); occ->iv_wofResetCount = 0; } } if (iv_sysResetCount != 0) { TMGT_INF("_clearResetCounts: Clearing system reset count " "(was %d)", iv_sysResetCount); iv_sysResetCount = 0; } } uint8_t OccManager::getNumOccs() { return Singleton::instance()._getNumOccs(); } std::vector OccManager::getOccArray() { return Singleton::instance()._getOccArray(); } errlHndl_t OccManager::buildOccs(const bool i_occStart, bool i_skipComm) { return Singleton::instance()._buildOccs(i_occStart, i_skipComm); } Occ * OccManager::getMasterOcc() { return Singleton::instance()._getMasterOcc(); } Occ * OccManager::getOcc(const uint8_t i_instance) { return Singleton::instance()._getOcc(i_instance); } errlHndl_t OccManager::setOccState(const occStateId i_state) { return Singleton::instance()._setOccState(i_state); } errlHndl_t OccManager::resetOccs(TARGETING::Target * i_failedOccTarget, bool i_skipCountIncrement, bool i_skipComm, enum occResetReason i_reason) { return Singleton::instance()._resetOccs(i_failedOccTarget, i_skipCountIncrement, i_skipComm, i_reason); } occStateId OccManager::getTargetState() { return Singleton::instance()._getTargetState(); } errlHndl_t OccManager::waitForOccCheckpoint() { return Singleton::instance()._waitForOccCheckpoint(); } void OccManager::updateSafeModeReason(uint32_t i_src, uint32_t i_instance) { Singleton::instance(). _updateSafeModeReason(i_src, i_instance); } uint32_t OccManager::getSafeModeReason(uint32_t & o_instance) { return Singleton::instance(). _getSafeModeReason(o_instance); } bool OccManager::occNeedsReset() { return Singleton::instance()._occNeedsReset(); } bool OccManager::occFailed() { return Singleton::instance()._occFailed(); } void OccManager::getHtmgtData(uint16_t & o_length, uint8_t *o_data) { Singleton::instance()._getHtmgtData(o_length, o_data); } void OccManager::getWOFResetReasons(uint16_t & o_length, uint8_t * o_data) { Singleton::instance()._getWOFResetReasons(o_length, o_data); } void OccManager::clearResetCounts() { Singleton::instance()._clearResetCounts(); } void OccManager::syncOccStates() { Singleton::instance()._syncOccStates(); } } // end namespace