summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/usr/htmgt/htmgt.C6
-rw-r--r--src/usr/htmgt/htmgt_cfgdata.C50
-rw-r--r--src/usr/htmgt/htmgt_cfgdata.H12
-rw-r--r--src/usr/htmgt/htmgt_occ.C53
-rw-r--r--src/usr/htmgt/htmgt_occ.H36
-rw-r--r--src/usr/htmgt/htmgt_poll.C22
-rw-r--r--src/usr/htmgt/occError.C67
-rw-r--r--src/usr/htmgt/occError.H1
-rw-r--r--src/usr/htmgt/test/htmgtcfgtest.H3
9 files changed, 185 insertions, 65 deletions
diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C
index d7675bb56..cf2190d07 100644
--- a/src/usr/htmgt/htmgt.C
+++ b/src/usr/htmgt/htmgt.C
@@ -107,8 +107,8 @@ namespace HTMGT
if (OccManager::occNeedsReset())
{
// No need to continue if reset is required
- TMGT_ERR("sendOccConfigData(): OCCs need "
- "to be reset");
+ TMGT_ERR("processOccStartStatus(): "
+ "OCCs need to be reset");
break;
}
else
@@ -305,6 +305,8 @@ namespace HTMGT
errlHndl_t err = OccManager::resetOccs(nullptr);
if(err)
{
+ TMGT_ERR("processOccError(): Error when attempting"
+ " to reset OCCs");
ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
}
}
diff --git a/src/usr/htmgt/htmgt_cfgdata.C b/src/usr/htmgt/htmgt_cfgdata.C
index de636d153..85b8029f3 100644
--- a/src/usr/htmgt/htmgt_cfgdata.C
+++ b/src/usr/htmgt/htmgt_cfgdata.C
@@ -130,7 +130,8 @@ namespace HTMGT
{
case OCC_CFGDATA_FREQ_POINT:
getFrequencyPointMessageData(cmdData,
- cmdDataLen);
+ cmdDataLen,
+ occ->wofResetCount());
break;
case OCC_CFGDATA_OCC_ROLE:
@@ -1153,7 +1154,8 @@ void getGPUConfigMessageData(const TargetHandle_t i_occ,
void getFrequencyPointMessageData(uint8_t* o_data,
- uint64_t & o_size)
+ uint64_t & o_size,
+ uint8_t i_wofResetCount )
{
uint64_t index = 0;
uint16_t min = 0;
@@ -1177,6 +1179,7 @@ void getFrequencyPointMessageData(uint8_t* o_data,
uint8_t turboAllowed =
sys->getAttr<ATTR_OPEN_POWER_TURBO_MODE_SUPPORTED>();
+
if (turboAllowed)
{
turbo = sys->getAttr<ATTR_FREQ_CORE_MAX>();
@@ -1185,31 +1188,61 @@ void getFrequencyPointMessageData(uint8_t* o_data,
ATTR_SYSTEM_WOF_DISABLE_type wofSupported;
if (!sys->tryGetAttr<ATTR_SYSTEM_WOF_DISABLE>(wofSupported))
{
+ ultra = WOF_SYSTEM_DISABLED;
G_wofSupported = false;
}
else
{
- if( wofSupported != SYSTEM_WOF_DISABLE_ON )
+ uint16_t tempUt = sys->getAttr<ATTR_ULTRA_TURBO_FREQ_MHZ>();
+ if( wofSupported == SYSTEM_WOF_DISABLE_ON )
{
- ultra = sys->getAttr<ATTR_ULTRA_TURBO_FREQ_MHZ>();
+ TMGT_INF("System does not support WOF");
+ G_wofSupported = false;
+ ultra = WOF_SYSTEM_DISABLED;
}
- else
+ else if( tempUt == 0 )
+ {
+ TMGT_INF("Missing Ultra Turbo VPD point. WOF disabled.");
+ G_wofSupported = false;
+ ultra = WOF_MISSING_ULTRA_TURBO;
+ }
+ else if( i_wofResetCount >= WOF_RESET_COUNT_THRESHOLD )
+ {
+ TMGT_INF("WOF reset count reached. WOF disabled.");
+ G_wofSupported = false;
+ ultra = WOF_RESET_COUNT_REACHED;
+ }
+ else if( turbo <= nominal )
{
+ TMGT_INF("Turbo is less than nominal. WOF disabled.");
G_wofSupported = false;
+ ultra = WOF_UNSUPPORTED_FREQ;
+ }
+ else if( tempUt <= turbo )
+ {
+ TMGT_INF("Ultra Turbo is less than Turbo. WOF disabled.");
+ G_wofSupported = false;
+ ultra = WOF_UNSUPPORTED_FREQ;
+ }
+ else
+ {
+ ultra = tempUt;
}
+
}
if( !G_wofSupported )
{
- TMGT_INF("getFrequencyPoint: WOF not enabled");
+ TMGT_INF("getFrequencyPoint: WOF not enabled! RC = %x", ultra);
}
}
else
{
// If turbo not supported, send nominal for turbo
- // and 0 for ultra-turbo (no WOF support)
+ // and reason code for ultra-turbo (no WOF support)
TMGT_INF("getFrequencyPoint: Turbo/WOF not supported");
turbo = nominal;
+ ultra = WOF_UNSUPPORTED_FREQ;
G_wofSupported = false;
}
@@ -1342,5 +1375,4 @@ void getApssMessageData(uint8_t* o_data,
}
-
-}
+}// namespace HTMGT
diff --git a/src/usr/htmgt/htmgt_cfgdata.H b/src/usr/htmgt/htmgt_cfgdata.H
index 9d632889d..82eb05a35 100644
--- a/src/usr/htmgt/htmgt_cfgdata.H
+++ b/src/usr/htmgt/htmgt_cfgdata.H
@@ -80,6 +80,13 @@ namespace HTMGT
CFDATA_DVFS_NOT_DEFINED = 0xFF,
};
+ enum // WOF disabled reasons
+ {
+ WOF_MISSING_ULTRA_TURBO = 0x0000,
+ WOF_SYSTEM_DISABLED = 0x0001,
+ WOF_RESET_COUNT_REACHED = 0x0002,
+ WOF_UNSUPPORTED_FREQ = 0x0003,
+ };
enum cfgTargets
{
@@ -249,10 +256,13 @@ namespace HTMGT
*
* @param[out] o_data - preallocated buffer to fill in
* @param[out] o_size - set to the message size
+ * @param[in] i_wofResetCount - Number of times OCC requested a reset
+ * due to WOF
* @pre o_data is large enough.
*/
void getFrequencyPointMessageData(uint8_t* o_data,
- uint64_t & o_size);
+ uint64_t & o_size,
+ uint8_t i_wofResetCount );
/**
* Generate the APSS configuration message
diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C
index 4559b65a0..760799e50 100644
--- a/src/usr/htmgt/htmgt_occ.C
+++ b/src/usr/htmgt/htmgt_occ.C
@@ -57,6 +57,8 @@ namespace HTMGT
iv_state(OCC_STATE_UNKNOWN),
iv_commEstablished(false),
iv_needsReset(false),
+ iv_needsWofReset(false),
+ iv_wofResetCount(0),
iv_failed(false),
iv_seqNumber(0),
iv_homer(i_homer),
@@ -202,9 +204,15 @@ namespace HTMGT
atThreshold = true;
}
}
+ else if( iv_needsWofReset ) //If WOF reset, increment count
+ {
+ iv_wofResetCount++;
+ TMGT_INF("resetPrep(): WOF reset requested. Reset Count = %d",
+ iv_wofResetCount );
+ }
else
{
- cmdData[1] = OCC_RESET_FAIL_OTHER_OCC;
+ cmdData[1] = OCC_RESET_FAIL_THIS_OCC;
}
if (iv_commEstablished)
@@ -239,6 +247,7 @@ namespace HTMGT
iv_state = OCC_STATE_UNKNOWN;
iv_commEstablished = false;
iv_needsReset = false;
+ iv_needsWofReset = false;
iv_failed = false;
iv_lastPollValid = false;
iv_resetReason = OCC_RESET_REASON_NONE;
@@ -401,7 +410,7 @@ namespace HTMGT
:iv_occMaster(nullptr),
iv_state(OCC_STATE_UNKNOWN),
iv_targetState(OCC_STATE_ACTIVE),
- iv_resetCount(0),
+ iv_sysResetCount(0),
iv_normalPstateTables(true)
{
}
@@ -903,17 +912,23 @@ namespace HTMGT
atThreshold = true;
}
}
+ // If we need a WOF reset, skip system count increment
+ if( occ->needsWofReset() )
+ {
+ i_skipCountIncrement = true;
+ }
+
}
if ((false == i_skipCountIncrement) && (false == _occFailed()))
{
// No OCC has been marked failed, increment sys reset count
- ++iv_resetCount;
+ ++iv_sysResetCount;
TMGT_INF("_resetOCCs: Incrementing system OCC reset count"
- " to %d", iv_resetCount);
+ " to %d", iv_sysResetCount);
- if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD)
+ if(iv_sysResetCount > OCC_RESET_COUNT_THRESHOLD)
{
atThreshold = true;
}
@@ -1225,7 +1240,7 @@ namespace HTMGT
for( const auto & occ : iv_occArray )
{
- if (occ->needsReset())
+ if (occ->needsReset() || occ->needsWofReset())
{
needsReset = true;
break;
@@ -1235,7 +1250,6 @@ namespace HTMGT
return needsReset;
}
-
// Return true if any OCC has been marked as failed
bool OccManager::_occFailed()
{
@@ -1290,7 +1304,7 @@ namespace HTMGT
o_data[index++] = (nullptr!=iv_occMaster)?iv_occMaster->getInstance():0xFF;
o_data[index++] = iv_state;
o_data[index++] = iv_targetState;
- o_data[index++] = iv_resetCount;
+ o_data[index++] = iv_sysResetCount;
o_data[index++] = iv_normalPstateTables ? 0 : 1;
index += 1; // reserved for expansion
o_data[index++] = safeMode;
@@ -1393,7 +1407,8 @@ namespace HTMGT
{
TMGT_INF("_clearResetCounts: Clearing OCC%d reset count "
"(was %d)",
- occ->getInstance(), occ->iv_resetCount);
+ occ->getInstance(),
+ occ->iv_resetCount);
occ->iv_resetCount = 0;
if (safeMode)
{
@@ -1401,13 +1416,27 @@ namespace HTMGT
occ->postResetClear();
}
}
+
+ if(occ->iv_wofResetCount != 0)
+ {
+ TMGT_INF("_clearResetCounts: Clearing OCC%d WOF reset count "
+ "( was %d)",
+ occ->getInstance(),
+ occ->iv_wofResetCount);
+ occ->iv_wofResetCount = 0;
+ if(safeMode)
+ {
+ // Clear OCC flags
+ occ->postResetClear();
+ }
+ }
}
- if (iv_resetCount != 0)
+ if (iv_sysResetCount != 0)
{
TMGT_INF("_clearResetCounts: Clearing system reset count "
- "(was %d)", iv_resetCount);
- iv_resetCount = 0;
+ "(was %d)", iv_sysResetCount);
+ iv_sysResetCount = 0;
}
}
diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H
index 80df52ab7..fb439672d 100644
--- a/src/usr/htmgt/htmgt_occ.H
+++ b/src/usr/htmgt/htmgt_occ.H
@@ -72,6 +72,7 @@ namespace HTMGT
enum
{
OCC_RESET_COUNT_THRESHOLD = 3,
+ WOF_RESET_COUNT_THRESHOLD = 3,
};
enum occResetReason
@@ -83,6 +84,7 @@ namespace HTMGT
OCC_RESET_REASON_POWER_FAULT = 0x04,
OCC_RESET_REASON_DIFF_OCC = 0x05,
OCC_RESET_REASON_OCC_REQUEST = 0x06,
+ OCC_RESET_REASON_WOF_REQUEST = 0x07,
};
// OCC Callout Structure
@@ -215,12 +217,14 @@ namespace HTMGT
*/
occStateId getState() { return iv_state; };
+
/**
* @brief Prepare this OCC for reset
* @return return true if at threshold otherwise false
*/
bool resetPrep();
+
/**
* @brief Set IPMI OCC sensor state
* @param i_activate: true - set active
@@ -256,6 +260,22 @@ namespace HTMGT
/**
+ * @brief Determine if OCC needs to be reset due to WOF
+ *
+ * @return true if this OCC needs to be reset
+ */
+ bool needsWofReset() { return iv_needsWofReset; }
+
+
+ /**
+ * @brief Returns the number of times a WOF reset has occurred
+ *
+ * @return Number of WOF resets for this OCC
+ */
+ uint8_t wofResetCount() { return iv_wofResetCount; }
+
+
+ /**
* @brief Return OCCs present bits
*
* @return bitmask representing this OCC position
@@ -282,6 +302,7 @@ namespace HTMGT
*/
void collectCheckpointScomData(errlHndl_t i_err);
+
/**
* @brief Add OCC trace buffers to given error log (ERR, IMP, INF)
*
@@ -290,8 +311,6 @@ namespace HTMGT
void addOccTrace( errlHndl_t & io_errl );
-
-
private: // functions
/**
@@ -303,6 +322,7 @@ namespace HTMGT
void pollRspHandler(const uint8_t * i_pollResponse,
const uint16_t i_pollResponseSize);
+
/**
* @brief Collect, Commit and Clear error log from the OCC
*
@@ -314,6 +334,7 @@ namespace HTMGT
const uint32_t i_address,
const uint16_t i_length);
+
/**
* @brief Determine what actions are required for elog
*
@@ -325,6 +346,7 @@ namespace HTMGT
bool & o_occReset,
ERRORLOG::errlSeverity_t & o_errlSeverity);
+
/**
* @brief Add specified callout to the error log
*
@@ -339,11 +361,13 @@ namespace HTMGT
const occErrlCallout_t i_callout,
uint8_t & io_callout_num);
+
/**
* @brief Update the GPU presence sensors in the system
*/
void updateGpuPresence();
+
protected:
// Instance number of this OCC: 0 = first physical OCC
uint8_t iv_instance;
@@ -357,6 +381,10 @@ namespace HTMGT
bool iv_commEstablished;
// true if OCC needs to be reset
bool iv_needsReset;
+ // true if OCC needs to be reset due to WOF
+ bool iv_needsWofReset;
+ // WOF reset count
+ uint8_t iv_wofResetCount;
// true if OCC failed
bool iv_failed;
// Sequence number of last/current OCC command
@@ -390,7 +418,6 @@ namespace HTMGT
uint8_t iv_resetCount;
// Version of data stored (0 = not written)
uint8_t iv_version;
-
};
@@ -565,7 +592,6 @@ namespace HTMGT
*/
static bool occNeedsReset();
-
/**
* @brief Collect FFDC debug data for HTMGT and OCCs
*
@@ -645,7 +671,7 @@ namespace HTMGT
occList_t iv_occArray;
occStateId iv_state;
occStateId iv_targetState;
- uint8_t iv_resetCount;
+ uint8_t iv_sysResetCount;
bool iv_normalPstateTables;
diff --git a/src/usr/htmgt/htmgt_poll.C b/src/usr/htmgt/htmgt_poll.C
index 3316f9425..af863b1e5 100644
--- a/src/usr/htmgt/htmgt_poll.C
+++ b/src/usr/htmgt/htmgt_poll.C
@@ -53,7 +53,7 @@ namespace HTMGT
for( const auto & l_occ : iv_occArray )
{
- if(NULL == i_occTarget || l_occ->iv_target == i_occTarget)
+ if(nullptr == i_occTarget || l_occ->iv_target == i_occTarget)
{
if ((l_occ->iv_commEstablished) ||
(onlyIfEstablished == false))
@@ -96,13 +96,13 @@ namespace HTMGT
errlHndl_t Occ::pollForErrors(const bool i_flushAllErrors)
{
- errlHndl_t err = NULL;
- uint8_t * poll_rsp = NULL;
+ errlHndl_t err = nullptr;
+ uint8_t * poll_rsp = nullptr;
// Only send poll if OCC has not logged an exception
if (0 == iv_exceptionLogged)
{
- TMGT_INF("sendOccPoll: Polling OCC%d", iv_instance);
+ TMGT_INF("pollForErrors: Polling OCC%d", iv_instance);
bool continuePolling = false;
size_t elogCount = 10;
@@ -117,10 +117,10 @@ namespace HTMGT
l_cmdData);
err = cmd.sendOccCmd();
- if (err != NULL)
+ if (err != nullptr)
{
// Poll failed
- TMGT_ERR("sendOccPoll: OCC%d poll failed with rc=0x%04X",
+ TMGT_ERR("pollForErrors: OCC%d poll failed with rc=0x%04X",
iv_instance,
err->reasonCode());
@@ -147,7 +147,7 @@ namespace HTMGT
{
// Limit number of elogs retrieved so
// we do not get stuck in loop
- TMGT_INF("sendOccPoll: OCC%d still has"
+ TMGT_INF("pollForErrors: OCC%d still has "
"more errors to report.",
iv_instance);
continuePolling = false;
@@ -162,7 +162,7 @@ namespace HTMGT
}
else
{
- TMGT_ERR("sendOccPoll: OCC%d poll command response "
+ TMGT_ERR("pollForErrors: OCC%d poll command response "
"failed with invalid data length %d",
iv_instance, poll_rsp_size);
/*@
@@ -268,8 +268,8 @@ namespace HTMGT
if (iv_state != pollRsp->state)
{
iv_state = (occStateId)pollRsp->state;
- TMGT_INF("pollRspHandler: updating OCC%d state"
- " to %s",
+ TMGT_INF("pollRspHandler: Need reset. "
+ "updating OCC%d state to %s",
iv_instance, state_string(iv_state));
}
break;
@@ -281,7 +281,7 @@ namespace HTMGT
(OCC_STATE_OBSERVATION == pollRsp->state) ||
(OCC_STATE_CHARACTERIZATION == pollRsp->state))
{
- errlHndl_t l_err = NULL;
+ errlHndl_t l_err = nullptr;
// Check role status
if (((OCC_ROLE_SLAVE == iv_role) &&
diff --git a/src/usr/htmgt/occError.C b/src/usr/htmgt/occError.C
index 70abd1c71..47b0eb395 100644
--- a/src/usr/htmgt/occError.C
+++ b/src/usr/htmgt/occError.C
@@ -89,7 +89,7 @@ namespace HTMGT
const uint32_t i_address,
const uint16_t i_length)
{
- errlHndl_t l_errlHndl = NULL;
+ errlHndl_t l_errlHndl = nullptr;
// Read data from SRAM (length must be multiple of 8 bytes)
const uint16_t l_length = (i_length) & 0xFFF8;
@@ -101,7 +101,7 @@ namespace HTMGT
reinterpret_cast<uint64_t*>(l_buffer.pointer()),
l_length );
#endif
- if (NULL == l_errlHndl)
+ if (nullptr == l_errlHndl)
{
const occErrlEntry_t * l_occElog= reinterpret_cast<occErrlEntry_t*>
@@ -129,6 +129,17 @@ namespace HTMGT
// Process Actions
bool l_occReset = false;
elogProcessActions(l_occElog->actions, l_occReset, severity);
+
+ // Check if we need a WOF requested reset
+ if(iv_needsWofReset == true)
+ {
+ if( iv_wofResetCount < WOF_RESET_COUNT_THRESHOLD )
+ {
+ // Not at WOF reset threshold yet. Set sev to INFO
+ severity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
+ }
+ }
+
if (l_occReset == true)
{
iv_needsReset = true;
@@ -209,7 +220,7 @@ namespace HTMGT
}
// Any bad fru data found ?
- errlHndl_t err2 = NULL;
+ errlHndl_t err2 = nullptr;
if (l_bad_fru_data == true)
{
TMGT_BIN("Callout Data", &l_occElog->callout[0],
@@ -286,7 +297,7 @@ namespace HTMGT
OccCmd l_cmd(this, OCC_CMD_CLEAR_ERROR_LOG,
sizeof(l_cmdData), l_cmdData);
l_errlHndl = l_cmd.sendOccCmd();
- if (l_errlHndl != NULL)
+ if (l_errlHndl != nullptr)
{
TMGT_ERR("occProcessElog: Failed to clear elog id %d to"
" OCC%d (rc=0x%04X)",
@@ -321,7 +332,7 @@ namespace HTMGT
const uint32_t sensor = (uint32_t)i_callout.calloutValue;
TARGETING::Target * target =
TARGETING::UTIL::getSensorTarget(sensor);
- if (NULL != target)
+ if (nullptr != target)
{
io_errlHndl->addHwCallout(target, i_priority,
HWAS::NO_DECONFIG,
@@ -392,34 +403,42 @@ namespace HTMGT
bool & o_occReset,
ERRORLOG::errlSeverity_t & o_errlSeverity)
{
- if (i_actions & TMGT_ERRL_ACTIONS_RESET_REQUIRED)
+ if (i_actions & TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED)
{
o_occReset = true;
- iv_failed = true;
- iv_resetReason = OCC_RESET_REASON_OCC_REQUEST;
+ iv_failed = false;
+ iv_needsWofReset = true;
+ iv_resetReason = OCC_RESET_REASON_WOF_REQUEST;
- TMGT_INF("elogProcessActions: OCC%d requested reset",
- iv_instance);
+ TMGT_INF("elogProcessActions: OCC%d requested a WOF reset",
+ iv_instance);
}
-
- if (i_actions & TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED)
+ else
{
- o_occReset = true;
- iv_failed = true;
- iv_resetReason = OCC_RESET_REASON_CRIT_FAILURE;
- iv_resetCount = OCC_RESET_COUNT_THRESHOLD;
+ if (i_actions & TMGT_ERRL_ACTIONS_RESET_REQUIRED)
+ {
+ o_occReset = true;
+ iv_failed = true;
+ iv_resetReason = OCC_RESET_REASON_OCC_REQUEST;
- TMGT_INF("elogProcessActions: OCC%d requested safe mode",
- iv_instance);
- TMGT_CONSOLE("OCC%d requested system enter safe mode",
+ TMGT_INF("elogProcessActions: OCC%d requested reset",
iv_instance);
+ }
+
+ if (i_actions & TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED)
+ {
+ o_occReset = true;
+ iv_failed = true;
+ iv_resetReason = OCC_RESET_REASON_CRIT_FAILURE;
+ iv_resetCount = OCC_RESET_COUNT_THRESHOLD;
+
+ TMGT_INF("elogProcessActions: OCC%d requested safe mode",
+ iv_instance);
+ TMGT_CONSOLE("OCC%d requested system enter safe mode",
+ iv_instance);
+ }
}
} // end Occ::elogProcessActions()
-
-
} // end namespace
-
-
-
diff --git a/src/usr/htmgt/occError.H b/src/usr/htmgt/occError.H
index 0f0a284be..1f3e5ad7d 100644
--- a/src/usr/htmgt/occError.H
+++ b/src/usr/htmgt/occError.H
@@ -41,6 +41,7 @@ namespace HTMGT
// Error Actions
enum tmgtErrlActionsType
{
+ TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED = 0x20,
TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED = 0x40,
TMGT_ERRL_ACTIONS_RESET_REQUIRED = 0x80,
};
diff --git a/src/usr/htmgt/test/htmgtcfgtest.H b/src/usr/htmgt/test/htmgtcfgtest.H
index ab71ec034..9bdffd1db 100644
--- a/src/usr/htmgt/test/htmgtcfgtest.H
+++ b/src/usr/htmgt/test/htmgtcfgtest.H
@@ -432,11 +432,12 @@ public:
{
uint8_t data[4*KILOBYTE];
uint64_t size = 0;
+ uint8_t wofResetCount = 0;
TS_TRACE(ENTER_MRK"HTMGT: testThermalControlConfigData");
memset(data, 0, 4*KILOBYTE);
- getFrequencyPointMessageData(data, size);
+ getFrequencyPointMessageData(data, size, wofResetCount);
if (data[0] != OCC_CFGDATA_FREQ_POINT)
{
OpenPOWER on IntegriCloud