summaryrefslogtreecommitdiffstats
path: root/src/usr/htmgt
diff options
context:
space:
mode:
author: Chris Cain <cjcain@us.ibm.com> 2017-08-24 11:24:12 -0500
committer: Daniel M. Crowell <dcrowell@us.ibm.com> 2017-09-07 14:25:17 -0400
commit: 87ff275e941937c256b5a00ddec76638b652f857 (patch)
tree: db2cec15f0ca5ccdaa11eadfebab4da5c7ab8c5b /src/usr/htmgt
parent: a644d89cb2b6208789e72ac4bf7423ec0fea5d3b (diff)
download: talos-hostboot-87ff275e941937c256b5a00ddec76638b652f857.tar.gz
download: talos-hostboot-87ff275e941937c256b5a00ddec76638b652f857.zip
HTMGT: Config data changes for GPU support
Change-Id: I2b4a5a82791ee6c4531d102dad51389f9dedbe6c
RTC: 133828
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45480
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Sheldon R. Bailey <baileysh@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/htmgt')
-rw-r--r-- src/usr/htmgt/htmgt_cfgdata.C 163
-rw-r--r-- src/usr/htmgt/htmgt_cfgdata.H 25
-rw-r--r-- src/usr/htmgt/htmgt_occ.C 34
-rw-r--r-- src/usr/htmgt/htmgt_occ.H 7
-rw-r--r-- src/usr/htmgt/htmgt_poll.C 8
-rw-r--r-- src/usr/htmgt/htmgt_poll.H 11
-rw-r--r-- src/usr/htmgt/occError.C 7
-rw-r--r-- src/usr/htmgt/occError.H 3
8 files changed, 249 insertions, 9 deletions
diff --git a/src/usr/htmgt/htmgt_cfgdata.C b/src/usr/htmgt/htmgt_cfgdata.C
index eb4136d19..80d5d7cab 100644
--- a/src/usr/htmgt/htmgt_cfgdata.C
+++ b/src/usr/htmgt/htmgt_cfgdata.C
@@ -178,6 +178,12 @@ namespace HTMGT
cmdDataLen );
break;
+ case OCC_CFGDATA_GPU_CONFIG:
+ getGPUConfigMessageData(occ->getTarget(),
+ cmdData,
+ cmdDataLen);
+ break;
+
default:
TMGT_ERR("sendOccConfigData: Unsupported"
" format type 0x%02X",
@@ -932,7 +938,8 @@ void getThermalControlMessageData(uint8_t* o_data,
l_numSets++;
// VRM
- l_timeout = l_sys->getAttr<ATTR_OPEN_POWER_VRM_READ_TIMEOUT_SEC>();
+ if (!l_sys->tryGetAttr<ATTR_OPEN_POWER_VRM_READ_TIMEOUT_SEC>(l_timeout))
+ l_timeout = 0;
if (l_timeout != 0)
{
o_data[index++] = CFGDATA_FRU_TYPE_VRM;
@@ -944,6 +951,51 @@ void getThermalControlMessageData(uint8_t* o_data,
l_numSets++;
}
+ // GPU Cores
+ if (!l_sys->tryGetAttr<ATTR_OPEN_POWER_GPU_READ_TIMEOUT_SEC>(l_timeout))
+ l_timeout = 0xFF;
+ if (l_timeout == 0)
+ {
+ l_timeout = 0xFF;
+ }
+ if (!l_sys->
+ tryGetAttr<ATTR_OPEN_POWER_GPU_ERROR_TEMP_DEG_C>(l_ERR_temp))
+ l_ERR_temp = OCC_NOT_DEFINED;
+ if (l_ERR_temp == 0)
+ {
+ l_ERR_temp = OCC_NOT_DEFINED;
+ }
+ o_data[index++] = CFGDATA_FRU_TYPE_GPU_CORE;
+ o_data[index++] = OCC_NOT_DEFINED; //DVFS
+ o_data[index++] = l_ERR_temp; //ERROR
+ o_data[index++] = OCC_NOT_DEFINED; //PM_DVFS
+ o_data[index++] = OCC_NOT_DEFINED; //PM_ERROR
+ o_data[index++] = l_timeout;
+ l_numSets++;
+
+ // GPU Memory
+ if (!l_sys->
+ tryGetAttr<ATTR_OPEN_POWER_GPU_MEM_READ_TIMEOUT_SEC>(l_timeout))
+ l_timeout = 0xFF;
+ if (l_timeout == 0)
+ {
+ l_timeout = 0xFF;
+ }
+ if (!l_sys->
+ tryGetAttr<ATTR_OPEN_POWER_GPU_MEM_ERROR_TEMP_DEG_C>(l_ERR_temp))
+ l_ERR_temp = OCC_NOT_DEFINED;
+ if (l_ERR_temp == 0)
+ {
+ l_ERR_temp = OCC_NOT_DEFINED;
+ }
+ o_data[index++] = CFGDATA_FRU_TYPE_GPU_MEMORY;
+ o_data[index++] = OCC_NOT_DEFINED; //DVFS
+ o_data[index++] = l_ERR_temp; //ERROR
+ o_data[index++] = OCC_NOT_DEFINED; //PM_DVFS
+ o_data[index++] = OCC_NOT_DEFINED; //PM_ERROR
+ o_data[index++] = l_timeout;
+ l_numSets++;
+
o_data[l_numSetsOffset] = l_numSets;
o_size = index;
@@ -975,9 +1027,118 @@ void getAVSBusConfigMessageData( const TargetHandle_t i_occ,
o_data[index++] = 0xFF; //reserved
o_data[index++] = 0xFF; //reserved
o_size = index;
+
}
+// Send config data required by OCC for GPU handling.
+// The OCC will determine which GPUs are present from the APSS GPIOs.
+//
+// Message layout, in the order it is written below:
+//   1 byte   format (OCC_CFGDATA_GPU_CONFIG)
+//   1 byte   GPU config version (0x01)
+//   2 bytes  total non-GPU max power (W)
+//   2 bytes  total proc/mem power drop (W)
+//   2 bytes  reserved (0)
+//   then one record per GPU (MAX_GPUS records), each made of three
+//   4-byte sensor IDs: temperature, memory temperature, functional
+//
+// i_occ  - OCC target; its parent chip is used to look up the GPU sensors
+// o_data - preallocated buffer to fill in (asserted non-null; the caller
+//          must make it large enough for the whole message)
+// o_size - set to the number of bytes written into o_data
+//
+// A failure to read one class of sensor IDs is traced and committed, and
+// that class is then sent as all zeros; the message is still built.
+void getGPUConfigMessageData(const TargetHandle_t i_occ,
+                             uint8_t * o_data,
+                             uint64_t & o_size)
+{
+    // Write cursor into o_data
+    unsigned int index = 0;
+    assert(o_data != nullptr);
+
+    // Get system and proc target
+    Target* sys = nullptr;
+    targetService().getTopLevelTarget(sys);
+    assert(sys != nullptr);
+    ConstTargetHandle_t proc = getParentChip(i_occ);
+    assert(proc != nullptr);
+
+    // Populate the data
+    o_data[index++] = OCC_CFGDATA_GPU_CONFIG;
+    o_data[index++] = 0x01; // GPU Config Version
+
+    uint16_t power =
+        sys->getAttr<ATTR_CALCULATED_MAX_SYS_POWER_EXCLUDING_GPUS>();
+    UINT16_PUT(&o_data[index], power); // Total non-GPU max power (W)
+    index += 2;
+
+    power = sys->getAttr<ATTR_CALCULATED_PROC_MEMORY_POWER_DROP>();
+    UINT16_PUT(&o_data[index], power); // Total proc/mem power drop (W)
+    index += 2;
+    o_data[index++] = 0; // reserved
+    o_data[index++] = 0;
+
+    uint32_t gpu_func_sensors[MAX_GPUS] = {0};
+    uint32_t gpu_temp_sensors[MAX_GPUS] = {0};
+    uint32_t gpu_memtemp_sensors[MAX_GPUS] = {0};
+
+    // Read GPU sensor numbers.  getGpuSensors() takes a non-const target,
+    // so the const is dropped once here rather than at every call.
+    TARGETING::TargetHandle_t sensor_proc =
+        const_cast<TARGETING::TargetHandle_t>(proc);
+    uint8_t num_sensors = 0;
+    errlHndl_t err = nullptr;
+    err = SENSOR::getGpuSensors(sensor_proc,
+                                HWAS::GPU_FUNC_SENSOR,
+                                num_sensors, gpu_func_sensors);
+    if (err)
+    {
+        TMGT_ERR("getGPUConfigMessageData: getGpuSensors(GPU_FUNC_SENSOR)"
+                 " failed with rc 0x%04X", err->reasonCode());
+        ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
+        memset(gpu_func_sensors, 0, sizeof(gpu_func_sensors));
+    }
+    err = SENSOR::getGpuSensors(sensor_proc,
+                                HWAS::GPU_TEMPERATURE_SENSOR,
+                                num_sensors, gpu_temp_sensors);
+    if (err)
+    {
+        TMGT_ERR("getGPUConfigMessageData: getGpuSensors(GPU_TEMP_SENSOR)"
+                 " failed with rc 0x%04X", err->reasonCode());
+        ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
+        memset(gpu_temp_sensors, 0, sizeof(gpu_temp_sensors));
+    }
+    err = SENSOR::getGpuSensors(sensor_proc,
+                                HWAS::GPU_MEMORY_TEMP_SENSOR,
+                                num_sensors, gpu_memtemp_sensors);
+    if (err)
+    {
+        TMGT_ERR("getGPUConfigMessageData: getGpuSensors(GPU_MEM_TEMP_SENSOR)"
+                 " failed with rc 0x%04X", err->reasonCode());
+        ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
+        memset(gpu_memtemp_sensors, 0, sizeof(gpu_memtemp_sensors));
+    }
+
+    // Emit one record per GPU.  The loop counter is deliberately NOT named
+    // "index" so it cannot be confused with (or shadow) the write cursor.
+    for (unsigned int gpu = 0; gpu < MAX_GPUS; ++gpu)
+    {
+        // Sensors reported as invalid are sent to the OCC as 0
+        if (gpu_func_sensors[gpu] == TARGETING::UTIL::INVALID_IPMI_SENSOR)
+            gpu_func_sensors[gpu] = 0;
+        if (gpu_temp_sensors[gpu] == TARGETING::UTIL::INVALID_IPMI_SENSOR)
+            gpu_temp_sensors[gpu] = 0;
+        if (gpu_memtemp_sensors[gpu] == TARGETING::UTIL::INVALID_IPMI_SENSOR)
+            gpu_memtemp_sensors[gpu] = 0;
+
+        UINT32_PUT(&o_data[index], gpu_temp_sensors[gpu]);
+        index += 4;
+        UINT32_PUT(&o_data[index], gpu_memtemp_sensors[gpu]);
+        index += 4;
+        UINT32_PUT(&o_data[index], gpu_func_sensors[gpu]);
+        index += 4;
+    }
+
+    o_size = index;
+
+} // end getGPUConfigMessageData()
+
+
void getFrequencyPointMessageData(uint8_t* o_data,
uint64_t & o_size)
diff --git a/src/usr/htmgt/htmgt_cfgdata.H b/src/usr/htmgt/htmgt_cfgdata.H
index 126b5f4c7..9d632889d 100644
--- a/src/usr/htmgt/htmgt_cfgdata.H
+++ b/src/usr/htmgt/htmgt_cfgdata.H
@@ -28,6 +28,7 @@
#include <targeting/common/target.H>
#include "htmgt_occ.H"
+#define MAX_GPUS 3
namespace HTMGT
{
@@ -53,6 +54,7 @@ namespace HTMGT
OCC_CFGDATA_MEM_THROTTLE = 0x12, // Memory Throttle Settings
OCC_CFGDATA_TCT_CONFIG = 0x13, // Thermal Control Treshold
OCC_CFGDATA_AVSBUS_CONFIG = 0x14, // AVSBus Config
+ OCC_CFGDATA_GPU_CONFIG = 0x15, // GPU Config
OCC_CFGDATA_FORMAT_END, // Marker to indicate last entry
OCC_CFGDATA_CLEAR_ALL = 0xFF, // Clear All Active Config Data
@@ -68,10 +70,12 @@ namespace HTMGT
CFGDATA_CORES = 24,
- CFGDATA_FRU_TYPE_PROC = 0x00,
- CFGDATA_FRU_TYPE_MEMBUF = 0x01,
- CFGDATA_FRU_TYPE_DIMM = 0x02,
- CFGDATA_FRU_TYPE_VRM = 0x03,
+ CFGDATA_FRU_TYPE_PROC = 0x00,
+ CFGDATA_FRU_TYPE_MEMBUF = 0x01,
+ CFGDATA_FRU_TYPE_DIMM = 0x02,
+ CFGDATA_FRU_TYPE_VRM = 0x03,
+ CFGDATA_FRU_TYPE_GPU_CORE = 0x04,
+ CFGDATA_FRU_TYPE_GPU_MEMORY = 0x05,
CFDATA_DVFS_NOT_DEFINED = 0xFF,
};
@@ -116,6 +120,7 @@ namespace HTMGT
{ OCC_CFGDATA_MEM_THROTTLE, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
{ OCC_CFGDATA_TCT_CONFIG, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
{ OCC_CFGDATA_AVSBUS_CONFIG, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
+ { OCC_CFGDATA_GPU_CONFIG, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
};
const size_t OCC_CONFIG_TABLE_SIZE = sizeof(occCfgDataTable) /
sizeof(occCfgDataTable_t);
@@ -227,6 +232,18 @@ namespace HTMGT
uint8_t* o_data, uint64_t & o_size);
/**
+ * Fills in the GPU Configuration Data message buffer
+ *
+ * The message holds the format and version bytes, the total non-GPU
+ * max power, the total proc/memory power drop, two reserved bytes and,
+ * for each GPU, the temperature, memory temperature and functional
+ * sensor IDs.
+ *
+ * @param[in] i_occ - the OCC target
+ * @param[out] o_data - preallocated buffer to fill in
+ * @param[out] o_size - set to the message size
+ * @pre o_data is non-null (asserted) and large enough.
+ */
+ void getGPUConfigMessageData(const TARGETING::TargetHandle_t i_occ,
+ uint8_t * o_data,
+ uint64_t & o_size);
+
+ /**
* Fill in the Frequency Point Configuration Data
* message buffer.
*
diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C
index c6d230fad..4559b65a0 100644
--- a/src/usr/htmgt/htmgt_occ.C
+++ b/src/usr/htmgt/htmgt_occ.C
@@ -63,6 +63,7 @@ namespace HTMGT
iv_target(i_target),
iv_lastPollValid(false),
iv_occsPresent(1 << i_instance),
+ iv_gpuCfg(0),
iv_resetReason(OCC_RESET_REASON_NONE),
iv_exceptionLogged(0),
iv_resetCount(0),
@@ -363,6 +364,31 @@ namespace HTMGT
OCC_TRACE_INF );
}
+    // Notify HostBoot which GPUs are present (after OCC goes active).
+    // Each GPUCFG_GPUn_PRESENT bit of iv_gpuCfg is translated into a
+    // PRESENT / NOT_PRESENT status, and the resulting array is handed to
+    // SENSOR::updateGpuSensorStatus() for this OCC's parent chip.
+    void Occ::updateGpuPresence()
+    {
+        // updateGpuSensorStatus() takes a non-const target handle
+        TARGETING::ConstTargetHandle_t l_constProc =
+            TARGETING::getParentChip(iv_target);
+        TARGETING::TargetHandle_t l_proc =
+            const_cast<TARGETING::TargetHandle_t>(l_constProc);
+
+        // One status entry per GPU, derived from the GPU config bits
+        SENSOR::StatusSensor::statusEnum l_status[MAX_GPUS] =
+        {
+            (iv_gpuCfg & GPUCFG_GPU0_PRESENT)
+                ? SENSOR::StatusSensor::PRESENT
+                : SENSOR::StatusSensor::NOT_PRESENT,
+            (iv_gpuCfg & GPUCFG_GPU1_PRESENT)
+                ? SENSOR::StatusSensor::PRESENT
+                : SENSOR::StatusSensor::NOT_PRESENT,
+            (iv_gpuCfg & GPUCFG_GPU2_PRESENT)
+                ? SENSOR::StatusSensor::PRESENT
+                : SENSOR::StatusSensor::NOT_PRESENT
+        };
+
+        TMGT_INF("updateGpuPresence: OCC%d - GPU0:%d, GPU1:%d, GPU2:%d",
+                 iv_instance, l_status[0], l_status[1], l_status[2]);
+        SENSOR::updateGpuSensorStatus(l_proc, l_status);
+    }
+
/////////////////////////////////////////////////////////////////
@@ -752,7 +778,12 @@ namespace HTMGT
// Make sure all OCCs went to active state
for( const auto & occ : iv_occArray )
{
- if (requestedState != occ->getState())
+ if (requestedState == occ->getState())
+ {
+ // Update GPU present status
+ occ->updateGpuPresence();
+ }
+ else
{
TMGT_ERR("_setOccState: OCC%d is not in 0x%02X "
"state",
@@ -797,7 +828,6 @@ namespace HTMGT
"CHARACTERIZATION state");
}
}
-
}
}
}
diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H
index 73f081411..80df52ab7 100644
--- a/src/usr/htmgt/htmgt_occ.H
+++ b/src/usr/htmgt/htmgt_occ.H
@@ -339,6 +339,11 @@ namespace HTMGT
const occErrlCallout_t i_callout,
uint8_t & io_callout_num);
+ /**
+ * @brief Update the GPU presence sensors in the system
+ *
+ * Marks each GPU as PRESENT or NOT_PRESENT according to the
+ * GPUCFG_GPUn_PRESENT bits in iv_gpuCfg (saved from the poll
+ * response) and passes the result to SENSOR::updateGpuSensorStatus()
+ * for the parent chip of this OCC.
+ */
+ void updateGpuPresence();
+
protected:
// Instance number of this OCC: 0 = first physical OCC
uint8_t iv_instance;
@@ -366,6 +371,8 @@ namespace HTMGT
bool iv_lastPollValid;
// expected occsPresent byte in POLL response
uint8_t iv_occsPresent;
+ // GPU configuration from poll response data
+ uint8_t iv_gpuCfg;
occResetReason iv_resetReason;
diff --git a/src/usr/htmgt/htmgt_poll.C b/src/usr/htmgt/htmgt_poll.C
index 2457db373..3316f9425 100644
--- a/src/usr/htmgt/htmgt_poll.C
+++ b/src/usr/htmgt/htmgt_poll.C
@@ -355,6 +355,14 @@ namespace HTMGT
iv_instance, state_string(iv_state));
}
+ // Check GPU config
+ if (iv_gpuCfg != pollRsp->gpuCfg)
+ {
+ iv_gpuCfg = pollRsp->gpuCfg;
+ TMGT_INF("pollRspHandler: updating OCC%d GPU config to 0x%02X",
+ iv_instance, iv_gpuCfg);
+ }
+
// Copy rspData to lastPollResponse
memcpy(iv_lastPollResponse, pollRsp, OCC_POLL_DATA_MIN_SIZE);
iv_lastPollValid = true;
diff --git a/src/usr/htmgt/htmgt_poll.H b/src/usr/htmgt/htmgt_poll.H
index db18393a3..d3c9526fc 100644
--- a/src/usr/htmgt/htmgt_poll.H
+++ b/src/usr/htmgt/htmgt_poll.H
@@ -47,6 +47,14 @@ namespace HTMGT
const uint8_t OCC_XSTATUS_MEM_THROT_OT = 0x20;
const uint8_t OCC_XSTATUS_N_POWER = 0x10;
+ // GPU Config bits (masks tested against the gpuCfg byte of the poll response)
+ enum gpuConfig_e
+ {
+ GPUCFG_GPU2_PRESENT = 0x04, // set when GPU2 is present
+ GPUCFG_GPU1_PRESENT = 0x02, // set when GPU1 is present
+ GPUCFG_GPU0_PRESENT = 0x01 // set when GPU0 is present
+ };
+
struct occPollRspStruct_t
{
uint8_t status;
@@ -59,7 +67,8 @@ namespace HTMGT
uint8_t errorId;
uint32_t errorAddress;
uint16_t errorLength;
- uint16_t reserved[2];
+ uint8_t reserved;
+ uint8_t gpuCfg;
uint8_t codeLevel[16];
uint8_t sensor[6];
uint8_t numBlocks;
diff --git a/src/usr/htmgt/occError.C b/src/usr/htmgt/occError.C
index 87243996f..70abd1c71 100644
--- a/src/usr/htmgt/occError.C
+++ b/src/usr/htmgt/occError.C
@@ -369,6 +369,13 @@ namespace HTMGT
l_success = false;
}
}
+ else if (i_callout.type == OCC_CALLOUT_TYPE_GPU_SENSOR)
+ {
+ const uint32_t sensor = (uint32_t)i_callout.calloutValue;
+ io_errlHndl->addSensorCallout(sensor, HWAS::GPU_FUNC_SENSOR,
+ i_priority);
+ io_callout_num++;
+ }
else
{
TMGT_ERR("elogAddCallout: Invalid callout type (type=%d)",
diff --git a/src/usr/htmgt/occError.H b/src/usr/htmgt/occError.H
index 0345d582d..0f0a284be 100644
--- a/src/usr/htmgt/occError.H
+++ b/src/usr/htmgt/occError.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2014,2016 */
+/* Contributors Listed Below - COPYRIGHT 2014,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -50,6 +50,7 @@ namespace HTMGT
{
OCC_CALLOUT_TYPE_SENSOR = 0x01,
OCC_CALLOUT_TYPE_COMPONENT_ID = 0x02,
+ OCC_CALLOUT_TYPE_GPU_SENSOR = 0x03,
};
// TMGT-OCC Component Ids
OpenPOWER on IntegriCloud