diff options
author | Matt Derksen <mderkse1@us.ibm.com> | 2017-08-24 10:45:28 -0500 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2017-09-05 10:29:35 -0400 |
commit | 14187f38b431641e65558be7a15fbdcd11a75fe3 (patch) | |
tree | 7b28a5733613006561eeae4474b74626835fad0e /src/usr/ipmi | |
parent | 9a2410a8912c7af427ac4a10ad3ffbd3c56920a1 (diff) | |
download | talos-hostboot-14187f38b431641e65558be7a15fbdcd11a75fe3.tar.gz talos-hostboot-14187f38b431641e65558be7a15fbdcd11a75fe3.zip |
New OCC/HTMGT interfaces for GPU sensor support
getGpuSensors() and updateGpuSensorStatus()
Depends-on: I8a0de390516fd02df07860b960db506899b13f14
Change-Id: I290876d0e5f4889e6f2b1a45b5f81172acb28caf
RTC:178218
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45196
Reviewed-by: Martin Gloff <mgloff@us.ibm.com>
Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/ipmi')
-rw-r--r-- | src/usr/ipmi/ipmifruinv.C | 55 | ||||
-rw-r--r-- | src/usr/ipmi/ipmisensor.C | 315 |
2 files changed, 369 insertions, 1 deletions
diff --git a/src/usr/ipmi/ipmifruinv.C b/src/usr/ipmi/ipmifruinv.C index 1e13579be..8903535a8 100644 --- a/src/usr/ipmi/ipmifruinv.C +++ b/src/usr/ipmi/ipmifruinv.C @@ -34,6 +34,7 @@ #include <targeting/common/utilFilter.H> #include <errl/errlmanager.H> #include <ipmi/ipmifruinv.H> +#include <ipmi/ipmisensor.H> #include "ipmifru.H" #include "ipmifruinvprvt.H" #include <stdio.h> @@ -1685,6 +1686,60 @@ void IPMIFRUINV::setData(bool i_updateData) IPMIFRUINV::clearData(it->first); } } + + // Only send GPU sensor PRESENT status one time (no update), + // then allow HTMGT to update + + // Go through processors and send GPU sensor status + // Get all Proc targets + TARGETING::TargetHandleList l_procTargetList; + getAllChips(l_procTargetList, TARGETING::TYPE_PROC); + + uint32_t gpu_sensors[SENSOR::MAX_GPU_SENSORS_PER_PROCESSOR]; + uint8_t num_valid_sensors = 0; + for (const auto & l_procChip: l_procTargetList) + { + // report present GPU sensors + l_errl = SENSOR::getGpuSensors( l_procChip, + HWAS::GPU_FUNC_SENSOR, + num_valid_sensors, + gpu_sensors ); + if (!l_errl) + { + // build up present GPUs based on sensor data returned + SENSOR::StatusSensor::statusEnum + gpu_status[SENSOR::MAX_PROCESSOR_GPUS]; + + // initialize to NOT PRESENT + for (uint8_t j = 0; j < SENSOR::MAX_PROCESSOR_GPUS; j++) + { + gpu_status[j] = + SENSOR::StatusSensor::statusEnum::NOT_PRESENT; + } + + // now change the PRESENT ones + for (uint8_t i = 0; + i < SENSOR::MAX_GPU_SENSORS_PER_PROCESSOR; i++) + { + if (i < SENSOR::MAX_PROCESSOR_GPUS) + { + if (gpu_sensors[i] != + TARGETING::UTIL::INVALID_IPMI_SENSOR) + { + gpu_status[i] = + SENSOR::StatusSensor::statusEnum::PRESENT; + } + } + else + { + break; + } + } + + // Send the present/non-present GPU sensors + SENSOR::updateGpuSensorStatus( l_procChip, gpu_status); + } + } } } while(0); diff --git a/src/usr/ipmi/ipmisensor.C b/src/usr/ipmi/ipmisensor.C index c4d0883d6..59cfcd41f 100644 --- a/src/usr/ipmi/ipmisensor.C +++ b/src/usr/ipmi/ipmisensor.C @@ -36,6 +36,9 @@ #include <targeting/common/utilFilter.H> #include <ipmi/ipmi_reasoncodes.H> #include <endian.h> +#include <vpd/pvpdenums.H> +#include <devicefw/userif.H> + extern trace_desc_t * g_trac_ipmi; @@ -218,7 +221,7 @@ namespace SENSOR else { TRACFCOMP(g_trac_ipmi,"We were not able to find a sensor number in" - " the IPMI_SENSORS attribute for sensor_name=0x%x" + " the IPMI_SENSORS attribute for sensor_name=0x%x " "for target with huid=0x%x, skipping call to " "sendSetSensorReading()", iv_name, TARGETING::get_huid( iv_target )); @@ -729,6 +732,32 @@ namespace SENSOR }; //************************************************************************** + // GpuSensor constructor + //************************************************************************** + GpuSensor::GpuSensor(TARGETING::SENSOR_NAME i_name, uint16_t i_num, + TARGETING::ConstTargetHandle_t i_target) + : StatusSensor(i_target) + + { + /* Note: StatusSensor sets these for processor target */ + //iv_functionalOffset = PROC_DISABLED; + //iv_presentOffset = PROC_PRESENCE_DETECTED; + + // Override iv_name set by parent constructor + iv_name = i_name; + + // 3 numbers possible (1 per GPU) for each name, so save which one + iv_sensorNumber = i_num; + }; + + //************************************************************************** + // GpuSensor destructor + //************************************************************************** + GpuSensor::~GpuSensor() + { + } + + //************************************************************************** // FaultSensor constructor //************************************************************************** @@ -1251,4 +1280,288 @@ namespace SENSOR return TARGETING::UTIL::getSensorNumber(nodes[0], TARGETING::SENSOR_NAME_BACKPLANE_FAULT); } + + /** + * @brief All sensors returned cfgID bit + * NV keyword = 0x02 --> 0b0100 -> 4 + */ + static const uint16_t NVCFG_ALL_SENSORS_RETURNED = 4; + + /** + * @brief Helper function to getGpuSensors() + * NV keyword tells us what backplane is installed, + * thus what GPUs are supported + * + * @param[out] returns NV keyword in bitwise format + * + * @return Error log handle if a deviceRead fails + */ + errlHndl_t getNVCfgIDBit(uint16_t & o_cfgID_bitwise) + { + static uint16_t L_NV_bits = 0; + errlHndl_t l_err = nullptr; + + if (L_NV_bits == 0) + { + // grab system enclosure node + TARGETING::TargetHandle_t l_sys = NULL; + TARGETING::TargetHandleList l_nodeTargets; + TARGETING::targetService().getTopLevelTarget(l_sys); + assert(l_sys != NULL); + getChildAffinityTargets(l_nodeTargets, l_sys, TARGETING::CLASS_ENC, + TARGETING::TYPE_NODE); + assert(!l_nodeTargets.empty()); + + // get keyword size first + PVPD::pvpdRecord l_Record = PVPD::VNDR; + PVPD::pvpdKeyword l_KeyWord = PVPD::NV; + size_t l_nvKwdSize = 0; + l_err = deviceRead(l_nodeTargets[0],NULL,l_nvKwdSize, + DEVICE_PVPD_ADDRESS(l_Record,l_KeyWord)); + if (!l_err) + { + uint8_t l_kwd[l_nvKwdSize] = {0}; + // now read the keyword + l_err = deviceRead(l_nodeTargets[0],l_kwd,l_nvKwdSize, + DEVICE_PVPD_ADDRESS(l_Record,l_KeyWord)); + if (!l_err) + { + uint8_t cfgID = l_kwd[l_nvKwdSize-1]; + if (cfgID < 16) // maximum setting (bits 0-15) + { + L_NV_bits = 0x0001 << cfgID; + } + } + else + { + TRACFCOMP(g_trac_ipmi,ERR_MRK"%.8X Error getting VNDR record data",l_err->eid()); + } + } + else + { + TRACFCOMP(g_trac_ipmi,ERR_MRK"%.8X Error getting VNDR record size",l_err->eid()); + } + } + + o_cfgID_bitwise = L_NV_bits; + + return l_err; + } + + /** + * @brief Grab the GPU sensor type IDs for a particular processor target + * + * Will return all sensor ids that match the type for a given target. + * + * @param[in] - i_proc - processor target + * @param[in] - i_type - Functional/state, gpucoretemp, gpumemtemp + * @param[out] - o_num_ids - number of valid IDs returned in o_ids + * @param[out] - o_ids - ordered list of sensor IDs + * + * @return Errorlog handle + */ + errlHndl_t getGpuSensors( TARGETING::Target* i_proc, + HWAS::sensorTypeEnum i_type, + uint8_t & o_num_ids, + uint32_t o_ids[MAX_GPU_SENSORS_PER_PROCESSOR] ) + { + static uint16_t L_obus_cfgID_bit = 0; + errlHndl_t l_errl = nullptr; + + // default to no ids returned + o_num_ids = 0; + + TARGETING::AttributeTraits<TARGETING::ATTR_GPU_SENSORS>::Type + l_sensorArray; + + bool foundSensors = i_proc->tryGetAttr<TARGETING::ATTR_GPU_SENSORS> + (l_sensorArray); + + // Verify we are getting non-default values + if (foundSensors && l_sensorArray[0][0] != 0) + { + // Figure out which backplane we have + // Only read NV keyword once (if possible) + if (L_obus_cfgID_bit == 0) + { + l_errl = getNVCfgIDBit(L_obus_cfgID_bit); + if (l_errl || (L_obus_cfgID_bit == 0)) + { + delete l_errl; + // default to full list of GPU sensors + L_obus_cfgID_bit = NVCFG_ALL_SENSORS_RETURNED; + } + } + + uint32_t elementCount = (sizeof(l_sensorArray)/ + sizeof(l_sensorArray[0])); + TRACFCOMP(g_trac_ipmi,"getGpuSensors() -> GPU_SENSORS array size = %d, cfgBit = 0x%x", + elementCount, L_obus_cfgID_bit); + + // verify array index won't exceed output array (o_ids) + assert(elementCount <= MAX_GPU_SENSORS_PER_PROCESSOR); + + // now cycle through each GPU row + for (uint32_t index = 0; index < elementCount; index++) + { + uint16_t * row_ptr = &l_sensorArray[index][0]; + + TRACFCOMP(g_trac_ipmi,"getGpuSensors() -> ROW %d, 0x%04X, 0x%X, 0x%04X, 0x%X, 0x%04X, 0x%X, 0x%X", + index, row_ptr[0], row_ptr[1], row_ptr[2], + row_ptr[3], row_ptr[4], row_ptr[5], row_ptr[6]); + + // Include Sensor if the GPU is present in the current OBUS_CFG + if ((L_obus_cfgID_bit & + row_ptr[TARGETING::GPU_SENSOR_ARRAY_OBUS_CFG_OFFSET]) + == L_obus_cfgID_bit ) + { + switch(i_type) + { + case HWAS::GPU_FUNC_SENSOR: + o_ids[index] = + row_ptr[TARGETING::GPU_SENSOR_ARRAY_FUNC_ID_OFFSET]; + o_num_ids++; + break; + case HWAS::GPU_MEMORY_TEMP_SENSOR: + o_ids[index] = + row_ptr[TARGETING::GPU_SENSOR_ARRAY_MEM_TEMP_ID_OFFSET]; + o_num_ids++; + break; + case HWAS::GPU_TEMPERATURE_SENSOR: + o_ids[index] = + row_ptr[TARGETING::GPU_SENSOR_ARRAY_TEMP_ID_OFFSET]; + o_num_ids++; + break; + default: + TRACFCOMP(g_trac_ipmi,"getGpuSensors() -> unknown sensor type 0x%02X", i_type); + o_ids[index] = TARGETING::UTIL::INVALID_IPMI_SENSOR; + } + } + else + { + o_ids[index] = TARGETING::UTIL::INVALID_IPMI_SENSOR; + } + TRACFCOMP(g_trac_ipmi, + "getGpuSensors() -> o_id[%d] = 0x%X", index, o_ids[index]); + } // end of for loop + } // end of if check for non-default values + + return NULL; + } // end getGpuSensors() + + + /** + * @brief Helper function that sends GPU sensor status to BMC + * + * @param[in] sensor name (IPMI_SENSOR_TYPE with IPMI_ENTITY_ID) + * @param[in] sensor id number + * @param[in] processor target + * @param[in] status to send for the identified GPU sensor + */ + void sendGpuSensorStatus(uint16_t i_sensor_name_value, + uint16_t i_sensor_id, + TARGETING::ConstTargetHandle_t i_target, + StatusSensor::statusEnum & i_status) + { + TRACFCOMP(g_trac_ipmi, "sendGpuSensorStatus(0x%0X, 0x%X, Target 0x%X, status: %d)", + i_sensor_name_value, i_sensor_id, + TARGETING::get_huid(i_target), i_status); + + TARGETING::SENSOR_NAME l_sensor_name; + switch(i_sensor_name_value) + { + case TARGETING::SENSOR_NAME_GPU_TEMP: + l_sensor_name = TARGETING::SENSOR_NAME_GPU_TEMP; + break; + case TARGETING::SENSOR_NAME_GPU_STATE: + l_sensor_name = TARGETING::SENSOR_NAME_GPU_STATE; + break; + case TARGETING::SENSOR_NAME_GPU_MEM_TEMP: + l_sensor_name = TARGETING::SENSOR_NAME_GPU_MEM_TEMP; + break; + default: + TRACFCOMP(g_trac_ipmi, "sendGpuSensorStatus(0x%0X, 0x%X) - unknown GPU sensor name", + i_sensor_name_value, i_sensor_id); + l_sensor_name = TARGETING::SENSOR_NAME_FAULT; + break; + } + + // Only update if we found a valid gpu sensor name + if (l_sensor_name != TARGETING::SENSOR_NAME_FAULT) + { + // create a GPU status sensor for our needs + GpuSensor l_sensor(l_sensor_name, i_sensor_id, i_target); + + // send the status to the BMC + errlHndl_t l_err = l_sensor.setStatus( i_status ); + + // commit the error and move to the next target + if( l_err ) + { + errlCommit( l_err, IPMI_COMP_ID ); + } + } + } + + /** + * @brief Updates GPU sensor status for GPUs on this + * particular processor target + * + * @param[in] - i_proc - processor target + * @param[in] - i_gpu_status - status of GPU0, GPU1 and GPU2 + */ + void updateGpuSensorStatus( TARGETING::Target* i_proc, + StatusSensor::statusEnum i_gpu_status[MAX_PROCESSOR_GPUS] ) + { + uint16_t obus_cfgID_bit = 0; + + TARGETING::AttributeTraits<TARGETING::ATTR_GPU_SENSORS>::Type + l_sensorArray; + + bool foundSensors = i_proc->tryGetAttr<TARGETING::ATTR_GPU_SENSORS> + (l_sensorArray); + + // Verify we are getting non-default values + if (foundSensors && (l_sensorArray[0][0] != 0)) + { + // Figure out which backplane we have + // Only read NV keyword once (if possible) + errlHndl_t l_errl = getNVCfgIDBit(obus_cfgID_bit); + if (l_errl || (obus_cfgID_bit == 0)) + { + // default to all sensors + obus_cfgID_bit = NVCFG_ALL_SENSORS_RETURNED; + delete l_errl; + } + + uint32_t elementCount = (sizeof(l_sensorArray)/ + sizeof(l_sensorArray[0])); + TRACDCOMP(g_trac_ipmi,"updateGpuSensorStatus() -> array size = %d, cfgBit = 0x%x", + elementCount, obus_cfgID_bit); + + // verify array index won't exceed output array (o_ids) + assert(elementCount <= MAX_PROCESSOR_GPUS); + + // now cycle through each GPU row + for (uint8_t index = 0; index < MAX_PROCESSOR_GPUS; index++) + { + uint16_t * sensor_row_ptr = &l_sensorArray[index][0]; + StatusSensor::statusEnum newStatus = i_gpu_status[index]; + + // Include Sensor if the GPU is present in the current OBUS_CFG + if ((obus_cfgID_bit & + sensor_row_ptr[TARGETING::GPU_SENSOR_ARRAY_OBUS_CFG_OFFSET]) + == obus_cfgID_bit ) + { + // Only update the GPU status sensors, skip temperature ones + // GPU core Status/Functional Sensor + uint16_t sensor_name = + sensor_row_ptr[TARGETING::GPU_SENSOR_ARRAY_FUNC_OFFSET]; + uint16_t sensor_id = + sensor_row_ptr[TARGETING::GPU_SENSOR_ARRAY_FUNC_ID_OFFSET]; + sendGpuSensorStatus(sensor_name,sensor_id,i_proc,newStatus); + } + } // end of GPU loop + } // end of if check for non-default values + } // end of updateGpuSensorStatus() }; // end name space |