diff options
author | mbroyles <mbroyles@us.ibm.com> | 2017-09-25 14:26:08 -0500 |
---|---|---|
committer | Martha Broyles <mbroyles@us.ibm.com> | 2017-10-18 12:33:08 -0400 |
commit | bcc1e17b10e570791acb64649bbb7bd5e4d9348d (patch) | |
tree | c8967b8ceb72ce92e015301a0e09bd98b599d7cf | |
parent | fa085f9b68802ec92a250eccca6ddb4152e7b61f (diff) | |
download | talos-occ-bcc1e17b10e570791acb64649bbb7bd5e4d9348d.tar.gz talos-occ-bcc1e17b10e570791acb64649bbb7bd5e4d9348d.zip |
Support for GPU config format version 2
Change-Id: I14108b7a5ea7ce4e3649ab164a6e6c905274c635
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46765
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c | 181 | ||||
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h | 32 | ||||
-rwxr-xr-x | src/occ_405/occ_sys_config.c | 7 | ||||
-rwxr-xr-x | src/occ_405/occ_sys_config.h | 11 |
4 files changed, 208 insertions, 23 deletions
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c index 5982e7f..52cb637 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c @@ -75,6 +75,13 @@ extern uint8_t G_occ_interrupt_type; extern uint16_t G_proc_fmax_mhz; // Maximum frequency (uturbo if WOF enabled, otherwise turbo) extern OCCPstateParmBlock G_oppb; // OCC Pstate Parameters Block Structure +extern uint32_t G_first_proc_gpu_config; +extern uint32_t G_first_num_gpus_sys; +extern uint32_t G_curr_num_gpus_sys; +extern uint32_t G_curr_proc_gpu_config; +extern bool G_gpu_config_done; +extern bool G_gpu_monitoring_allowed; +extern task_t G_task_table[TASK_END]; typedef struct data_req_table { @@ -1262,36 +1269,166 @@ errlHndl_t data_store_gpu(const cmdh_fsp_cmd_t * i_cmd_ptr, cmdh_fsp_rsp_t * o_rsp_ptr) { errlHndl_t l_err = NULL; - const uint8_t GPU_VERSION = 0x01; - const uint16_t GPU_LENGTH = sizeof(cmdh_gpu_config_t) - sizeof(cmdh_fsp_cmd_header_t); + uint8_t i = 0; + uint8_t l_gpu_num = 0; + cmdh_gpu_config_v2_t *l_cmd_ptr = (cmdh_gpu_config_v2_t *)i_cmd_ptr; + uint16_t l_data_length = CMDH_DATALEN_FIELD_UINT16((&l_cmd_ptr->header)); + uint16_t l_gpu_data_length = 0; + uint8_t l_present_bit_mask = 0; // Bit mask for present GPUs behind this OCC + + // parse data based on version. Version byte is located at same offset for all versions + if(l_cmd_ptr->header.version == 1) + { + cmdh_gpu_config_t *l_cmd_ptr_v1 = (cmdh_gpu_config_t *)i_cmd_ptr; + l_data_length = CMDH_DATALEN_FIELD_UINT16(l_cmd_ptr_v1); + l_gpu_data_length = sizeof(cmdh_gpu_config_t) - sizeof(cmdh_fsp_cmd_header_t); + if(l_gpu_data_length == l_data_length) + { + G_sysConfigData.total_non_gpu_max_pwr_watts = l_cmd_ptr_v1->total_non_gpu_max_pwr_watts; + G_sysConfigData.total_proc_mem_pwr_drop_watts = l_cmd_ptr_v1->total_proc_mem_pwr_drop_watts; - cmdh_gpu_config_t *l_cmd_ptr = (cmdh_gpu_config_t *)i_cmd_ptr; - uint16_t l_data_length = CMDH_DATALEN_FIELD_UINT16(l_cmd_ptr); + AMECSENSOR_PTR(TEMPGPU0)->ipmi_sid = l_cmd_ptr_v1->gpu0_temp_sid; + AMECSENSOR_PTR(TEMPGPU0MEM)->ipmi_sid = l_cmd_ptr_v1->gpu0_mem_temp_sid; + G_sysConfigData.gpu_sensor_ids[0] = l_cmd_ptr_v1->gpu0_sid; - if ((GPU_VERSION == l_cmd_ptr->version) && (GPU_LENGTH == l_data_length)) + AMECSENSOR_PTR(TEMPGPU1)->ipmi_sid = l_cmd_ptr_v1->gpu1_temp_sid; + AMECSENSOR_PTR(TEMPGPU1MEM)->ipmi_sid = l_cmd_ptr_v1->gpu1_mem_temp_sid; + G_sysConfigData.gpu_sensor_ids[1] = l_cmd_ptr_v1->gpu1_sid; + + AMECSENSOR_PTR(TEMPGPU2)->ipmi_sid = l_cmd_ptr_v1->gpu2_temp_sid; + AMECSENSOR_PTR(TEMPGPU2MEM)->ipmi_sid = l_cmd_ptr_v1->gpu2_mem_temp_sid; + G_sysConfigData.gpu_sensor_ids[2] = l_cmd_ptr_v1->gpu2_sid; + + G_data_cnfg->data_mask |= DATA_MASK_GPU; + CMDH_TRAC_IMP("data_store_gpu: Got valid GPU data packet"); + } + else + { + CMDH_TRAC_ERR("data_store_gpu: GPU version 1 invalid length Expected: 0x%04X Received: 0x%04X", + l_gpu_data_length, l_data_length); + cmdh_build_errl_rsp(i_cmd_ptr, o_rsp_ptr, ERRL_RC_INVALID_CMD_LEN, &l_err); + } + } // if version 1 + else if(l_cmd_ptr->header.version == 2) { - G_sysConfigData.total_non_gpu_max_pwr_watts = l_cmd_ptr->total_non_gpu_max_pwr_watts; - G_sysConfigData.total_proc_mem_pwr_drop_watts = l_cmd_ptr->total_proc_mem_pwr_drop_watts; + l_gpu_data_length = sizeof(cmdh_gpu_cfg_header_v2_t) - sizeof(cmdh_fsp_cmd_header_t); + l_gpu_data_length += (l_cmd_ptr->header.num_data_sets * sizeof(cmdh_gpu_set_v2_t)); + + if(l_gpu_data_length == l_data_length) + { + if( (l_cmd_ptr->header.gpu_i2c_engine == PIB_I2C_ENGINE_C) && + ((l_cmd_ptr->header.gpu_i2c_bus_voltage == 0) || (l_cmd_ptr->header.gpu_i2c_bus_voltage == 18)) ) + { + G_sysConfigData.gpu_i2c_engine = l_cmd_ptr->header.gpu_i2c_engine; + G_sysConfigData.gpu_i2c_bus_voltage = l_cmd_ptr->header.gpu_i2c_bus_voltage; + CMDH_TRAC_IMP("data_store_gpu: I2C engine = 0x%02X I2C bus voltage = %d deci volts", + G_sysConfigData.gpu_i2c_engine, G_sysConfigData.gpu_i2c_bus_voltage); - AMECSENSOR_PTR(TEMPGPU0)->ipmi_sid = l_cmd_ptr->gpu0_temp_sid; - AMECSENSOR_PTR(TEMPGPU0MEM)->ipmi_sid = l_cmd_ptr->gpu0_mem_temp_sid; - G_sysConfigData.gpu_sensor_ids[0] = l_cmd_ptr->gpu0_sid; + G_sysConfigData.total_non_gpu_max_pwr_watts = l_cmd_ptr->header.total_non_gpu_max_pwr_watts; + G_sysConfigData.total_proc_mem_pwr_drop_watts = l_cmd_ptr->header.total_proc_mem_pwr_drop_watts; - AMECSENSOR_PTR(TEMPGPU1)->ipmi_sid = l_cmd_ptr->gpu1_temp_sid; - AMECSENSOR_PTR(TEMPGPU1MEM)->ipmi_sid = l_cmd_ptr->gpu1_mem_temp_sid; - G_sysConfigData.gpu_sensor_ids[1] = l_cmd_ptr->gpu1_sid; + // Store the individual GPU data + for(i=0; i<l_cmd_ptr->header.num_data_sets; i++) + { + // Get the GPU number data is for + l_gpu_num = l_cmd_ptr->gpu_data[i].gpu_num; - AMECSENSOR_PTR(TEMPGPU2)->ipmi_sid = l_cmd_ptr->gpu2_temp_sid; - AMECSENSOR_PTR(TEMPGPU2MEM)->ipmi_sid = l_cmd_ptr->gpu2_mem_temp_sid; - G_sysConfigData.gpu_sensor_ids[2] = l_cmd_ptr->gpu2_sid; + if( (l_gpu_num >= 0) && (l_gpu_num < MAX_NUM_GPU_PER_DOMAIN) ) + { + G_sysConfigData.gpu_i2c_info[l_gpu_num].port = l_cmd_ptr->gpu_data[i].i2c_port; + G_sysConfigData.gpu_i2c_info[l_gpu_num].address = l_cmd_ptr->gpu_data[i].i2c_addr; - G_data_cnfg->data_mask |= DATA_MASK_GPU; - CMDH_TRAC_IMP("data_store_gpu: Got valid GPU data packet"); - } + // if port or i2c address is 0xFF the GPU will not be monitored + if( (G_sysConfigData.gpu_i2c_info[l_gpu_num].port != 0xFF) && + (G_sysConfigData.gpu_i2c_info[l_gpu_num].address != 0xFF) ) + { + CMDH_TRAC_IMP("data_store_gpu: GPU%d I2C port = 0x%02X address = 0x%02X", l_gpu_num, + G_sysConfigData.gpu_i2c_info[l_gpu_num].port, + G_sysConfigData.gpu_i2c_info[l_gpu_num].address); + + AMECSENSOR_PTR(TEMPGPU0 + l_gpu_num)->ipmi_sid = l_cmd_ptr->gpu_data[i].gpu_temp_sid; + AMECSENSOR_PTR(TEMPGPU0MEM + l_gpu_num)->ipmi_sid = l_cmd_ptr->gpu_data[i].gpu_mem_temp_sid; + G_sysConfigData.gpu_sensor_ids[l_gpu_num] = l_cmd_ptr->gpu_data[i].gpu_sid; + + // If there is no APSS this data is giving GPU presence, mark this GPU as present + if(G_pwr_reading_type != PWR_READING_TYPE_APSS) + { + l_present_bit_mask |= (0x01 << l_gpu_num); + } + } + else + { + CMDH_TRAC_ERR("data_store_gpu: GPU%d NOT monitored Invalid I2C port = 0x%02X I2C address = 0x%02X", + l_gpu_num, G_sysConfigData.gpu_i2c_info[l_gpu_num].port, + G_sysConfigData.gpu_i2c_info[l_gpu_num].address); + } + } + else + { + // We got an invalid GPU number + CMDH_TRAC_ERR("data_store_gpu: Received invalid GPU number %d", l_gpu_num); + cmdh_build_errl_rsp(i_cmd_ptr, o_rsp_ptr, ERRL_RC_INVALID_DATA, &l_err); + break; + } + } // for each GPU data set + + // if there is no APSS for GPU presence then this data is the GPU presence + if(G_pwr_reading_type != PWR_READING_TYPE_APSS) + { + if(l_err == NULL) + { + G_first_num_gpus_sys = l_cmd_ptr->header.total_num_gpus_system; + G_curr_num_gpus_sys = G_first_num_gpus_sys; + G_first_proc_gpu_config = l_present_bit_mask; + G_curr_proc_gpu_config = G_first_proc_gpu_config; + G_gpu_config_done = TRUE; + + if(G_first_proc_gpu_config) + { + // GPUs are present enable monitoring + G_gpu_monitoring_allowed = TRUE; + G_task_table[TASK_ID_GPU_SM].flags = GPU_RTL_FLAGS; + } + + CMDH_TRAC_IMP("data_store_gpu: This OCC GPUs present mask = 0x%02X Total number GPUs present in system = %d", + G_first_proc_gpu_config, G_first_num_gpus_sys); + + G_data_cnfg->data_mask |= DATA_MASK_GPU; + CMDH_TRAC_IMP("data_store_gpu: Got valid GPU data packet"); + } + else + { + G_first_num_gpus_sys = 0; + G_curr_num_gpus_sys = 0; + G_first_proc_gpu_config = 0; + G_curr_proc_gpu_config = 0; + G_gpu_config_done = FALSE; + } + } + else if(l_err == NULL) + { + G_data_cnfg->data_mask |= DATA_MASK_GPU; + CMDH_TRAC_IMP("data_store_gpu: Got valid GPU data packet"); + } + } // valid i2c engine and voltage + else + { + // We got an invalid I2C Engine and/or voltage + CMDH_TRAC_ERR("data_store_gpu: Received invalid I2C Engine/Voltage 0x%02X / %d", + l_cmd_ptr->header.gpu_i2c_engine, l_cmd_ptr->header.gpu_i2c_bus_voltage); + cmdh_build_errl_rsp(i_cmd_ptr, o_rsp_ptr, ERRL_RC_INVALID_DATA, &l_err); + } + } // if length valid + else + { + CMDH_TRAC_ERR("data_store_gpu: GPU version 2 invalid length Expected: 0x%04X Received: 0x%04X", + l_gpu_data_length, l_data_length); + cmdh_build_errl_rsp(i_cmd_ptr, o_rsp_ptr, ERRL_RC_INVALID_CMD_LEN, &l_err); + } + } //else if version 2 else { - CMDH_TRAC_ERR("data_store_gpu: Invalid GPU version/length (0x%02X/0x%04X))", - l_cmd_ptr->version, l_data_length); + CMDH_TRAC_ERR("data_store_gpu: Invalid GPU version 0x%02X", l_cmd_ptr->header.version); cmdh_build_errl_rsp(i_cmd_ptr, o_rsp_ptr, ERRL_RC_INVALID_DATA, &l_err); } @@ -1735,7 +1872,7 @@ errlHndl_t data_store_thrm_thresholds(const cmdh_fsp_cmd_t * i_cmd_ptr, G_data_cnfg->thrm_thresh.data[l_frutype].max_read_timeout = l_cmd_ptr->data[i].max_read_timeout; - // Set a local flag if we get data for VRM FRU type + // Set a local flag if we get data for VRM OT status FRU type if(l_frutype == DATA_FRU_VRM_OT_STATUS) { l_vrm_frutype = TRUE; diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h index dbeb768..5dac7fa 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h +++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h @@ -36,6 +36,7 @@ #include "cmdh_fsp.h" #include "cmdh_fsp_cmds.h" #include "apss.h" +#include "occ_sys_config.h" // Enum of the various CnfgData command formats that // are sent to OCC over the TMGT<->OCC interface. @@ -165,6 +166,37 @@ typedef struct __attribute__ ((packed)) }cmdh_avsbus_config_t; // Used by TMGT to send OCC GPU data. +// Header data for GPU version 2 cfg packet +typedef struct __attribute__ ((packed)) +{ + struct cmdh_fsp_cmd_header; + uint8_t format; + uint8_t version; + uint16_t total_non_gpu_max_pwr_watts; + uint16_t total_proc_mem_pwr_drop_watts; + uint8_t total_num_gpus_system; + uint8_t gpu_i2c_engine; + uint8_t gpu_i2c_bus_voltage; + uint8_t num_data_sets; +}cmdh_gpu_cfg_header_v2_t; + +typedef struct __attribute__ ((packed)) +{ + uint8_t gpu_num; // number 0...2 for GPU data is for + uint8_t i2c_port; // I2C port for this GPU + uint8_t i2c_addr; // I2C slave address for this GPU + uint8_t reserved; + uint32_t gpu_temp_sid; // GPU Core Temperature Sensor ID + uint32_t gpu_mem_temp_sid; // GPU Memory Temperature Sensor ID + uint32_t gpu_sid; // GPU Sensor ID for callout +}cmdh_gpu_set_v2_t; + +typedef struct __attribute__ ((packed)) +{ + cmdh_gpu_cfg_header_v2_t header; + cmdh_gpu_set_v2_t gpu_data[MAX_NUM_GPU_PER_DOMAIN]; +}cmdh_gpu_config_v2_t; + typedef struct __attribute__ ((packed)) { struct cmdh_fsp_cmd_header; diff --git a/src/occ_405/occ_sys_config.c b/src/occ_405/occ_sys_config.c index 20d5022..93f0805 100755 --- a/src/occ_405/occ_sys_config.c +++ b/src/occ_405/occ_sys_config.c @@ -230,6 +230,13 @@ occSysConfigData_t G_sysConfigData = .total_non_gpu_max_pwr_watts = 0, .total_proc_mem_pwr_drop_watts = 0, .psr = 100, // default to 100% take all possible power away from CPU/memory first + .gpu_i2c_engine = PIB_I2C_ENGINE_C, // default to engine C + .gpu_i2c_bus_voltage = 18, // default to 1.8V + .gpu_i2c_info = { + [0] {.port = 4, .address = 0x98}, + [1] {.port = 4, .address = 0x9A}, + [2] {.port = 4, .address = 0x9C}, + }, }; diff --git a/src/occ_405/occ_sys_config.h b/src/occ_405/occ_sys_config.h index e398a6c..f393256 100755 --- a/src/occ_405/occ_sys_config.h +++ b/src/occ_405/occ_sys_config.h @@ -295,6 +295,12 @@ typedef struct uint16_t reserved3; //reserved } mem_throt_config_data_t; +// Per GPU I2C Info +typedef struct +{ + uint8_t port; + uint8_t address; +} gpuI2CInfo_t; // Sys Config Structure @@ -434,7 +440,10 @@ typedef struct uint32_t gpu_sensor_ids[MAX_NUM_GPU_PER_DOMAIN]; uint16_t total_non_gpu_max_pwr_watts; uint16_t total_proc_mem_pwr_drop_watts; - uint8_t psr; // power shifting ratio for power capping between GPU/Proc&mem + uint8_t psr; // power shifting ratio for power capping between GPU/Proc&mem + uint8_t gpu_i2c_engine; // PIB I2CM engine for all GPUs + uint8_t gpu_i2c_bus_voltage; // GPU I2C bus voltage (1 = 0.1V) + gpuI2CInfo_t gpu_i2c_info[MAX_NUM_GPU_PER_DOMAIN]; // per GPU I2C info (port/address) } occSysConfigData_t; __attribute__ ((__aligned__ (128))) |