diff options
author | William Bryan <wilbryan@us.ibm.com> | 2017-10-19 15:08:10 -0500 |
---|---|---|
committer | William A. Bryan <wilbryan@us.ibm.com> | 2017-10-19 16:35:39 -0400 |
commit | bacb45ad1cc0da113290f0e169c33e5f0885c171 (patch) | |
tree | 1ce35c18045dcabeb5ecc8c8db9453b26b52816c | |
parent | c07a7207c8b1a2d74cf4cc55120eb8073ee07d96 (diff) | |
download | talos-occ-bacb45ad1cc0da113290f0e169c33e5f0885c171.tar.gz talos-occ-bacb45ad1cc0da113290f0e169c33e5f0885c171.zip |
GPU Timing Measurement Debug Command
Change-Id: I5d37db9ba1aa9dc90b09266da6762121195d2385
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48629
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds.c | 55 | ||||
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds.h | 2 | ||||
-rwxr-xr-x | src/occ_405/gpu/gpu.c | 73 | ||||
-rw-r--r-- | src/occ_405/gpu/gpu.h | 21 |
4 files changed, 148 insertions, 3 deletions
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 1f28f7a..86ee360 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -50,6 +50,8 @@ #include <avsbus.h> #include "wof.h" #include "sensor_main_memory.h" +#include "gpu.h" + extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap; extern bool G_vrm_thermal_monitoring; extern uint32_t G_first_proc_gpu_config; @@ -1416,6 +1418,55 @@ void cmdh_dbug_clear_ame_sensor(const cmdh_fsp_cmd_t * i_cmd_ptr, G_rsp_status = l_rc; } +void cmdh_dump_gpu_timings(void) +{ + extern gpuTimingTable_t G_gpu_tick_times; + int i = 0; + + for( ; i < MAX_NUM_GPU_PER_DOMAIN; i++) + { + TRAC_INFO("=======================================GPU%d===================================================", i); + TRAC_INFO("| Max Avg 1s count 100ms count <100ms count|"); + TRAC_INFO("| Core Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d", + G_gpu_tick_times.coretemp[i].max, + G_gpu_tick_times.coretemp[i].avg, + G_gpu_tick_times.coretemp[i].count_1s, + G_gpu_tick_times.coretemp[i].count_100ms, + G_gpu_tick_times.coretemp[i].count_lt100ms); + TRAC_INFO("| Mem Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d", + G_gpu_tick_times.memtemp[i].max, + G_gpu_tick_times.memtemp[i].avg, + G_gpu_tick_times.memtemp[i].count_1s, + G_gpu_tick_times.memtemp[i].count_100ms, + G_gpu_tick_times.memtemp[i].count_lt100ms); + TRAC_INFO("| Check Driver Loaded %-5d ticks %-5d ticks %-5d %-5d %-5d", + G_gpu_tick_times.checkdriver[i].max, + G_gpu_tick_times.checkdriver[i].avg, + G_gpu_tick_times.checkdriver[i].count_1s, + G_gpu_tick_times.checkdriver[i].count_100ms, + G_gpu_tick_times.checkdriver[i].count_lt100ms); + TRAC_INFO("| Mem Capabilities %-5d ticks %-5d ticks %-5d %-5d %-5d", + G_gpu_tick_times.capabilities[i].max, + G_gpu_tick_times.capabilities[i].avg, + G_gpu_tick_times.capabilities[i].count_1s, + G_gpu_tick_times.capabilities[i].count_100ms, + G_gpu_tick_times.capabilities[i].count_lt100ms); + TRAC_INFO("| Read Power Policy %-5d ticks %-5d ticks %-5d %-5d %-5d", + G_gpu_tick_times.getpcap[i].max, + G_gpu_tick_times.getpcap[i].avg, + G_gpu_tick_times.getpcap[i].count_1s, + G_gpu_tick_times.getpcap[i].count_100ms, + G_gpu_tick_times.getpcap[i].count_lt100ms); + TRAC_INFO("| Set Power Cap %-5d ticks %-5d ticks %-5d %-5d %-5d", + G_gpu_tick_times.setpcap[i].max, + G_gpu_tick_times.setpcap[i].avg, + G_gpu_tick_times.setpcap[i].count_1s, + G_gpu_tick_times.setpcap[i].count_100ms, + G_gpu_tick_times.setpcap[i].count_lt100ms); + TRAC_INFO("==============================================================================================", i); + } +} + // Function Specification // // Name: dbug_parse_cmd @@ -1458,6 +1509,10 @@ void cmdh_dbug_cmd (const cmdh_fsp_cmd_t * i_cmd_ptr, // Act on Debug Sub-Command switch ( l_sub_cmd ) { + case DBUG_DUMP_GPU_TIMINGS: + cmdh_dump_gpu_timings(); + break; + case DBUG_GET_AME_SENSOR: cmdh_dbug_get_ame_sensor(i_cmd_ptr, o_rsp_ptr); break; diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h index 2f3688f..9dda8dc 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.h +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h @@ -378,7 +378,7 @@ typedef enum // free = 0x05 DBUG_SET_PEXE_EVENT = 0x06, DBUG_GET_AME_SENSOR = 0x07, - // free = 0x08, + DBUG_DUMP_GPU_TIMINGS = 0x08, DBUG_PEEK = 0x09, DBUG_POKE = 0x0A, DBUG_DUMP_THEMAL = 0x0B, diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 94a8fc0..9c473b9 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -53,6 +53,9 @@ #define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) #define GPU_TIMEOUT ( 5000000 / (MICS_PER_TICK *2) ) +#define GPU_TICKS_TO_100MS ( 100000 / (MICS_PER_TICK * 2) ) +#define GPU_TICKS_TO_1S ( 1000000 / (MICS_PER_TICK * 2) ) + // Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time #define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds @@ -93,6 +96,33 @@ gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}}; uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed +gpuTimingTable_t G_gpu_tick_times; + +void update_gpu_tick_sensor(gpuTimingSensor_t *sensor, uint32_t ticks) +{ + if(ticks > sensor->max) + { + sensor->max = ticks; + } + + if(ticks > GPU_TICKS_TO_1S) + { + sensor->count_1s++; + } + else if( (ticks > GPU_TICKS_TO_100MS) ) + { + sensor->count_100ms++; + } + else + { + sensor->count_lt100ms++; + } + + sensor->count++; + sensor->accum += ticks; + sensor->avg = sensor->accum / sensor->count; +} + // Find first present non-failed GPU. returns 0xFF if no GPUs present/functional uint8_t get_first_gpu(void) { @@ -1085,6 +1115,10 @@ bool gpu_check_driver_loaded_sm() static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + static uint32_t L_num_ticks = 0; + + L_num_ticks++; + if (async_request_is_idle(&G_gpu_op_request.request)) { // If not starting a new read then need to check status of current state before moving on @@ -1186,6 +1220,7 @@ bool gpu_check_driver_loaded_sm() switch (L_check_driver_state) { case GPU_STATE_CHECK_DRIVER_LOADED_START: + L_num_ticks = 1; L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args); break; @@ -1202,6 +1237,9 @@ bool gpu_check_driver_loaded_sm() break; case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE: + // Update GPU tick timing table + update_gpu_tick_sensor(&G_gpu_tick_times.checkdriver[G_current_gpu_id], L_num_ticks); + // Update driver loaded l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01; if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded) @@ -1292,6 +1330,10 @@ bool gpu_read_pwr_limit_sm() static uint32_t L_last_min[MAX_NUM_GPU_PER_DOMAIN] = {0}; static uint32_t L_last_max[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint32_t L_num_ticks = 0; + + L_num_ticks++; + if (async_request_is_idle(&G_gpu_op_request.request)) { // If not starting a new read then need to check status of current state before moving on @@ -1409,6 +1451,7 @@ bool gpu_read_pwr_limit_sm() { // Step 1 case GPU_STATE_READ_PWR_LIMIT_1_START: + L_num_ticks = 1; L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args); break; @@ -1440,7 +1483,7 @@ bool gpu_read_pwr_limit_sm() // Step 3 case GPU_STATE_READ_PWR_LIMIT_3_START: - GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap", L_attempts); + GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap for GPU%d", L_attempts, G_current_gpu_id); L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_START, G_new_gpu_req_args); break; @@ -1494,6 +1537,8 @@ bool gpu_read_pwr_limit_sm() break; case GPU_STATE_READ_PWR_LIMIT_COMPLETE: + update_gpu_tick_sensor(&G_gpu_tick_times.getpcap[G_current_gpu_id], L_num_ticks); + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE; // Update power limits g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE; @@ -1568,6 +1613,10 @@ bool gpu_set_pwr_limit_sm() static uint32_t L_last_pcap[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint32_t L_num_ticks = 0; + + L_num_ticks++; + if (async_request_is_idle(&G_gpu_op_request.request)) { // If not starting a new set limit then need to check status of current state before moving on @@ -1686,6 +1735,7 @@ bool gpu_set_pwr_limit_sm() { // Step 1 case GPU_STATE_SET_PWR_LIMIT_1_START: + L_num_ticks = 1; L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args); break; @@ -1754,7 +1804,8 @@ bool gpu_set_pwr_limit_sm() break; case GPU_STATE_SET_PWR_LIMIT_COMPLETE: - GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap", L_attempts); + update_gpu_tick_sensor(&G_gpu_tick_times.setpcap[G_current_gpu_id], L_num_ticks); + GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap for GPU%d", L_attempts, G_current_gpu_id); // Update the requested power limit since it was successfully sent // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC // has caluclated a new desired pcap while this one was already in process of being set @@ -1818,6 +1869,10 @@ bool gpu_read_temp_sm() static bool L_trace_success = FALSE; static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp + static uint32_t L_num_ticks = 0; + + L_num_ticks++; + if (async_request_is_idle(&G_gpu_op_request.request)) { // If not starting a new read then need to check status of current state before moving on @@ -1852,6 +1907,7 @@ bool gpu_read_temp_sm() switch (L_read_temp_state) { case GPU_STATE_READ_TEMP_START: + L_num_ticks = 1; L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args); break; @@ -1860,6 +1916,7 @@ bool gpu_read_temp_sm() break; case GPU_STATE_READ_TEMP_COMPLETE: + update_gpu_tick_sensor(&G_gpu_tick_times.coretemp[G_current_gpu_id], L_num_ticks); if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) && (0 != G_gpu_op_req_args.data[0]) ) { @@ -1987,6 +2044,10 @@ bool gpu_read_mem_temp_capability_sm() static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW; static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + static uint32_t L_num_ticks = 0; + + L_num_ticks++; + if (async_request_is_idle(&G_gpu_op_request.request)) { // If not starting a new read then need to check status of current state before moving on @@ -2087,6 +2148,7 @@ bool gpu_read_mem_temp_capability_sm() switch (L_read_cap_state) { case GPU_STATE_READ_MEM_TEMP_CAPABLE_START: + L_num_ticks = 1; L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args); break; @@ -2103,6 +2165,7 @@ bool gpu_read_mem_temp_capability_sm() break; case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE: + update_gpu_tick_sensor(&G_gpu_tick_times.capabilities[G_current_gpu_id], L_num_ticks); // Update capability g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01; @@ -2168,6 +2231,10 @@ bool gpu_read_memory_temp_sm() static uint8_t L_read_failure_count = 0; static gpuReadMemTempState_e L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW; // 1st state for reading temp + static uint32_t L_num_ticks = 0; + + L_num_ticks++; + if (async_request_is_idle(&G_gpu_op_request.request)) { // If not starting a new read then need to check status of current state before moving on @@ -2281,6 +2348,7 @@ bool gpu_read_memory_temp_sm() switch (L_read_temp_state) { case GPU_STATE_READ_MEM_TEMP_START: + L_num_ticks = 1; L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args); break; @@ -2297,6 +2365,7 @@ bool gpu_read_memory_temp_sm() break; case GPU_STATE_READ_MEM_TEMP_COMPLETE: + update_gpu_tick_sensor(&G_gpu_tick_times.memtemp[G_current_gpu_id], L_num_ticks); // Update sensor l_temp = G_gpu_op_req_args.data[0]; sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp); diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h index 0baf721..2ee7043 100644 --- a/src/occ_405/gpu/gpu.h +++ b/src/occ_405/gpu/gpu.h @@ -156,5 +156,26 @@ void gpu_ipc_init(); // GPU state machine void task_gpu_sm(struct task *i_self); +typedef struct gpuTimingSensor +{ + uint32_t max; + uint32_t avg; + uint32_t count_1s; + uint32_t count_100ms; + uint32_t count_lt100ms; + uint64_t accum; + uint64_t count; +} gpuTimingSensor_t; + +// Table for GPU timings +typedef struct gpuTimingTable +{ + gpuTimingSensor_t getpcap[MAX_NUM_GPU_PER_DOMAIN]; + gpuTimingSensor_t setpcap[MAX_NUM_GPU_PER_DOMAIN]; + gpuTimingSensor_t coretemp[MAX_NUM_GPU_PER_DOMAIN]; + gpuTimingSensor_t memtemp[MAX_NUM_GPU_PER_DOMAIN]; + gpuTimingSensor_t capabilities[MAX_NUM_GPU_PER_DOMAIN]; + gpuTimingSensor_t checkdriver[MAX_NUM_GPU_PER_DOMAIN]; +} gpuTimingTable_t; #endif //_GPU_H |