author     William Bryan <wilbryan@us.ibm.com>     2017-10-19 15:08:10 -0500
committer  William A. Bryan <wilbryan@us.ibm.com>  2017-10-19 16:35:39 -0400
commit     bacb45ad1cc0da113290f0e169c33e5f0885c171 (patch)
tree       1ce35c18045dcabeb5ecc8c8db9453b26b52816c
parent     c07a7207c8b1a2d74cf4cc55120eb8073ee07d96 (diff)
GPU Timing Measurement Debug Command
Change-Id: I5d37db9ba1aa9dc90b09266da6762121195d2385
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48629
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rwxr-xr-x  src/occ_405/cmdh/cmdh_fsp_cmds.c  55
-rwxr-xr-x  src/occ_405/cmdh/cmdh_fsp_cmds.h   2
-rwxr-xr-x  src/occ_405/gpu/gpu.c             73
-rw-r--r--  src/occ_405/gpu/gpu.h             21
4 files changed, 148 insertions(+), 3 deletions(-)
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 1f28f7a..86ee360 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -50,6 +50,8 @@
#include <avsbus.h>
#include "wof.h"
#include "sensor_main_memory.h"
+#include "gpu.h"
+
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern bool G_vrm_thermal_monitoring;
extern uint32_t G_first_proc_gpu_config;
@@ -1416,6 +1418,55 @@ void cmdh_dbug_clear_ame_sensor(const cmdh_fsp_cmd_t * i_cmd_ptr,
G_rsp_status = l_rc;
}
+void cmdh_dump_gpu_timings(void)
+{
+ extern gpuTimingTable_t G_gpu_tick_times;
+ int i = 0;
+
+ for( ; i < MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ TRAC_INFO("=======================================GPU%d===================================================", i);
+ TRAC_INFO("| Max Avg 1s count 100ms count <100ms count|");
+ TRAC_INFO("| Core Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ G_gpu_tick_times.coretemp[i].max,
+ G_gpu_tick_times.coretemp[i].avg,
+ G_gpu_tick_times.coretemp[i].count_1s,
+ G_gpu_tick_times.coretemp[i].count_100ms,
+ G_gpu_tick_times.coretemp[i].count_lt100ms);
+ TRAC_INFO("| Mem Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ G_gpu_tick_times.memtemp[i].max,
+ G_gpu_tick_times.memtemp[i].avg,
+ G_gpu_tick_times.memtemp[i].count_1s,
+ G_gpu_tick_times.memtemp[i].count_100ms,
+ G_gpu_tick_times.memtemp[i].count_lt100ms);
+ TRAC_INFO("| Check Driver Loaded %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ G_gpu_tick_times.checkdriver[i].max,
+ G_gpu_tick_times.checkdriver[i].avg,
+ G_gpu_tick_times.checkdriver[i].count_1s,
+ G_gpu_tick_times.checkdriver[i].count_100ms,
+ G_gpu_tick_times.checkdriver[i].count_lt100ms);
+ TRAC_INFO("| Mem Capabilities %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ G_gpu_tick_times.capabilities[i].max,
+ G_gpu_tick_times.capabilities[i].avg,
+ G_gpu_tick_times.capabilities[i].count_1s,
+ G_gpu_tick_times.capabilities[i].count_100ms,
+ G_gpu_tick_times.capabilities[i].count_lt100ms);
+ TRAC_INFO("| Read Power Policy %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ G_gpu_tick_times.getpcap[i].max,
+ G_gpu_tick_times.getpcap[i].avg,
+ G_gpu_tick_times.getpcap[i].count_1s,
+ G_gpu_tick_times.getpcap[i].count_100ms,
+ G_gpu_tick_times.getpcap[i].count_lt100ms);
+ TRAC_INFO("| Set Power Cap %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ G_gpu_tick_times.setpcap[i].max,
+ G_gpu_tick_times.setpcap[i].avg,
+ G_gpu_tick_times.setpcap[i].count_1s,
+ G_gpu_tick_times.setpcap[i].count_100ms,
+ G_gpu_tick_times.setpcap[i].count_lt100ms);
+ TRAC_INFO("==============================================================================================", i);
+ }
+}
+
// Function Specification
//
// Name: dbug_parse_cmd
@@ -1458,6 +1509,10 @@ void cmdh_dbug_cmd (const cmdh_fsp_cmd_t * i_cmd_ptr,
// Act on Debug Sub-Command
switch ( l_sub_cmd )
{
+ case DBUG_DUMP_GPU_TIMINGS:
+ cmdh_dump_gpu_timings();
+ break;
+
case DBUG_GET_AME_SENSOR:
cmdh_dbug_get_ame_sensor(i_cmd_ptr, o_rsp_ptr);
break;
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h
index 2f3688f..9dda8dc 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.h
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h
@@ -378,7 +378,7 @@ typedef enum
// free = 0x05
DBUG_SET_PEXE_EVENT = 0x06,
DBUG_GET_AME_SENSOR = 0x07,
- // free = 0x08,
+ DBUG_DUMP_GPU_TIMINGS = 0x08,
DBUG_PEEK = 0x09,
DBUG_POKE = 0x0A,
DBUG_DUMP_THEMAL = 0x0B,
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 94a8fc0..9c473b9 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -53,6 +53,9 @@
#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) )
#define GPU_TIMEOUT ( 5000000 / (MICS_PER_TICK *2) )
+#define GPU_TICKS_TO_100MS ( 100000 / (MICS_PER_TICK * 2) )
+#define GPU_TICKS_TO_1S ( 1000000 / (MICS_PER_TICK * 2) )
+
// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time
#define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds
@@ -93,6 +96,33 @@ gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};
uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed
+gpuTimingTable_t G_gpu_tick_times;
+
+void update_gpu_tick_sensor(gpuTimingSensor_t *sensor, uint32_t ticks)
+{
+ if(ticks > sensor->max)
+ {
+ sensor->max = ticks;
+ }
+
+ if(ticks > GPU_TICKS_TO_1S)
+ {
+ sensor->count_1s++;
+ }
+ else if(ticks > GPU_TICKS_TO_100MS)
+ {
+ sensor->count_100ms++;
+ }
+ else
+ {
+ sensor->count_lt100ms++;
+ }
+
+ sensor->count++;
+ sensor->accum += ticks;
+ sensor->avg = sensor->accum / sensor->count;
+}
+
// Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
uint8_t get_first_gpu(void)
{
@@ -1085,6 +1115,10 @@ bool gpu_check_driver_loaded_sm()
static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+ static uint32_t L_num_ticks = 0;
+
+ L_num_ticks++;
+
if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
@@ -1186,6 +1220,7 @@ bool gpu_check_driver_loaded_sm()
switch (L_check_driver_state)
{
case GPU_STATE_CHECK_DRIVER_LOADED_START:
+ L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args);
break;
@@ -1202,6 +1237,9 @@ bool gpu_check_driver_loaded_sm()
break;
case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE:
+ // Update GPU tick timing table
+ update_gpu_tick_sensor(&G_gpu_tick_times.checkdriver[G_current_gpu_id], L_num_ticks);
+
// Update driver loaded
l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01;
if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded)
@@ -1292,6 +1330,10 @@ bool gpu_read_pwr_limit_sm()
static uint32_t L_last_min[MAX_NUM_GPU_PER_DOMAIN] = {0};
static uint32_t L_last_max[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint32_t L_num_ticks = 0;
+
+ L_num_ticks++;
+
if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
@@ -1409,6 +1451,7 @@ bool gpu_read_pwr_limit_sm()
{
// Step 1
case GPU_STATE_READ_PWR_LIMIT_1_START:
+ L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;
@@ -1440,7 +1483,7 @@ bool gpu_read_pwr_limit_sm()
// Step 3
case GPU_STATE_READ_PWR_LIMIT_3_START:
- GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap", L_attempts);
+ GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap for GPU%d", L_attempts, G_current_gpu_id);
L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_START, G_new_gpu_req_args);
break;
@@ -1494,6 +1537,8 @@ bool gpu_read_pwr_limit_sm()
break;
case GPU_STATE_READ_PWR_LIMIT_COMPLETE:
+ update_gpu_tick_sensor(&G_gpu_tick_times.getpcap[G_current_gpu_id], L_num_ticks);
+
g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
// Update power limits
g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE;
@@ -1568,6 +1613,10 @@ bool gpu_set_pwr_limit_sm()
static uint32_t L_last_pcap[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint32_t L_num_ticks = 0;
+
+ L_num_ticks++;
+
if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new set limit then need to check status of current state before moving on
@@ -1686,6 +1735,7 @@ bool gpu_set_pwr_limit_sm()
{
// Step 1
case GPU_STATE_SET_PWR_LIMIT_1_START:
+ L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;
@@ -1754,7 +1804,8 @@ bool gpu_set_pwr_limit_sm()
break;
case GPU_STATE_SET_PWR_LIMIT_COMPLETE:
- GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap", L_attempts);
+ update_gpu_tick_sensor(&G_gpu_tick_times.setpcap[G_current_gpu_id], L_num_ticks);
+ GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap for GPU%d", L_attempts, G_current_gpu_id);
// Update the requested power limit since it was successfully sent
// NOTE: want this value to be sent back from the GPE to know what was set in case AMEC
// has calculated a new desired pcap while this one was already in process of being set
@@ -1818,6 +1869,10 @@ bool gpu_read_temp_sm()
static bool L_trace_success = FALSE;
static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp
+ static uint32_t L_num_ticks = 0;
+
+ L_num_ticks++;
+
if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
@@ -1852,6 +1907,7 @@ bool gpu_read_temp_sm()
switch (L_read_temp_state)
{
case GPU_STATE_READ_TEMP_START:
+ L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
break;
@@ -1860,6 +1916,7 @@ bool gpu_read_temp_sm()
break;
case GPU_STATE_READ_TEMP_COMPLETE:
+ update_gpu_tick_sensor(&G_gpu_tick_times.coretemp[G_current_gpu_id], L_num_ticks);
if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
(0 != G_gpu_op_req_args.data[0]) )
{
@@ -1987,6 +2044,10 @@ bool gpu_read_mem_temp_capability_sm()
static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+ static uint32_t L_num_ticks = 0;
+
+ L_num_ticks++;
+
if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
@@ -2087,6 +2148,7 @@ bool gpu_read_mem_temp_capability_sm()
switch (L_read_cap_state)
{
case GPU_STATE_READ_MEM_TEMP_CAPABLE_START:
+ L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args);
break;
@@ -2103,6 +2165,7 @@ bool gpu_read_mem_temp_capability_sm()
break;
case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
+ update_gpu_tick_sensor(&G_gpu_tick_times.capabilities[G_current_gpu_id], L_num_ticks);
// Update capability
g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
@@ -2168,6 +2231,10 @@ bool gpu_read_memory_temp_sm()
static uint8_t L_read_failure_count = 0;
static gpuReadMemTempState_e L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW; // 1st state for reading temp
+ static uint32_t L_num_ticks = 0;
+
+ L_num_ticks++;
+
if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
@@ -2281,6 +2348,7 @@ bool gpu_read_memory_temp_sm()
switch (L_read_temp_state)
{
case GPU_STATE_READ_MEM_TEMP_START:
+ L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
break;
@@ -2297,6 +2365,7 @@ bool gpu_read_memory_temp_sm()
break;
case GPU_STATE_READ_MEM_TEMP_COMPLETE:
+ update_gpu_tick_sensor(&G_gpu_tick_times.memtemp[G_current_gpu_id], L_num_ticks);
// Update sensor
l_temp = G_gpu_op_req_args.data[0];
sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index 0baf721..2ee7043 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -156,5 +156,26 @@ void gpu_ipc_init();
// GPU state machine
void task_gpu_sm(struct task *i_self);
+typedef struct gpuTimingSensor
+{
+ uint32_t max;
+ uint32_t avg;
+ uint32_t count_1s;
+ uint32_t count_100ms;
+ uint32_t count_lt100ms;
+ uint64_t accum;
+ uint64_t count;
+} gpuTimingSensor_t;
+
+// Table for GPU timings
+typedef struct gpuTimingTable
+{
+ gpuTimingSensor_t getpcap[MAX_NUM_GPU_PER_DOMAIN];
+ gpuTimingSensor_t setpcap[MAX_NUM_GPU_PER_DOMAIN];
+ gpuTimingSensor_t coretemp[MAX_NUM_GPU_PER_DOMAIN];
+ gpuTimingSensor_t memtemp[MAX_NUM_GPU_PER_DOMAIN];
+ gpuTimingSensor_t capabilities[MAX_NUM_GPU_PER_DOMAIN];
+ gpuTimingSensor_t checkdriver[MAX_NUM_GPU_PER_DOMAIN];
+} gpuTimingTable_t;
#endif //_GPU_H