GPU Timing Measurement Debug Command

Change-Id: I5d37db9ba1aa9dc90b09266da6762121195d2385 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48629 Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: Martha Broyles <mbroyles@us.ibm.com> Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com> Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com> Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
author: William Bryan <wilbryan@us.ibm.com> 2017-10-19 15:08:10 -0500
committer: William A. Bryan <wilbryan@us.ibm.com> 2017-10-19 16:35:39 -0400
commit: bacb45ad1cc0da113290f0e169c33e5f0885c171 (patch)
tree: 1ce35c18045dcabeb5ecc8c8db9453b26b52816c
parent: c07a7207c8b1a2d74cf4cc55120eb8073ee07d96 (diff)
download: talos-occ-bacb45ad1cc0da113290f0e169c33e5f0885c171.tar.gz
talos-occ-bacb45ad1cc0da113290f0e169c33e5f0885c171.zip
4 files changed, 148 insertions, 3 deletions
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 1f28f7a..86ee360 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -50,6 +50,8 @@
 #include <avsbus.h>
 #include "wof.h"
 #include "sensor_main_memory.h"
+#include "gpu.h"
+
 extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
 extern bool G_vrm_thermal_monitoring;
 extern uint32_t G_first_proc_gpu_config;
@@ -1416,6 +1418,55 @@ void cmdh_dbug_clear_ame_sensor(const cmdh_fsp_cmd_t * i_cmd_ptr,
     G_rsp_status = l_rc;
 }
 
+// Function Specification
+//
+// Name:  cmdh_dump_gpu_timings
+//
+// Description: Traces G_gpu_tick_times: for each GPU index below
+//              MAX_NUM_GPU_PER_DOMAIN, one TRAC_INFO row per timed GPU
+//              operation (max, avg and the three duration-bucket counts).
+//
+// End Function Specification
+void cmdh_dump_gpu_timings(void)
+{
+    // Timing table is defined in gpu.c; it is only read here
+    extern gpuTimingTable_t G_gpu_tick_times;
+    int i = 0;
+
+    for( ; i < MAX_NUM_GPU_PER_DOMAIN; i++)
+    {
+        TRAC_INFO("=======================================GPU%d===================================================", i);
+        TRAC_INFO("|                          Max           Avg           1s count     100ms count  <100ms count|");
+        // One row per operation; each row supplies exactly five values
+        TRAC_INFO("| Core Temperatures        %-5d ticks   %-5d ticks   %-5d        %-5d        %-5d",
+                  G_gpu_tick_times.coretemp[i].max, G_gpu_tick_times.coretemp[i].avg,
+                  G_gpu_tick_times.coretemp[i].count_1s, G_gpu_tick_times.coretemp[i].count_100ms,
+                  G_gpu_tick_times.coretemp[i].count_lt100ms);
+        TRAC_INFO("| Mem  Temperatures        %-5d ticks   %-5d ticks   %-5d        %-5d        %-5d",
+                  G_gpu_tick_times.memtemp[i].max, G_gpu_tick_times.memtemp[i].avg,
+                  G_gpu_tick_times.memtemp[i].count_1s, G_gpu_tick_times.memtemp[i].count_100ms,
+                  G_gpu_tick_times.memtemp[i].count_lt100ms);
+        TRAC_INFO("| Check Driver Loaded      %-5d ticks   %-5d ticks   %-5d        %-5d        %-5d",
+                  G_gpu_tick_times.checkdriver[i].max, G_gpu_tick_times.checkdriver[i].avg,
+                  G_gpu_tick_times.checkdriver[i].count_1s, G_gpu_tick_times.checkdriver[i].count_100ms,
+                  G_gpu_tick_times.checkdriver[i].count_lt100ms);
+        TRAC_INFO("| Mem  Capabilities        %-5d ticks   %-5d ticks   %-5d        %-5d        %-5d",
+                  G_gpu_tick_times.capabilities[i].max, G_gpu_tick_times.capabilities[i].avg,
+                  G_gpu_tick_times.capabilities[i].count_1s, G_gpu_tick_times.capabilities[i].count_100ms,
+                  G_gpu_tick_times.capabilities[i].count_lt100ms);
+        TRAC_INFO("| Read Power Policy        %-5d ticks   %-5d ticks   %-5d        %-5d        %-5d",
+                  G_gpu_tick_times.getpcap[i].max, G_gpu_tick_times.getpcap[i].avg,
+                  G_gpu_tick_times.getpcap[i].count_1s, G_gpu_tick_times.getpcap[i].count_100ms,
+                  G_gpu_tick_times.getpcap[i].count_lt100ms);
+        TRAC_INFO("| Set Power Cap            %-5d ticks   %-5d ticks   %-5d        %-5d        %-5d",
+                  G_gpu_tick_times.setpcap[i].max, G_gpu_tick_times.setpcap[i].avg,
+                  G_gpu_tick_times.setpcap[i].count_1s, G_gpu_tick_times.setpcap[i].count_100ms,
+                  G_gpu_tick_times.setpcap[i].count_lt100ms);
+        // Closing rule has no conversion specifier, so no argument is passed
+        TRAC_INFO("==============================================================================================");
+    }
+}
+
 // Function Specification
 //
 // Name:  dbug_parse_cmd
@@ -1458,6 +1509,10 @@ void cmdh_dbug_cmd (const cmdh_fsp_cmd_t * i_cmd_ptr,
     // Act on Debug Sub-Command
     switch ( l_sub_cmd )
     {
+        case DBUG_DUMP_GPU_TIMINGS:
+            cmdh_dump_gpu_timings();
+            break;
+
         case DBUG_GET_AME_SENSOR:
             cmdh_dbug_get_ame_sensor(i_cmd_ptr, o_rsp_ptr);
             break;
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h
index 2f3688f..9dda8dc 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.h
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h
@@ -378,7 +378,7 @@ typedef enum
     // free = 0x05
     DBUG_SET_PEXE_EVENT     = 0x06,
     DBUG_GET_AME_SENSOR     = 0x07,
-    // free = 0x08,
+    DBUG_DUMP_GPU_TIMINGS   = 0x08,
     DBUG_PEEK               = 0x09,
     DBUG_POKE               = 0x0A,
     DBUG_DUMP_THEMAL        = 0x0B,
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 94a8fc0..9c473b9 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -53,6 +53,9 @@
 #define GPU_TEMP_READ_1S  ( 1000000 / (MICS_PER_TICK * 2) )
 #define GPU_TIMEOUT ( 5000000 / (MICS_PER_TICK *2) )
 
+#define GPU_TICKS_TO_100MS ( 100000 / (MICS_PER_TICK * 2) )  // count_100ms threshold in update_gpu_tick_sensor (assumes MICS_PER_TICK is in us -- confirm)
+#define GPU_TICKS_TO_1S ( 1000000 / (MICS_PER_TICK * 2) )  // count_1s threshold in update_gpu_tick_sensor; same expression as GPU_TEMP_READ_1S
+
 // Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time
 #define GPU_INIT_ERROR_COUNT 300  // approximately 300 seconds
 
@@ -93,6 +96,33 @@ gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};
 
 uint8_t G_current_gpu_id = 0;   // ID 0..2 of GPU currently being processed
 
+gpuTimingTable_t G_gpu_tick_times;
+
+// Record one completed GPU operation that took 'ticks' ticks in 'sensor':
+// refreshes max, bumps exactly one duration bucket and recomputes the average.
+void update_gpu_tick_sensor(gpuTimingSensor_t *sensor, uint32_t ticks)
+{
+    // Running totals; count is incremented before it is used as the divisor
+    sensor->accum += ticks;
+    sensor->count++;
+    sensor->avg = sensor->accum / sensor->count;
+
+    // Duration buckets, checked from shortest to longest
+    if(ticks <= GPU_TICKS_TO_100MS)
+    {
+        sensor->count_lt100ms++;
+    }
+    else if(ticks <= GPU_TICKS_TO_1S)
+    {
+        sensor->count_100ms++;
+    }
+    else
+    {
+        sensor->count_1s++;
+    }
+    sensor->max = (ticks > sensor->max) ? ticks : sensor->max;
+}
+
 // Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
 uint8_t get_first_gpu(void)
 {
@@ -1085,6 +1115,10 @@ bool gpu_check_driver_loaded_sm()
     static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
     static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
 
+    static uint32_t L_num_ticks = 0;
+
+    L_num_ticks++;
+
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
        // If not starting a new read then need to check status of current state before moving on
@@ -1186,6 +1220,7 @@ bool gpu_check_driver_loaded_sm()
        switch (L_check_driver_state)
        {
            case GPU_STATE_CHECK_DRIVER_LOADED_START:
+               L_num_ticks = 1;
                L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args);
                break;
 
@@ -1202,6 +1237,9 @@ bool gpu_check_driver_loaded_sm()
                break;
 
            case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE:
+                // Update GPU tick timing table
+                update_gpu_tick_sensor(&G_gpu_tick_times.checkdriver[G_current_gpu_id], L_num_ticks);
+
                // Update driver loaded
                l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01;
                if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded)
@@ -1292,6 +1330,10 @@ bool gpu_read_pwr_limit_sm()
     static uint32_t L_last_min[MAX_NUM_GPU_PER_DOMAIN] = {0};
     static uint32_t L_last_max[MAX_NUM_GPU_PER_DOMAIN] = {0};
 
+    static uint32_t L_num_ticks = 0;
+
+    L_num_ticks++;
+
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
         // If not starting a new read then need to check status of current state before moving on
@@ -1409,6 +1451,7 @@ bool gpu_read_pwr_limit_sm()
         {
             // Step 1
             case GPU_STATE_READ_PWR_LIMIT_1_START:
+                L_num_ticks = 1;
                 L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args);
                 break;
 
@@ -1440,7 +1483,7 @@ bool gpu_read_pwr_limit_sm()
 
             // Step 3
             case GPU_STATE_READ_PWR_LIMIT_3_START:
-                GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap", L_attempts);
+                GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap for GPU%d", L_attempts, G_current_gpu_id);
                 L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_START, G_new_gpu_req_args);
                 break;
 
@@ -1494,6 +1537,8 @@ bool gpu_read_pwr_limit_sm()
                 break;
 
             case GPU_STATE_READ_PWR_LIMIT_COMPLETE:
+                update_gpu_tick_sensor(&G_gpu_tick_times.getpcap[G_current_gpu_id], L_num_ticks);
+
                 g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
                 // Update power limits
                 g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE;
@@ -1568,6 +1613,10 @@ bool gpu_set_pwr_limit_sm()
 
     static uint32_t L_last_pcap[MAX_NUM_GPU_PER_DOMAIN] = {0};
 
+    static uint32_t L_num_ticks = 0;
+
+    L_num_ticks++;
+
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
         // If not starting a new set limit then need to check status of current state before moving on
@@ -1686,6 +1735,7 @@ bool gpu_set_pwr_limit_sm()
         {
             // Step 1
             case GPU_STATE_SET_PWR_LIMIT_1_START:
+                L_num_ticks = 1;
                 L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args);
                 break;
 
@@ -1754,7 +1804,8 @@ bool gpu_set_pwr_limit_sm()
                 break;
 
             case GPU_STATE_SET_PWR_LIMIT_COMPLETE:
-                GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap", L_attempts);
+                update_gpu_tick_sensor(&G_gpu_tick_times.setpcap[G_current_gpu_id], L_num_ticks);
+                GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap for GPU%d", L_attempts, G_current_gpu_id);
                 // Update the requested power limit since it was successfully sent
                 // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC
                 // has caluclated a new desired pcap while this one was already in process of being set
@@ -1818,6 +1869,10 @@ bool gpu_read_temp_sm()
     static bool L_trace_success = FALSE;
     static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW;  // 1st state for reading temp
 
+    static uint32_t L_num_ticks = 0;
+
+    L_num_ticks++;
+
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
         // If not starting a new read then need to check status of current state before moving on
@@ -1852,6 +1907,7 @@ bool gpu_read_temp_sm()
         switch (L_read_temp_state)
         {
             case GPU_STATE_READ_TEMP_START:
+                L_num_ticks = 1;
                 L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
                 break;
 
@@ -1860,6 +1916,7 @@ bool gpu_read_temp_sm()
                 break;
 
             case GPU_STATE_READ_TEMP_COMPLETE:
+                update_gpu_tick_sensor(&G_gpu_tick_times.coretemp[G_current_gpu_id], L_num_ticks);
                 if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
                     (0 != G_gpu_op_req_args.data[0]) )
                 {
@@ -1987,6 +2044,10 @@ bool gpu_read_mem_temp_capability_sm()
     static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
     static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
 
+    static uint32_t L_num_ticks = 0;
+
+    L_num_ticks++;
+
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
        // If not starting a new read then need to check status of current state before moving on
@@ -2087,6 +2148,7 @@ bool gpu_read_mem_temp_capability_sm()
        switch (L_read_cap_state)
        {
            case GPU_STATE_READ_MEM_TEMP_CAPABLE_START:
+               L_num_ticks = 1;
                L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args);
                break;
 
@@ -2103,6 +2165,7 @@ bool gpu_read_mem_temp_capability_sm()
                break;
 
            case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
+               update_gpu_tick_sensor(&G_gpu_tick_times.capabilities[G_current_gpu_id], L_num_ticks);
                // Update capability
                g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
 
@@ -2168,6 +2231,10 @@ bool gpu_read_memory_temp_sm()
     static uint8_t L_read_failure_count = 0;
     static gpuReadMemTempState_e L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;  // 1st state for reading temp
 
+    static uint32_t L_num_ticks = 0;
+
+    L_num_ticks++;
+
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
        // If not starting a new read then need to check status of current state before moving on
@@ -2281,6 +2348,7 @@ bool gpu_read_memory_temp_sm()
        switch (L_read_temp_state)
        {
            case GPU_STATE_READ_MEM_TEMP_START:
+               L_num_ticks = 1;
                L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
                break;
 
@@ -2297,6 +2365,7 @@ bool gpu_read_memory_temp_sm()
                 break;
 
            case GPU_STATE_READ_MEM_TEMP_COMPLETE:
+               update_gpu_tick_sensor(&G_gpu_tick_times.memtemp[G_current_gpu_id], L_num_ticks);
                // Update sensor
                l_temp = G_gpu_op_req_args.data[0];
                sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index 0baf721..2ee7043 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -156,5 +156,26 @@ void gpu_ipc_init();
 // GPU state machine
 void task_gpu_sm(struct task *i_self);
 
+typedef struct gpuTimingSensor  // Tick statistics for one timed GPU operation on one GPU
+{
+    uint32_t max;            // Largest tick count passed to update_gpu_tick_sensor so far
+    uint32_t avg;            // accum / count (integer division), recomputed on every update
+    uint32_t count_1s;       // Updates whose tick count was > GPU_TICKS_TO_1S
+    uint32_t count_100ms;    // Updates with GPU_TICKS_TO_100MS < ticks <= GPU_TICKS_TO_1S
+    uint32_t count_lt100ms;  // Updates whose tick count was <= GPU_TICKS_TO_100MS
+    uint64_t accum;          // Sum of every tick count recorded
+    uint64_t count;          // Number of updates recorded
+} gpuTimingSensor_t;
+
+// Table for GPU timings: one sensor per timed operation, each array indexed by G_current_gpu_id
+typedef struct gpuTimingTable
+{
+    gpuTimingSensor_t getpcap[MAX_NUM_GPU_PER_DOMAIN];       // updated by gpu_read_pwr_limit_sm
+    gpuTimingSensor_t setpcap[MAX_NUM_GPU_PER_DOMAIN];       // updated by gpu_set_pwr_limit_sm
+    gpuTimingSensor_t coretemp[MAX_NUM_GPU_PER_DOMAIN];      // updated by gpu_read_temp_sm
+    gpuTimingSensor_t memtemp[MAX_NUM_GPU_PER_DOMAIN];       // updated by gpu_read_memory_temp_sm
+    gpuTimingSensor_t capabilities[MAX_NUM_GPU_PER_DOMAIN];  // updated by gpu_read_mem_temp_capability_sm
+    gpuTimingSensor_t checkdriver[MAX_NUM_GPU_PER_DOMAIN];   // updated by gpu_check_driver_loaded_sm
+} gpuTimingTable_t;
 
 #endif //_GPU_H
author	William Bryan <wilbryan@us.ibm.com>	2017-10-19 15:08:10 -0500
committer	William A. Bryan <wilbryan@us.ibm.com>	2017-10-19 16:35:39 -0400
commit	bacb45ad1cc0da113290f0e169c33e5f0885c171 (patch)
tree	1ce35c18045dcabeb5ecc8c8db9453b26b52816c
parent	c07a7207c8b1a2d74cf4cc55120eb8073ee07d96 (diff)
download	talos-occ-bacb45ad1cc0da113290f0e169c33e5f0885c171.tar.gz talos-occ-bacb45ad1cc0da113290f0e169c33e5f0885c171.zip