summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author William Bryan <wilbryan@us.ibm.com> 2018-03-06 08:39:53 -0600
committer William A. Bryan <wilbryan@us.ibm.com> 2018-03-07 14:22:49 -0500
commit 81196c350c52e3a36885e531808607872dd21c59 (patch)
tree 6e2bd32b97436220b5b9da836ab533450307c216
parent 1c7b23cc6b8f1d31a8851d8adc6dfe1c94502136 (diff)
download talos-occ-81196c350c52e3a36885e531808607872dd21c59.tar.gz
talos-occ-81196c350c52e3a36885e531808607872dd21c59.zip
Try to PCAP GPU again after busy failure
CQ: SW414846
Change-Id: I7a4c42de414529da963c4f23f27b99a855a4b727
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/55109
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r-- src/common/gpe_err.h 3
-rw-r--r-- src/common/gpu_structs.h 11
-rwxr-xr-x src/occ_405/cmdh/cmdh_fsp_cmds.c 12
-rwxr-xr-x src/occ_405/gpu/gpu.c 72
-rw-r--r-- src/occ_405/gpu/gpu.h 10
5 files changed, 75 insertions, 33 deletions
diff --git a/src/common/gpe_err.h b/src/common/gpe_err.h
index 23d55d2..9dfe1ca 100644
--- a/src/common/gpe_err.h
+++ b/src/common/gpe_err.h
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2017 */
+/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -56,5 +56,6 @@
#define GPE_RC_GPU_CMD_NOT_SUPPORTED 0x82 // GPU rejected command with no support
#define GPE_RC_GPU_CMD_FAILED 0x83 // An error occurred in the last GPU operation
#define GPE_RC_GPU_INIT_FAILED 0x84 // Failed to init GPU
+#define GPE_RC_GPU_BUSY 0x85
#endif //_GPE_ERR_H
diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h
index 33c79ce..e0fffee 100644
--- a/src/common/gpu_structs.h
+++ b/src/common/gpu_structs.h
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2016,2017 */
+/* Contributors Listed Below - COPYRIGHT 2016,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -83,12 +83,12 @@ typedef enum
// Set GPU power cap
GPU_REQ_SET_PWR_LIMIT_1_START = 0x20,
- GPU_REQ_SET_PWR_LIMIT_1_1 = 0x21,
- GPU_REQ_SET_PWR_LIMIT_1_2 = 0x22,
+ GPU_REQ_SET_PWR_LIMIT_1_2 = 0x21,
+ GPU_REQ_SET_PWR_LIMIT_1_3 = 0x22,
GPU_REQ_SET_PWR_LIMIT_1_FINISH = 0x23,
GPU_REQ_SET_PWR_LIMIT_2_START = 0x24,
- GPU_REQ_SET_PWR_LIMIT_2_1 = 0x25,
- GPU_REQ_SET_PWR_LIMIT_2_2 = 0x26,
+ GPU_REQ_SET_PWR_LIMIT_2_2 = 0x25,
+ GPU_REQ_SET_PWR_LIMIT_2_3 = 0x26,
GPU_REQ_SET_PWR_LIMIT_2_FINISH = 0x27,
GPU_REQ_SET_PWR_LIMIT_3_START = 0x28,
GPU_REQ_SET_PWR_LIMIT_3_2 = 0x29,
@@ -98,7 +98,6 @@ typedef enum
GPU_REQ_SET_PWR_LIMIT_4_2 = 0x2D,
GPU_REQ_SET_PWR_LIMIT_4_FINISH = 0x2E,
-
// Start check driver loaded
GPU_REQ_CHECK_DRIVER_START = 0x31,
GPU_REQ_CHECK_DRIVER_2 = 0x32,
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 9f52c0a..6253652 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -1551,37 +1551,37 @@ void cmdh_dump_gpu_timings(void)
{
TRAC_INFO("=======================================GPU%d===================================================", i);
TRAC_INFO("| Max Avg 1s count 100ms count <100ms count|");
- TRAC_INFO("| Core Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ TRAC_INFO("| Core Temperatures %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.coretemp[i].max,
G_gpu_tick_times.coretemp[i].avg,
G_gpu_tick_times.coretemp[i].count_1s,
G_gpu_tick_times.coretemp[i].count_100ms,
G_gpu_tick_times.coretemp[i].count_lt100ms);
- TRAC_INFO("| Mem Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ TRAC_INFO("| Mem Temperatures %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.memtemp[i].max,
G_gpu_tick_times.memtemp[i].avg,
G_gpu_tick_times.memtemp[i].count_1s,
G_gpu_tick_times.memtemp[i].count_100ms,
G_gpu_tick_times.memtemp[i].count_lt100ms);
- TRAC_INFO("| Check Driver Loaded %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ TRAC_INFO("| Check Driver Loaded %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.checkdriver[i].max,
G_gpu_tick_times.checkdriver[i].avg,
G_gpu_tick_times.checkdriver[i].count_1s,
G_gpu_tick_times.checkdriver[i].count_100ms,
G_gpu_tick_times.checkdriver[i].count_lt100ms);
- TRAC_INFO("| Mem Capabilities %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ TRAC_INFO("| Mem Capabilities %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.capabilities[i].max,
G_gpu_tick_times.capabilities[i].avg,
G_gpu_tick_times.capabilities[i].count_1s,
G_gpu_tick_times.capabilities[i].count_100ms,
G_gpu_tick_times.capabilities[i].count_lt100ms);
- TRAC_INFO("| Read Power Policy %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ TRAC_INFO("| Read Power Policy %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.getpcap[i].max,
G_gpu_tick_times.getpcap[i].avg,
G_gpu_tick_times.getpcap[i].count_1s,
G_gpu_tick_times.getpcap[i].count_100ms,
G_gpu_tick_times.getpcap[i].count_lt100ms);
- TRAC_INFO("| Set Power Cap %-5d ticks %-5d ticks %-5d %-5d %-5d",
+ TRAC_INFO("| Set Power Cap %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.setpcap[i].max,
G_gpu_tick_times.setpcap[i].avg,
G_gpu_tick_times.setpcap[i].count_1s,
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 6e01d67..171a94e 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2017 */
+/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -828,12 +828,12 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
// Set GPU Power Limit
case GPU_REQ_SET_PWR_LIMIT_1_START:
- case GPU_REQ_SET_PWR_LIMIT_1_1:
case GPU_REQ_SET_PWR_LIMIT_1_2:
+ case GPU_REQ_SET_PWR_LIMIT_1_3:
case GPU_REQ_SET_PWR_LIMIT_1_FINISH:
case GPU_REQ_SET_PWR_LIMIT_2_START:
- case GPU_REQ_SET_PWR_LIMIT_2_1:
case GPU_REQ_SET_PWR_LIMIT_2_2:
+ case GPU_REQ_SET_PWR_LIMIT_2_3:
case GPU_REQ_SET_PWR_LIMIT_2_FINISH:
case GPU_REQ_SET_PWR_LIMIT_3_START:
case GPU_REQ_SET_PWR_LIMIT_3_2:
@@ -1336,6 +1336,8 @@ bool gpu_read_pwr_limit_sm()
static uint32_t L_num_ticks = 0;
+ static bool L_retry_necessary = FALSE;
+
L_num_ticks++;
if (async_request_is_idle(&G_gpu_op_request.request))
@@ -1354,6 +1356,12 @@ bool gpu_read_pwr_limit_sm()
L_state_failure_count = 0;
L_attempts++;
}
+ else if( (L_read_pwr_limit_state == GPU_STATE_READ_PWR_LIMIT_1_3) &&
+ (GPE_RC_GPU_BUSY == G_gpu_op_req_args.error.rc) )
+ {
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_1_FINISH;
+ L_retry_necessary = TRUE;
+ }
// Check if failure was due to driver change
else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
@@ -1364,6 +1372,7 @@ bool gpu_read_pwr_limit_sm()
L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
L_attempts = 0;
+ L_retry_necessary = FALSE;
return TRUE; // Done with this GPU, let GPU SM move to next
}
@@ -1446,7 +1455,17 @@ bool gpu_read_pwr_limit_sm()
else // success on last state go to next state and process it
{
L_state_failure_count = 0;
- L_read_pwr_limit_state++;
+ if( (GPU_STATE_READ_PWR_LIMIT_1_FINISH == L_read_pwr_limit_state) &&
+ (L_retry_necessary) )
+ {
+ // Let SM move on
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ return TRUE;
+ }
+ else
+ {
+ L_read_pwr_limit_state++;
+ }
}
L_scheduled = FALSE; // default nothing scheduled
@@ -1455,7 +1474,8 @@ bool gpu_read_pwr_limit_sm()
{
// Step 1
case GPU_STATE_READ_PWR_LIMIT_1_START:
- L_num_ticks = 1;
+ if(!L_retry_necessary) L_num_ticks = 1;
+ L_retry_necessary = FALSE;
L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;
@@ -1619,6 +1639,8 @@ bool gpu_set_pwr_limit_sm()
static uint32_t L_num_ticks = 0;
+ static bool L_retry_necessary = FALSE;
+
L_num_ticks++;
if (async_request_is_idle(&G_gpu_op_request.request))
@@ -1637,6 +1659,12 @@ bool gpu_set_pwr_limit_sm()
L_state_failure_count = 0;
L_attempts++;
}
+ else if( (L_set_pwr_limit_state == GPU_STATE_SET_PWR_LIMIT_3_3) &&
+ (GPE_RC_GPU_BUSY == G_gpu_op_req_args.error.rc) )
+ {
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_3_FINISH;
+ L_retry_necessary = TRUE;
+ }
// Check if failure was due to driver change
else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
@@ -1647,6 +1675,7 @@ bool gpu_set_pwr_limit_sm()
L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
L_attempts = 0;
+ L_retry_necessary = FALSE;
return TRUE; // Done with this GPU, let GPU SM move to next
}
// If reached retry count give up on setting the power limit
@@ -1660,6 +1689,8 @@ bool gpu_set_pwr_limit_sm()
// It will be retried if detected that GPU is put in reset and then taken out or driver change
g_amec->gpu[G_current_gpu_id].pcap.set_failed = true;
L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
+ INTR_TRAC_ERR("gpu_set_pwr_limit: Timedout setting power limit %d for GPU%d [attempts:%d][state_fail:%d]",
+ G_gpu_op_req_args.data[0], G_current_gpu_id, L_attempts, L_state_failure_count);
}
// if GPU is not in reset then INC error count and check if reached threshold
if(g_amec->gpu[G_current_gpu_id].status.notReset)
@@ -1730,7 +1761,17 @@ bool gpu_set_pwr_limit_sm()
else // success on last state go to next state and process it
{
L_state_failure_count = 0;
- L_set_pwr_limit_state++;
+ if( (GPU_STATE_SET_PWR_LIMIT_4_FINISH == L_set_pwr_limit_state ) &&
+ (L_retry_necessary) )
+ {
+ // Let SM move to next
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ return TRUE;
+ }
+ else
+ {
+ L_set_pwr_limit_state++;
+ }
}
L_scheduled = FALSE; // default nothing scheduled
@@ -1739,18 +1780,19 @@ bool gpu_set_pwr_limit_sm()
{
// Step 1
case GPU_STATE_SET_PWR_LIMIT_1_START:
- L_num_ticks = 1;
+ if(!L_retry_necessary) L_num_ticks = 1;
+ L_retry_necessary = FALSE;
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;
- case GPU_STATE_SET_PWR_LIMIT_1_1:
- L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_1, G_new_gpu_req_args);
- break;
-
case GPU_STATE_SET_PWR_LIMIT_1_2:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_2, G_new_gpu_req_args);
break;
+ case GPU_STATE_SET_PWR_LIMIT_1_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_3, G_new_gpu_req_args);
+ break;
+
case GPU_STATE_SET_PWR_LIMIT_1_FINISH:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args);
break;
@@ -1764,14 +1806,14 @@ bool gpu_set_pwr_limit_sm()
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_START, G_new_gpu_req_args);
break;
- case GPU_STATE_SET_PWR_LIMIT_2_1:
- L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_1, G_new_gpu_req_args);
- break;
-
case GPU_STATE_SET_PWR_LIMIT_2_2:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_2, G_new_gpu_req_args);
break;
+ case GPU_STATE_SET_PWR_LIMIT_2_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_3, G_new_gpu_req_args);
+ break;
+
case GPU_STATE_SET_PWR_LIMIT_2_FINISH:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args);
break;
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index 2ee7043..144f8b5 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2017 */
+/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -133,12 +133,12 @@ typedef enum
{
GPU_STATE_SET_PWR_LIMIT_NEW = 0x71,
GPU_STATE_SET_PWR_LIMIT_1_START = 0x72,
- GPU_STATE_SET_PWR_LIMIT_1_1 = 0x73,
- GPU_STATE_SET_PWR_LIMIT_1_2 = 0x74,
+ GPU_STATE_SET_PWR_LIMIT_1_2 = 0x73,
+ GPU_STATE_SET_PWR_LIMIT_1_3 = 0x74,
GPU_STATE_SET_PWR_LIMIT_1_FINISH = 0x75,
GPU_STATE_SET_PWR_LIMIT_2_START = 0x76,
- GPU_STATE_SET_PWR_LIMIT_2_1 = 0x77,
- GPU_STATE_SET_PWR_LIMIT_2_2 = 0x78,
+ GPU_STATE_SET_PWR_LIMIT_2_2 = 0x77,
+ GPU_STATE_SET_PWR_LIMIT_2_3 = 0x78,
GPU_STATE_SET_PWR_LIMIT_2_FINISH = 0x79,
GPU_STATE_SET_PWR_LIMIT_3_START = 0x7A,
GPU_STATE_SET_PWR_LIMIT_3_2 = 0x7B,
OpenPOWER on IntegriCloud