diff options
author | William Bryan <wilbryan@us.ibm.com> | 2018-03-06 08:39:53 -0600 |
---|---|---|
committer | William A. Bryan <wilbryan@us.ibm.com> | 2018-03-07 14:22:49 -0500 |
commit | 81196c350c52e3a36885e531808607872dd21c59 (patch) | |
tree | 6e2bd32b97436220b5b9da836ab533450307c216 | |
parent | 1c7b23cc6b8f1d31a8851d8adc6dfe1c94502136 (diff) | |
download | talos-occ-81196c350c52e3a36885e531808607872dd21c59.tar.gz talos-occ-81196c350c52e3a36885e531808607872dd21c59.zip |
Try to PCAP GPU again after busy failure
CQ:SW414846
Change-Id: I7a4c42de414529da963c4f23f27b99a855a4b727
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/55109
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r-- | src/common/gpe_err.h | 3 |
-rw-r--r-- | src/common/gpu_structs.h | 11 |
-rwxr-xr-x | src/occ_405/cmdh/cmdh_fsp_cmds.c | 12 |
-rwxr-xr-x | src/occ_405/gpu/gpu.c | 72 |
-rw-r--r-- | src/occ_405/gpu/gpu.h | 10 |
5 files changed, 75 insertions, 33 deletions
diff --git a/src/common/gpe_err.h b/src/common/gpe_err.h index 23d55d2..9dfe1ca 100644 --- a/src/common/gpe_err.h +++ b/src/common/gpe_err.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2017 */ +/* Contributors Listed Below - COPYRIGHT 2011,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -56,5 +56,6 @@ #define GPE_RC_GPU_CMD_NOT_SUPPORTED 0x82 // GPU rejected command with no support #define GPE_RC_GPU_CMD_FAILED 0x83 // An error occurred in the last GPU operation #define GPE_RC_GPU_INIT_FAILED 0x84 // Failed to init GPU +#define GPE_RC_GPU_BUSY 0x85 #endif //_GPE_ERR_H diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h index 33c79ce..e0fffee 100644 --- a/src/common/gpu_structs.h +++ b/src/common/gpu_structs.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016,2017 */ +/* Contributors Listed Below - COPYRIGHT 2016,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -83,12 +83,12 @@ typedef enum // Set GPU power cap GPU_REQ_SET_PWR_LIMIT_1_START = 0x20, - GPU_REQ_SET_PWR_LIMIT_1_1 = 0x21, - GPU_REQ_SET_PWR_LIMIT_1_2 = 0x22, + GPU_REQ_SET_PWR_LIMIT_1_2 = 0x21, + GPU_REQ_SET_PWR_LIMIT_1_3 = 0x22, GPU_REQ_SET_PWR_LIMIT_1_FINISH = 0x23, GPU_REQ_SET_PWR_LIMIT_2_START = 0x24, - GPU_REQ_SET_PWR_LIMIT_2_1 = 0x25, - GPU_REQ_SET_PWR_LIMIT_2_2 = 0x26, + GPU_REQ_SET_PWR_LIMIT_2_2 = 0x25, + GPU_REQ_SET_PWR_LIMIT_2_3 = 0x26, GPU_REQ_SET_PWR_LIMIT_2_FINISH = 0x27, GPU_REQ_SET_PWR_LIMIT_3_START = 0x28, GPU_REQ_SET_PWR_LIMIT_3_2 = 0x29, @@ -98,7 +98,6 @@ typedef enum GPU_REQ_SET_PWR_LIMIT_4_2 = 0x2D, GPU_REQ_SET_PWR_LIMIT_4_FINISH = 0x2E, - // Start check driver loaded GPU_REQ_CHECK_DRIVER_START = 0x31, GPU_REQ_CHECK_DRIVER_2 = 0x32, diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 9f52c0a..6253652 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -1551,37 +1551,37 @@ void cmdh_dump_gpu_timings(void) { TRAC_INFO("=======================================GPU%d===================================================", i); TRAC_INFO("| Max Avg 1s count 100ms count <100ms count|"); - TRAC_INFO("| Core Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d", + TRAC_INFO("| Core Temperatures %-5d msecs %-5d msecs %-5d %-5d %-5d", G_gpu_tick_times.coretemp[i].max, G_gpu_tick_times.coretemp[i].avg, G_gpu_tick_times.coretemp[i].count_1s, G_gpu_tick_times.coretemp[i].count_100ms, G_gpu_tick_times.coretemp[i].count_lt100ms); - TRAC_INFO("| Mem Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d", + TRAC_INFO("| Mem Temperatures %-5d msecs %-5d msecs %-5d %-5d %-5d", G_gpu_tick_times.memtemp[i].max, G_gpu_tick_times.memtemp[i].avg, G_gpu_tick_times.memtemp[i].count_1s, G_gpu_tick_times.memtemp[i].count_100ms, G_gpu_tick_times.memtemp[i].count_lt100ms); - TRAC_INFO("| Check Driver Loaded %-5d ticks %-5d ticks %-5d %-5d %-5d", + TRAC_INFO("| Check Driver Loaded %-5d msecs %-5d 
msecs %-5d %-5d %-5d", G_gpu_tick_times.checkdriver[i].max, G_gpu_tick_times.checkdriver[i].avg, G_gpu_tick_times.checkdriver[i].count_1s, G_gpu_tick_times.checkdriver[i].count_100ms, G_gpu_tick_times.checkdriver[i].count_lt100ms); - TRAC_INFO("| Mem Capabilities %-5d ticks %-5d ticks %-5d %-5d %-5d", + TRAC_INFO("| Mem Capabilities %-5d msecs %-5d msecs %-5d %-5d %-5d", G_gpu_tick_times.capabilities[i].max, G_gpu_tick_times.capabilities[i].avg, G_gpu_tick_times.capabilities[i].count_1s, G_gpu_tick_times.capabilities[i].count_100ms, G_gpu_tick_times.capabilities[i].count_lt100ms); - TRAC_INFO("| Read Power Policy %-5d ticks %-5d ticks %-5d %-5d %-5d", + TRAC_INFO("| Read Power Policy %-5d msecs %-5d msecs %-5d %-5d %-5d", G_gpu_tick_times.getpcap[i].max, G_gpu_tick_times.getpcap[i].avg, G_gpu_tick_times.getpcap[i].count_1s, G_gpu_tick_times.getpcap[i].count_100ms, G_gpu_tick_times.getpcap[i].count_lt100ms); - TRAC_INFO("| Set Power Cap %-5d ticks %-5d ticks %-5d %-5d %-5d", + TRAC_INFO("| Set Power Cap %-5d msecs %-5d msecs %-5d %-5d %-5d", G_gpu_tick_times.setpcap[i].max, G_gpu_tick_times.setpcap[i].avg, G_gpu_tick_times.setpcap[i].count_1s, diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 6e01d67..171a94e 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2017 */ +/* Contributors Listed Below - COPYRIGHT 2011,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -828,12 +828,12 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args) // Set GPU Power Limit case GPU_REQ_SET_PWR_LIMIT_1_START: - case GPU_REQ_SET_PWR_LIMIT_1_1: case GPU_REQ_SET_PWR_LIMIT_1_2: + case GPU_REQ_SET_PWR_LIMIT_1_3: case GPU_REQ_SET_PWR_LIMIT_1_FINISH: case GPU_REQ_SET_PWR_LIMIT_2_START: - case GPU_REQ_SET_PWR_LIMIT_2_1: case GPU_REQ_SET_PWR_LIMIT_2_2: + case GPU_REQ_SET_PWR_LIMIT_2_3: case GPU_REQ_SET_PWR_LIMIT_2_FINISH: case GPU_REQ_SET_PWR_LIMIT_3_START: case GPU_REQ_SET_PWR_LIMIT_3_2: @@ -1336,6 +1336,8 @@ bool gpu_read_pwr_limit_sm() static uint32_t L_num_ticks = 0; + static bool L_retry_necessary = FALSE; + L_num_ticks++; if (async_request_is_idle(&G_gpu_op_request.request)) @@ -1354,6 +1356,12 @@ bool gpu_read_pwr_limit_sm() L_state_failure_count = 0; L_attempts++; } + else if( (L_read_pwr_limit_state == GPU_STATE_READ_PWR_LIMIT_1_3) && + (GPE_RC_GPU_BUSY == G_gpu_op_req_args.error.rc) ) + { + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_1_FINISH; + L_retry_necessary = TRUE; + } // Check if failure was due to driver change else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) { @@ -1364,6 +1372,7 @@ bool gpu_read_pwr_limit_sm() L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; L_attempts = 0; + L_retry_necessary = FALSE; return TRUE; // Done with this GPU, let GPU SM move to next } @@ -1446,7 +1455,17 @@ bool gpu_read_pwr_limit_sm() else // success on last state go to next state and process it { L_state_failure_count = 0; - L_read_pwr_limit_state++; + if( (GPU_STATE_READ_PWR_LIMIT_1_FINISH == L_read_pwr_limit_state) && + (L_retry_necessary) ) + { + // Let SM move on + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + return TRUE; + } + else + { + L_read_pwr_limit_state++; + } } L_scheduled = FALSE; // default nothing scheduled @@ -1455,7 +1474,8 @@ bool 
gpu_read_pwr_limit_sm() { // Step 1 case GPU_STATE_READ_PWR_LIMIT_1_START: - L_num_ticks = 1; + if(!L_retry_necessary) L_num_ticks = 1; + L_retry_necessary = FALSE; L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args); break; @@ -1619,6 +1639,8 @@ bool gpu_set_pwr_limit_sm() static uint32_t L_num_ticks = 0; + static bool L_retry_necessary = FALSE; + L_num_ticks++; if (async_request_is_idle(&G_gpu_op_request.request)) @@ -1637,6 +1659,12 @@ bool gpu_set_pwr_limit_sm() L_state_failure_count = 0; L_attempts++; } + else if( (L_set_pwr_limit_state == GPU_STATE_SET_PWR_LIMIT_3_3) && + (GPE_RC_GPU_BUSY == G_gpu_op_req_args.error.rc) ) + { + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_3_FINISH; + L_retry_necessary = TRUE; + } // Check if failure was due to driver change else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) { @@ -1647,6 +1675,7 @@ bool gpu_set_pwr_limit_sm() L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; L_attempts = 0; + L_retry_necessary = FALSE; return TRUE; // Done with this GPU, let GPU SM move to next } // If reached retry count give up on this read @@ -1660,6 +1689,8 @@ bool gpu_set_pwr_limit_sm() // It will be retried if detected that GPU is put in reset and then taken out or driver change g_amec->gpu[G_current_gpu_id].pcap.set_failed = true; L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + INTR_TRAC_ERR("gpu_set_pwr_limit: Timedout setting power limit %d for GPU%d [attempts:%d][state_fail:%d]", + G_gpu_op_req_args.data[0], G_current_gpu_id, L_attempts, L_state_failure_count); } // if GPU is not in reset then INC error count and check if reached threshold if(g_amec->gpu[G_current_gpu_id].status.notReset) @@ -1730,7 +1761,17 @@ bool gpu_set_pwr_limit_sm() else // success on last state go to next state and process it { L_state_failure_count = 0; - L_set_pwr_limit_state++; + if( 
(GPU_STATE_SET_PWR_LIMIT_4_FINISH == L_set_pwr_limit_state ) && + (L_retry_necessary) ) + { + // Let SM move to next + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + return TRUE; + } + else + { + L_set_pwr_limit_state++; + } } L_scheduled = FALSE; // default nothing scheduled @@ -1739,18 +1780,19 @@ bool gpu_set_pwr_limit_sm() { // Step 1 case GPU_STATE_SET_PWR_LIMIT_1_START: - L_num_ticks = 1; + if(!L_retry_necessary) L_num_ticks = 1; + L_retry_necessary = FALSE; L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args); break; - case GPU_STATE_SET_PWR_LIMIT_1_1: - L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_1, G_new_gpu_req_args); - break; - case GPU_STATE_SET_PWR_LIMIT_1_2: L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_2, G_new_gpu_req_args); break; + case GPU_STATE_SET_PWR_LIMIT_1_3: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_3, G_new_gpu_req_args); + break; + case GPU_STATE_SET_PWR_LIMIT_1_FINISH: L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args); break; @@ -1764,14 +1806,14 @@ bool gpu_set_pwr_limit_sm() L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_START, G_new_gpu_req_args); break; - case GPU_STATE_SET_PWR_LIMIT_2_1: - L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_1, G_new_gpu_req_args); - break; - case GPU_STATE_SET_PWR_LIMIT_2_2: L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_2, G_new_gpu_req_args); break; + case GPU_STATE_SET_PWR_LIMIT_2_3: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_3, G_new_gpu_req_args); + break; + case GPU_STATE_SET_PWR_LIMIT_2_FINISH: L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args); break; diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h index 2ee7043..144f8b5 100644 --- a/src/occ_405/gpu/gpu.h +++ b/src/occ_405/gpu/gpu.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2017 */ +/* 
Contributors Listed Below - COPYRIGHT 2011,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -133,12 +133,12 @@ typedef enum { GPU_STATE_SET_PWR_LIMIT_NEW = 0x71, GPU_STATE_SET_PWR_LIMIT_1_START = 0x72, - GPU_STATE_SET_PWR_LIMIT_1_1 = 0x73, - GPU_STATE_SET_PWR_LIMIT_1_2 = 0x74, + GPU_STATE_SET_PWR_LIMIT_1_2 = 0x73, + GPU_STATE_SET_PWR_LIMIT_1_3 = 0x74, GPU_STATE_SET_PWR_LIMIT_1_FINISH = 0x75, GPU_STATE_SET_PWR_LIMIT_2_START = 0x76, - GPU_STATE_SET_PWR_LIMIT_2_1 = 0x77, - GPU_STATE_SET_PWR_LIMIT_2_2 = 0x78, + GPU_STATE_SET_PWR_LIMIT_2_2 = 0x77, + GPU_STATE_SET_PWR_LIMIT_2_3 = 0x78, GPU_STATE_SET_PWR_LIMIT_2_FINISH = 0x79, GPU_STATE_SET_PWR_LIMIT_3_START = 0x7A, GPU_STATE_SET_PWR_LIMIT_3_2 = 0x7B, |