From 80c29cbbecfc9ca98c6af5f24a1d50a41a09041e Mon Sep 17 00:00:00 2001 From: William Bryan Date: Mon, 9 Oct 2017 13:01:06 -0500 Subject: 405 GPU Power Capping Change-Id: Ieb37ad600463e678ef9b8cf61f3ebbbfaa89e67b Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48127 Tested-by: FSP CI Jenkins Reviewed-by: Martha Broyles Reviewed-by: Christopher J. Cain Reviewed-by: William A. Bryan --- src/common/gpu_structs.h | 84 +++-- src/occ_405/amec/amec_pcap.c | 16 +- src/occ_405/gpu/gpu.c | 833 ++++++++++++++++++++++++++++--------------- src/occ_405/gpu/gpu.h | 50 ++- 4 files changed, 661 insertions(+), 322 deletions(-) diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h index 98567d7..33c79ce 100644 --- a/src/common/gpu_structs.h +++ b/src/common/gpu_structs.h @@ -62,30 +62,72 @@ typedef enum // GPU Request Operations typedef enum { - GPU_REQ_INIT = 0x01, // Init interrupt registers - GPU_REQ_READ_TEMP_START = 0x02, // Start reading GPU information - GPU_REQ_READ_TEMP_FINISH = 0x03, // Read GPU temp register - GPU_REQ_READ_MEM_TEMP_START = 0x04, // Initiate memory temp reading - GPU_REQ_READ_MEM_TEMP_2 = 0x05, // mem temp step 2 - GPU_REQ_READ_MEM_TEMP_3 = 0x06, // mem temp step 3 - GPU_REQ_READ_MEM_TEMP_FINISH = 0x07, // Get memory temp reading - GPU_REQ_READ_CAPS_START = 0x08, // Start reading capabilities - GPU_REQ_READ_CAPS_2 = 0x09, // Capabilities read step 2 - GPU_REQ_READ_CAPS_3 = 0x0A, // Capabilities read step 3 - GPU_REQ_READ_CAPS_FINISH = 0x0B, // get capabilities - GPU_REQ_READ_PWR_LIMIT_START = 0x10, // Start reading GPU power limit - GPU_REQ_READ_PWR_LIMIT_2 = 0x11, - GPU_REQ_READ_PWR_LIMIT_3 = 0x12, - GPU_REQ_READ_PWR_LIMIT_FINISH = 0x13, - GPU_REQ_SET_PWR_LIMIT_START = 0x20, // Start setting GPU power limit - GPU_REQ_SET_PWR_LIMIT_2 = 0x21, - GPU_REQ_SET_PWR_LIMIT_3 = 0x22, - GPU_REQ_SET_PWR_LIMIT_FINISH = 0x23, - GPU_REQ_CHECK_DRIVER_START = 0x31, // Start check driver loaded + // Initialize the GPU state machine and I2C engine + 
GPU_REQ_INIT = 0x01, + + // Read GPU core temperature + GPU_REQ_READ_TEMP_START = 0x02, + GPU_REQ_READ_TEMP_FINISH = 0x03, + + // Read GPU memory temperature + GPU_REQ_READ_MEM_TEMP_START = 0x04, + GPU_REQ_READ_MEM_TEMP_2 = 0x05, + GPU_REQ_READ_MEM_TEMP_3 = 0x06, + GPU_REQ_READ_MEM_TEMP_FINISH = 0x07, + + // Read thermal capabilities + GPU_REQ_READ_CAPS_START = 0x08, + GPU_REQ_READ_CAPS_2 = 0x09, + GPU_REQ_READ_CAPS_3 = 0x0A, + GPU_REQ_READ_CAPS_FINISH = 0x0B, + + // Set GPU power cap + GPU_REQ_SET_PWR_LIMIT_1_START = 0x20, + GPU_REQ_SET_PWR_LIMIT_1_1 = 0x21, + GPU_REQ_SET_PWR_LIMIT_1_2 = 0x22, + GPU_REQ_SET_PWR_LIMIT_1_FINISH = 0x23, + GPU_REQ_SET_PWR_LIMIT_2_START = 0x24, + GPU_REQ_SET_PWR_LIMIT_2_1 = 0x25, + GPU_REQ_SET_PWR_LIMIT_2_2 = 0x26, + GPU_REQ_SET_PWR_LIMIT_2_FINISH = 0x27, + GPU_REQ_SET_PWR_LIMIT_3_START = 0x28, + GPU_REQ_SET_PWR_LIMIT_3_2 = 0x29, + GPU_REQ_SET_PWR_LIMIT_3_3 = 0x2A, + GPU_REQ_SET_PWR_LIMIT_3_FINISH = 0x2B, + GPU_REQ_SET_PWR_LIMIT_4_START = 0x2C, + GPU_REQ_SET_PWR_LIMIT_4_2 = 0x2D, + GPU_REQ_SET_PWR_LIMIT_4_FINISH = 0x2E, + + + // Start check driver loaded + GPU_REQ_CHECK_DRIVER_START = 0x31, GPU_REQ_CHECK_DRIVER_2 = 0x32, GPU_REQ_CHECK_DRIVER_3 = 0x33, GPU_REQ_CHECK_DRIVER_FINISH = 0x34, - GPU_REQ_RESET = 0x60, // Reset + + // Read power limit policy + GPU_REQ_GET_PWR_LIMIT_1_START = 0x40, + GPU_REQ_GET_PWR_LIMIT_1_2 = 0x41, + GPU_REQ_GET_PWR_LIMIT_1_3 = 0x42, + GPU_REQ_GET_PWR_LIMIT_1_FINISH = 0x43, + GPU_REQ_GET_PWR_LIMIT_2_START = 0x44, + GPU_REQ_GET_PWR_LIMIT_2_2 = 0x45, + GPU_REQ_GET_PWR_LIMIT_2_FINISH = 0x46, + GPU_REQ_GET_PWR_LIMIT_3_START = 0x47, + GPU_REQ_GET_PWR_LIMIT_3_2 = 0x48, + GPU_REQ_GET_PWR_LIMIT_3_3 = 0x49, + GPU_REQ_GET_PWR_LIMIT_3_FINISH = 0x4A, + GPU_REQ_GET_PWR_LIMIT_4_START = 0x4B, + GPU_REQ_GET_PWR_LIMIT_4_2 = 0x4C, + GPU_REQ_GET_PWR_LIMIT_4_3 = 0x4D, + GPU_REQ_GET_PWR_LIMIT_4_FINISH = 0x4E, + GPU_REQ_GET_PWR_LIMIT_5_START = 0x4F, + GPU_REQ_GET_PWR_LIMIT_5_2 = 0x50, + GPU_REQ_GET_PWR_LIMIT_5_3 = 0x51, + 
GPU_REQ_GET_PWR_LIMIT_5_FINISH = 0x52, + + // Reset the I2C master and slave + GPU_REQ_RESET = 0x60, } gpu_op_req_e; // GPU arguments diff --git a/src/occ_405/amec/amec_pcap.c b/src/occ_405/amec/amec_pcap.c index 995324d..531c738 100755 --- a/src/occ_405/amec/amec_pcap.c +++ b/src/occ_405/amec/amec_pcap.c @@ -111,6 +111,8 @@ void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t static uint8_t L_psr = 100; // PSR value used in L_active_psr_gpu_total_pcap calculation static bool L_first_run = TRUE; // for calculations done only 1 time + static uint32_t L_last_pcap_traced[MAX_NUM_GPU_PER_DOMAIN] = {0}; + /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ @@ -119,7 +121,6 @@ void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t { // calculate total GPU power cap for oversubscription if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) - { // Take all non-GPU power away from the oversubscription power cap L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; @@ -242,7 +243,7 @@ void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t // system is not in oversubscription use N+1 mode cap l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap; } - + L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? 
l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap; @@ -292,8 +293,15 @@ void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t // check if this is a new power limit if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw) { - TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i, - g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw); + if( (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) || + (L_last_pcap_traced[i] != l_gpu_cap_mw) ) + { + L_last_pcap_traced[i] = l_gpu_cap_mw; + TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i, + g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw); + + } + g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw; if( (g_amec->gpu[i].pcap.gpu_min_cap_required) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) ) diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 0be5c9f..94a8fc0 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -49,7 +49,9 @@ #include "gpu_service_codes.h" #include "i2c.h" -#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) // Number calls with assumption called every other tick +// Number calls with assumption the GPU SM task is called every other tick +#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) +#define GPU_TIMEOUT ( 5000000 / (MICS_PER_TICK *2) ) // Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time #define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds @@ -297,24 +299,52 @@ uint8_t gpu_id_need_power_limits(void) uint8_t gpu_id = 0xFF; // default none uint8_t i = 0; - for (i=0; igpu[i].status.driverLoaded) && - (g_amec->gpu[i].pcap.check_pwr_limit)) + for (i=L_current_gpu_id; igpu[i].pcap.check_pwr_limit = false; - } - else - { - gpu_id = i; - break; - } + // to read power limits requires that the driver is loaded + if( (g_amec->gpu[i].status.driverLoaded) && + (g_amec->gpu[i].pcap.check_pwr_limit)) + { + // If there is no power 
capping support skip reading power limits + if(G_pwr_reading_type == PWR_READING_TYPE_NONE) + { + g_amec->gpu[i].pcap.check_pwr_limit = false; + } + else + { + gpu_id = i; + break; + } + } + } + + if(0xFF == gpu_id) + { + // We don't need to read power limits from any GPUs at the moment + L_current_gpu_id = 0; + } + else if( (MAX_NUM_GPU_PER_DOMAIN - 1) == gpu_id) + { + // We're reading from last GPU, do not check any next time + L_current_gpu_id = 0xFF; + } + else + { + // Next time look at next GPU ID first + L_current_gpu_id = gpu_id + 1; } } + return gpu_id; } @@ -325,17 +355,45 @@ uint8_t gpu_id_need_set_power_limit(void) uint8_t gpu_id = 0xFF; // default none uint8_t i = 0; - for (i=0; igpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) && - (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) && - (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) ) + for (i=L_current_gpu_id; igpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) && + (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) && + (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) ) + { + gpu_id = i; + break; + } + } + + if(0xFF == gpu_id) + { + // no GPU needs checking start back at 0 next time + L_current_gpu_id = 0; + } + else if( (MAX_NUM_GPU_PER_DOMAIN - 1) == gpu_id ) + { + // Last GPU is being set do not check any next time + L_current_gpu_id = 0xFF; + } + else + { + // next time look at the next GPU ID first + L_current_gpu_id = gpu_id + 1; } } + return gpu_id; } @@ -713,17 +771,43 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args) break; // Read GPU Power Limit - case GPU_REQ_READ_PWR_LIMIT_START: - case GPU_REQ_READ_PWR_LIMIT_2: - case GPU_REQ_READ_PWR_LIMIT_3: - case GPU_REQ_READ_PWR_LIMIT_FINISH: + case GPU_REQ_GET_PWR_LIMIT_1_START: + case GPU_REQ_GET_PWR_LIMIT_1_2: + case 
GPU_REQ_GET_PWR_LIMIT_1_3: + case GPU_REQ_GET_PWR_LIMIT_1_FINISH: + case GPU_REQ_GET_PWR_LIMIT_2_START: + case GPU_REQ_GET_PWR_LIMIT_2_2: + case GPU_REQ_GET_PWR_LIMIT_2_FINISH: + case GPU_REQ_GET_PWR_LIMIT_3_START: + case GPU_REQ_GET_PWR_LIMIT_3_2: + case GPU_REQ_GET_PWR_LIMIT_3_3: + case GPU_REQ_GET_PWR_LIMIT_3_FINISH: + case GPU_REQ_GET_PWR_LIMIT_4_START: + case GPU_REQ_GET_PWR_LIMIT_4_2: + case GPU_REQ_GET_PWR_LIMIT_4_3: + case GPU_REQ_GET_PWR_LIMIT_4_FINISH: + case GPU_REQ_GET_PWR_LIMIT_5_START: + case GPU_REQ_GET_PWR_LIMIT_5_2: + case GPU_REQ_GET_PWR_LIMIT_5_3: + case GPU_REQ_GET_PWR_LIMIT_5_FINISH: break; // Set GPU Power Limit - case GPU_REQ_SET_PWR_LIMIT_START: - case GPU_REQ_SET_PWR_LIMIT_2: - case GPU_REQ_SET_PWR_LIMIT_3: - case GPU_REQ_SET_PWR_LIMIT_FINISH: + case GPU_REQ_SET_PWR_LIMIT_1_START: + case GPU_REQ_SET_PWR_LIMIT_1_1: + case GPU_REQ_SET_PWR_LIMIT_1_2: + case GPU_REQ_SET_PWR_LIMIT_1_FINISH: + case GPU_REQ_SET_PWR_LIMIT_2_START: + case GPU_REQ_SET_PWR_LIMIT_2_1: + case GPU_REQ_SET_PWR_LIMIT_2_2: + case GPU_REQ_SET_PWR_LIMIT_2_FINISH: + case GPU_REQ_SET_PWR_LIMIT_3_START: + case GPU_REQ_SET_PWR_LIMIT_3_2: + case GPU_REQ_SET_PWR_LIMIT_3_3: + case GPU_REQ_SET_PWR_LIMIT_3_FINISH: + case GPU_REQ_SET_PWR_LIMIT_4_START: + case GPU_REQ_SET_PWR_LIMIT_4_2: + case GPU_REQ_SET_PWR_LIMIT_4_FINISH: break; // I2C reset @@ -1052,7 +1136,7 @@ bool gpu_check_driver_loaded_sm() * @moduleid GPU_MID_GPU_CHECK_DRIVER_LOADED * @reasoncode GPU_FAILURE * @userdata1 GPU ID - * @userdata2 0 + * @userdata2 GPU RC * @userdata4 ERC_GPU_CHECK_DRIVER_LOADED_FAILURE * @devdesc Failure to check GPU driver loaded * @@ -1064,7 +1148,7 @@ bool gpu_check_driver_loaded_sm() NULL, DEFAULT_TRACE_SIZE, G_current_gpu_id, - 0); + G_gpu_op_req_args.gpu_rc); // Callout the GPU if have sensor ID for it if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) @@ -1123,9 +1207,9 @@ bool gpu_check_driver_loaded_sm() if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded) { // 
Driver loaded status changed - INTR_TRAC_IMP("gpu_check_driver_loaded: GPU%d driver loaded changed to %d", - G_current_gpu_id, - l_new_driver_loaded); + GPU_DBG("gpu_check_driver_loaded: GPU%d driver loaded changed to %d", + G_current_gpu_id, + l_new_driver_loaded); if(l_new_driver_loaded) { @@ -1203,149 +1287,254 @@ bool gpu_read_pwr_limit_sm() static uint8_t L_state_failure_count = 0; static gpuReadPwrLimitState_e L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + static uint32_t L_attempts = 0; + + static uint32_t L_last_min[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint32_t L_last_max[MAX_NUM_GPU_PER_DOMAIN] = {0}; if (async_request_is_idle(&G_gpu_op_request.request)) { - // If not starting a new read then need to check status of current state before moving on - // stay in current state if the schedule failed or the state isn't finished/failed - if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) && - (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) - { - // Check if failure was due to driver change - if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) - { - handle_driver_change(); - // Request can't be processed by GPU at this time so we are done with this GPU - // setup to start new request - L_state_failure_count = 0; - L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change - L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; - return TRUE; // Done with this GPU, let GPU SM move to next - } + // If not starting a new read then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Repeat step 2 until it succeeds. More details on this in GPE code. 
+ if( (L_read_pwr_limit_state == GPU_STATE_READ_PWR_LIMIT_2_FINISH) && + (GPE_RC_NOT_COMPLETE == G_gpu_op_req_args.error.rc) && + (L_attempts <= GPU_TIMEOUT) ) + { + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_2_START; + L_state_failure_count = 0; + L_attempts++; + } + // Check if failure was due to driver change + else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + L_attempts = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } - // If reached retry count give up on this read - else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) - { - // if GPU is not in reset then INC error count and check if reached threshold - if(g_amec->gpu[G_current_gpu_id].status.notReset) - { - if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT) + // If reached retry count give up on this read + else if( (L_state_failure_count > MAX_GPU_READ_ATTEMPT) || + (L_attempts > GPU_TIMEOUT) ) + { + if(L_attempts > GPU_TIMEOUT) { - INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X", - G_current_gpu_id, - G_gpu_op_req_args.gpu_rc); - // give up trying to read power limits for this GPU // It will be retried if detected that GPU is put in reset and then taken out g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false; L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; - - // log one time error that power limits could not be read - if(!L_error_logged[G_current_gpu_id]) + } + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + 
if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT) { - L_error_logged[G_current_gpu_id] = TRUE; + INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X", + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up trying to read power limits for this GPU + // It will be retried if detected that GPU is put in reset and then taken out + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false; + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; + + // log one time error that power limits could not be read + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_READ_PWR_LIMIT + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 GPU RC + * @userdata4 ERC_GPU_READ_PWR_LIMIT_FAILURE + * @devdesc Failure to read GPU power limits + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT, + GPU_FAILURE, + ERC_GPU_READ_PWR_LIMIT_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + L_state_failure_count = 0; + L_attempts = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached retry count + else + { + // INC failure count and retry current state + L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_read_pwr_limit_state++; + } - // Log error - /* @ - * @errortype - * @moduleid 
GPU_MID_GPU_READ_PWR_LIMIT - * @reasoncode GPU_FAILURE - * @userdata1 GPU ID - * @userdata2 0 - * @userdata4 ERC_GPU_READ_PWR_LIMIT_FAILURE - * @devdesc Failure to read GPU power limits - * - */ - errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT, - GPU_FAILURE, - ERC_GPU_READ_PWR_LIMIT_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - G_current_gpu_id, - 0); + L_scheduled = FALSE; // default nothing scheduled - // Callout the GPU if have sensor ID for it - if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) - { - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_GPU_ID, - G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], - ERRL_CALLOUT_PRIORITY_MED); - } + switch (L_read_pwr_limit_state) + { + // Step 1 + case GPU_STATE_READ_PWR_LIMIT_1_START: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args); + break; - // Commit Error - commitErrl(&l_err); - } // if error not logged - } // if reached error count - } // if notReset + case GPU_STATE_READ_PWR_LIMIT_1_2: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_2, G_new_gpu_req_args); + break; - L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; - L_state_failure_count = 0; - return TRUE; // Done with this GPU, let GPU SM move to next - } // if reached retry count - else - { - // INC failure count and retry current state - L_state_failure_count++; - } - } - else // success on last state go to next state and process it - { - L_state_failure_count = 0; - L_read_pwr_limit_state++; - } + case GPU_STATE_READ_PWR_LIMIT_1_3: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_3, G_new_gpu_req_args); + break; - L_scheduled = FALSE; // default nothing scheduled + case GPU_STATE_READ_PWR_LIMIT_1_FINISH: + L_attempts = 0; + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args); + break; - switch (L_read_pwr_limit_state) - { - case GPU_STATE_READ_PWR_LIMIT_START: - L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_START, 
G_new_gpu_req_args); - break; + // Step 2 + case GPU_STATE_READ_PWR_LIMIT_2_START: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_2_START, G_new_gpu_req_args); + break; - case GPU_STATE_READ_PWR_LIMIT_2: - L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_2, G_new_gpu_req_args); - break; + case GPU_STATE_READ_PWR_LIMIT_2_2: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_2_2, G_new_gpu_req_args); + break; - case GPU_STATE_READ_PWR_LIMIT_3: - L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_3, G_new_gpu_req_args); - break; + case GPU_STATE_READ_PWR_LIMIT_2_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args); + break; - case GPU_STATE_READ_PWR_LIMIT_READ: - L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_FINISH, G_new_gpu_req_args); - break; + // Step 3 + case GPU_STATE_READ_PWR_LIMIT_3_START: + GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap", L_attempts); + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_START, G_new_gpu_req_args); + break; - case GPU_STATE_READ_PWR_LIMIT_COMPLETE: - g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE; - // Update power limits - g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE; - g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; - g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw = (uint32_t) G_gpu_op_req_args.data[1]; - g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw = (uint32_t) G_gpu_op_req_args.data[2]; + case GPU_STATE_READ_PWR_LIMIT_3_2: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_2, G_new_gpu_req_args); + break; - // Done with this GPU ready to move to new one - L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; - L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; - l_complete = TRUE; - break; + case GPU_STATE_READ_PWR_LIMIT_3_3: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_3, G_new_gpu_req_args); + break; - default: - 
INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state); - L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; - l_complete = TRUE; - break; - } // switch L_read_pwr_limit_state + case GPU_STATE_READ_PWR_LIMIT_3_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_FINISH, G_new_gpu_req_args); + break; - if(L_scheduled) - { - GPU_DBG("gpu_read_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", - L_read_pwr_limit_state, GPU_TICK); - } - else if(!l_complete) // if not complete there must have been a failure on the schedule - { - INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state); - } + // Step 4 + case GPU_STATE_READ_PWR_LIMIT_4_START: + G_new_gpu_req_args.data[0] = G_gpu_op_req_args.data[0]; + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_START, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_4_2: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_2, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_4_3: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_3, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_4_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_FINISH, G_new_gpu_req_args); + break; + + // Step 5 + case GPU_STATE_READ_PWR_LIMIT_5_START: + G_new_gpu_req_args.data[0] = G_gpu_op_req_args.data[0]; + G_new_gpu_req_args.data[1] = G_gpu_op_req_args.data[1]; + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_START, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_5_2: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_2, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_5_3: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_3, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_5_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_COMPLETE: + 
g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE; + // Update power limits + g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE; + g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = G_gpu_op_req_args.data[0]; + g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw = G_gpu_op_req_args.data[1]; + g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw = G_gpu_op_req_args.data[2]; + + if( (g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw != L_last_min[G_current_gpu_id]) || + (g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw != L_last_max[G_current_gpu_id]) ) + { + L_last_min[G_current_gpu_id] = g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw; + L_last_max[G_current_gpu_id] = g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw; + TRAC_IMP("gpu_read_pwr_limit: GPU%d min=0x%08XmW max=0x%08XmW", + G_current_gpu_id, + g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw, + g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw); + } + + // Done with this GPU ready to move to new one + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + L_attempts = 0; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state); + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + } // switch L_read_pwr_limit_state + + if(L_scheduled) + { + GPU_DBG("gpu_read_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", + L_read_pwr_limit_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state); + } } // if async_request_is_idle else @@ -1375,155 +1564,235 @@ bool gpu_set_pwr_limit_sm() static uint8_t L_set_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; static gpuSetPwrLimitState_e L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; static bool 
L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + static uint32_t L_attempts = 0; + + static uint32_t L_last_pcap[MAX_NUM_GPU_PER_DOMAIN] = {0}; if (async_request_is_idle(&G_gpu_op_request.request)) { - // If not starting a new set limit then need to check status of current state before moving on - // stay in current state if the schedule failed or the state isn't finished/failed - if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) && - (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) - { - // Check if failure was due to driver change - if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) - { - handle_driver_change(); - // Request can't be processed by GPU at this time so we are done with this GPU - // setup to start new request - L_state_failure_count = 0; - L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change - L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; - return TRUE; // Done with this GPU, let GPU SM move to next - } + // If not starting a new set limit then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Repeat step 4 until it succeeds. More details on this in GPE code. 
+ if( (L_set_pwr_limit_state == GPU_STATE_SET_PWR_LIMIT_4_FINISH) && + (GPE_RC_NOT_COMPLETE == G_gpu_op_req_args.error.rc) && + (L_attempts <= GPU_TIMEOUT) ) + { + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_4_START; + L_state_failure_count = 0; + L_attempts++; + } + // Check if failure was due to driver change + else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + L_attempts = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } + // If reached retry count give up on this read + else if( (L_state_failure_count > MAX_GPU_READ_ATTEMPT) || + (L_attempts > GPU_TIMEOUT) ) + { - // If reached retry count give up on this read - else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) - { - // if GPU is not in reset then INC error count and check if reached threshold - if(g_amec->gpu[G_current_gpu_id].status.notReset) - { - if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT) + if(L_attempts > GPU_TIMEOUT) { - INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X", - G_gpu_op_req_args.data[0], - G_current_gpu_id, - G_gpu_op_req_args.gpu_rc); - // give up trying to set power limit for this GPU // It will be retried if detected that GPU is put in reset and then taken out or driver change g_amec->gpu[G_current_gpu_id].pcap.set_failed = true; L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; - - // log error that power limit could not be set - if(!L_error_logged[G_current_gpu_id]) + } + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + 
if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT) { - L_error_logged[G_current_gpu_id] = TRUE; + INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X", + G_gpu_op_req_args.data[0], + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up trying to set power limit for this GPU + // It will be retried if detected that GPU is put in reset and then taken out or driver change + g_amec->gpu[G_current_gpu_id].pcap.set_failed = true; + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + + // log error that power limit could not be set + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_SET_PWR_LIMIT + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 GPU RC + * @userdata4 ERC_GPU_SET_PWR_LIMIT_FAILURE + * @devdesc Failure to set GPU power limit + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT, + GPU_FAILURE, + ERC_GPU_SET_PWR_LIMIT_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + L_state_failure_count = 0; + L_attempts = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached retry count + else + { + // INC failure count and retry current state + L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_set_pwr_limit_state++; + } - // Log error - /* @ - * @errortype - * @moduleid 
GPU_MID_GPU_SET_PWR_LIMIT - * @reasoncode GPU_FAILURE - * @userdata1 GPU ID - * @userdata2 0 - * @userdata4 ERC_GPU_SET_PWR_LIMIT_FAILURE - * @devdesc Failure to set GPU power limit - * - */ - errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT, - GPU_FAILURE, - ERC_GPU_SET_PWR_LIMIT_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - G_current_gpu_id, - 0); + L_scheduled = FALSE; // default nothing scheduled - // Callout the GPU if have sensor ID for it - if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) - { - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_GPU_ID, - G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], - ERRL_CALLOUT_PRIORITY_MED); - } + switch (L_set_pwr_limit_state) + { + // Step 1 + case GPU_STATE_SET_PWR_LIMIT_1_START: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args); + break; - // Commit Error - commitErrl(&l_err); - } // if error not logged - } // if reached error count - } // if notReset + case GPU_STATE_SET_PWR_LIMIT_1_1: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_1, G_new_gpu_req_args); + break; - L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; - L_state_failure_count = 0; - return TRUE; // Done with this GPU, let GPU SM move to next - } // if reached retry count - else - { - // INC failure count and retry current state - L_state_failure_count++; - } - } - else // success on last state go to next state and process it - { - L_state_failure_count = 0; - L_set_pwr_limit_state++; - } + case GPU_STATE_SET_PWR_LIMIT_1_2: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_2, G_new_gpu_req_args); + break; - L_scheduled = FALSE; // default nothing scheduled + case GPU_STATE_SET_PWR_LIMIT_1_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args); + break; - switch (L_set_pwr_limit_state) - { - case GPU_STATE_SET_PWR_LIMIT_START: - // send the desired GPU power cap to the GPE to send to GPU - G_new_gpu_req_args.data[0] = 
g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw; - L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_START, G_new_gpu_req_args); - break; + // Step 2 + case GPU_STATE_SET_PWR_LIMIT_2_START: + // send the desired GPU power cap to the GPE to send to GPU + GPU_DBG("gpu_set_pwr_limit_sm: setting power limit to %dmW on GPU%d", + g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw, G_current_gpu_id); + G_new_gpu_req_args.data[1] = g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw; + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_START, G_new_gpu_req_args); + break; - case GPU_STATE_SET_PWR_LIMIT_2: - L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2, G_new_gpu_req_args); - break; + case GPU_STATE_SET_PWR_LIMIT_2_1: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_1, G_new_gpu_req_args); + break; - case GPU_STATE_SET_PWR_LIMIT_3: - L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3, G_new_gpu_req_args); - break; + case GPU_STATE_SET_PWR_LIMIT_2_2: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_2, G_new_gpu_req_args); + break; - case GPU_STATE_SET_PWR_LIMIT_READ: - L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_FINISH, G_new_gpu_req_args); - break; + case GPU_STATE_SET_PWR_LIMIT_2_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args); + break; - case GPU_STATE_SET_PWR_LIMIT_COMPLETE: - // Update the requested power limit since it was successfully sent - // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC - // has caluclated a new desired pcap while this one was already in process of being set - g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; + // Step 3 + case GPU_STATE_SET_PWR_LIMIT_3_START: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_START, G_new_gpu_req_args); + break; - // Done with this GPU ready to move to new one - L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; - 
L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; - l_complete = TRUE; - break; + case GPU_STATE_SET_PWR_LIMIT_3_2: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_2, G_new_gpu_req_args); + break; - default: - INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state); - L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; - l_complete = TRUE; - break; - } // switch L_set_pwr_limit_state + case GPU_STATE_SET_PWR_LIMIT_3_3: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_3, G_new_gpu_req_args); + break; - if(L_scheduled) - { - GPU_DBG("gpu_set_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", - L_set_pwr_limit_state, GPU_TICK); - } - else if(!l_complete) // if not complete there must have been a failure on the schedule - { - INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state); - } + case GPU_STATE_SET_PWR_LIMIT_3_FINISH: + L_attempts = 0; + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_FINISH, G_new_gpu_req_args); + break; + + // Step 4 + case GPU_STATE_SET_PWR_LIMIT_4_START: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_4_START, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_4_2: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_4_2, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_4_FINISH: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_4_FINISH, G_new_gpu_req_args); + break; + case GPU_STATE_SET_PWR_LIMIT_COMPLETE: + GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap", L_attempts); + // Update the requested power limit since it was successfully sent + // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC + // has calculated a new desired pcap while this one was already in process of being set + g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; + if(g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw != 
L_last_pcap[G_current_gpu_id]) + { + L_last_pcap[G_current_gpu_id] = g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw; + TRAC_IMP("gpu_set_pwr_limit_sm: successfully set power limit to %dmW on GPU%d", + g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw, G_current_gpu_id); + } + + // Done with this GPU ready to move to new one + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + L_attempts = 0; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state); + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + } // switch L_set_pwr_limit_state + + if(L_scheduled) + { + GPU_DBG("gpu_set_pwr_limit: Scheduled set power cap state 0x%02X at tick %d", + L_set_pwr_limit_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state); + } } // if async_request_is_idle else { - INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state); + INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state); } return l_complete; @@ -1768,7 +2037,7 @@ bool gpu_read_mem_temp_capability_sm() * @moduleid GPU_MID_GPU_READ_MEM_TEMP_CAPABLE * @reasoncode GPU_FAILURE * @userdata1 GPU ID - * @userdata2 0 + * @userdata2 GPU RC * @userdata4 ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE * @devdesc Failure to read memory temp capability * @@ -1780,7 +2049,7 @@ bool gpu_read_mem_temp_capability_sm() NULL, DEFAULT_TRACE_SIZE, G_current_gpu_id, - 0); + G_gpu_op_req_args.gpu_rc); // Callout the GPU if have sensor ID for it if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) @@ -2135,8 +2404,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee { // Check for next state in order of priority -//TODO: Enable when functional 
-#if 0 // 1. Need to set a power limit on a GPU? l_gpu_id = gpu_id_need_set_power_limit(); if(l_gpu_id != 0xFF) @@ -2147,7 +2414,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee l_new_state = TRUE; break; } -#endif // 2. check if Host needs lock if (!check_and_update_i2c_lock(GPU_I2C_ENGINE)) @@ -2193,8 +2459,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee break; } -//TODO: Enable when functional -#if 0 // 5. Need to read power limits? l_gpu_id = gpu_id_need_power_limits(); if(l_gpu_id != 0xFF) @@ -2205,7 +2469,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee l_new_state = TRUE; break; } -#endif // 6. Need to read memory temps? if(i_mem_temp_needed) diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h index a26c9c0..0baf721 100644 --- a/src/occ_405/gpu/gpu.h +++ b/src/occ_405/gpu/gpu.h @@ -43,7 +43,7 @@ typedef enum GPU_STATE_CHECK_MEM_TEMP_CAPABLE = 0x30, // Read memory temperature capability GPU_STATE_CHECK_DRIVER_LOADED = 0x40, // Check if Driver loaded GPU_STATE_READ_PWR_LIMIT = 0x50, // Read Power Limits - GPU_STATE_SET_PWR_LIMIT = 0x60, // Set Power Limit + GPU_STATE_SET_PWR_LIMIT = 0x70, // Set Power Limit GPU_STATE_IDLE = 0xFE, // Ok to schedule new task GPU_STATE_NO_LOCK = 0xFF // Host owns, no communication allowed } gpuState_e; @@ -106,22 +106,48 @@ typedef enum typedef enum { GPU_STATE_READ_PWR_LIMIT_NEW = 0x51, - GPU_STATE_READ_PWR_LIMIT_START = 0x52, - GPU_STATE_READ_PWR_LIMIT_2 = 0x53, - GPU_STATE_READ_PWR_LIMIT_3 = 0x54, - GPU_STATE_READ_PWR_LIMIT_READ = 0x55, - GPU_STATE_READ_PWR_LIMIT_COMPLETE = 0x56, + GPU_STATE_READ_PWR_LIMIT_1_START = 0x52, + GPU_STATE_READ_PWR_LIMIT_1_2 = 0x53, + GPU_STATE_READ_PWR_LIMIT_1_3 = 0x54, + GPU_STATE_READ_PWR_LIMIT_1_FINISH = 0x55, + GPU_STATE_READ_PWR_LIMIT_2_START = 0x56, + GPU_STATE_READ_PWR_LIMIT_2_2 = 0x57, + GPU_STATE_READ_PWR_LIMIT_2_FINISH = 0x58, + GPU_STATE_READ_PWR_LIMIT_3_START = 0x59, + 
GPU_STATE_READ_PWR_LIMIT_3_2 = 0x5A, + GPU_STATE_READ_PWR_LIMIT_3_3 = 0x5B, + GPU_STATE_READ_PWR_LIMIT_3_FINISH = 0x5C, + GPU_STATE_READ_PWR_LIMIT_4_START = 0x5D, + GPU_STATE_READ_PWR_LIMIT_4_2 = 0x5E, + GPU_STATE_READ_PWR_LIMIT_4_3 = 0x5F, + GPU_STATE_READ_PWR_LIMIT_4_FINISH = 0x60, + GPU_STATE_READ_PWR_LIMIT_5_START = 0x61, + GPU_STATE_READ_PWR_LIMIT_5_2 = 0x62, + GPU_STATE_READ_PWR_LIMIT_5_3 = 0x63, + GPU_STATE_READ_PWR_LIMIT_5_FINISH = 0x64, + GPU_STATE_READ_PWR_LIMIT_COMPLETE = 0x65, } gpuReadPwrLimitState_e; // States for setting GPU power limit (gpu_set_pwr_limit_sm) typedef enum { - GPU_STATE_SET_PWR_LIMIT_NEW = 0x61, - GPU_STATE_SET_PWR_LIMIT_START = 0x62, - GPU_STATE_SET_PWR_LIMIT_2 = 0x63, - GPU_STATE_SET_PWR_LIMIT_3 = 0x64, - GPU_STATE_SET_PWR_LIMIT_READ = 0x65, - GPU_STATE_SET_PWR_LIMIT_COMPLETE = 0x66, + GPU_STATE_SET_PWR_LIMIT_NEW = 0x71, + GPU_STATE_SET_PWR_LIMIT_1_START = 0x72, + GPU_STATE_SET_PWR_LIMIT_1_1 = 0x73, + GPU_STATE_SET_PWR_LIMIT_1_2 = 0x74, + GPU_STATE_SET_PWR_LIMIT_1_FINISH = 0x75, + GPU_STATE_SET_PWR_LIMIT_2_START = 0x76, + GPU_STATE_SET_PWR_LIMIT_2_1 = 0x77, + GPU_STATE_SET_PWR_LIMIT_2_2 = 0x78, + GPU_STATE_SET_PWR_LIMIT_2_FINISH = 0x79, + GPU_STATE_SET_PWR_LIMIT_3_START = 0x7A, + GPU_STATE_SET_PWR_LIMIT_3_2 = 0x7B, + GPU_STATE_SET_PWR_LIMIT_3_3 = 0x7C, + GPU_STATE_SET_PWR_LIMIT_3_FINISH = 0x7D, + GPU_STATE_SET_PWR_LIMIT_4_START = 0x7E, + GPU_STATE_SET_PWR_LIMIT_4_2 = 0x7F, + GPU_STATE_SET_PWR_LIMIT_4_FINISH = 0x80, + GPU_STATE_SET_PWR_LIMIT_COMPLETE = 0x81, } gpuSetPwrLimitState_e; // GPU IPC initialization -- cgit v1.2.1