author     William Bryan <wilbryan@us.ibm.com>       2017-09-28 13:32:29 -0500
committer  William A. Bryan <wilbryan@us.ibm.com>    2017-10-03 16:03:05 -0400
commit     74f721c90235a18821b97782d98349cf51e0f12d (patch)
tree       1f2fd59b41db514c0273632dd2dd7926e25a2030
parent     76b91d0038d59b30de14108e908bc78c6d988796 (diff)
GPU 405 Enable Memory Temperatures
Change-Id: Id50d12a50a05b8b3a6a6f1ce3ce4512d3299caa7
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46882
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r--   src/common/gpe_err.h                               6
-rw-r--r--   src/common/gpu_structs.h                          31
-rw-r--r--   src/include/registers/ocb_firmware_registers.h    20
-rwxr-xr-x   src/occ_405/amec/amec_data.c                       6
-rwxr-xr-x   src/occ_405/amec/amec_pcap.c                      81
-rwxr-xr-x   src/occ_405/amec/amec_sys.h                       38
-rwxr-xr-x   src/occ_405/cmdh/cmdh_fsp_cmds.c                  46
-rwxr-xr-x   src/occ_405/gpu/gpu.c                           1332
-rw-r--r--   src/occ_405/gpu/gpu.h                             40
-rwxr-xr-x   src/occ_405/gpu/gpu_service_codes.h                3
-rw-r--r--   src/occ_405/occ_service_codes.h                   12
11 files changed, 1303 insertions, 312 deletions
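
The headline logic change sits in amec_pcap.c: amec_gpu_pcap() now keeps two precomputed total GPU power-cap budgets (N mode for oversubscription, N+1 mode otherwise), selects one based on the current oversubscription state, clips it to the active pcap/PSR budget, and divides the result across the GPUs. The following is a minimal standalone C sketch of that selection only, with simplified names and types that are not the literal OCC source; see amec_gpu_pcap() in the patch below for the real code.

    // Hedged sketch of the new total-GPU-pcap selection (simplified, hypothetical
    // helper; the real code lives in amec_gpu_pcap() in src/occ_405/amec/amec_pcap.c)
    #include <stdbool.h>
    #include <stdint.h>

    uint16_t select_per_gpu_pcap(bool oversub,
                                 uint16_t n_mode_total,        // budget when a power supply is lost (N)
                                 uint16_t n_plus_1_mode_total, // budget with all supplies present (N+1)
                                 uint16_t active_psr_total,    // budget from the active pcap and PSR
                                 uint8_t  num_gpus)
    {
        // pick the system budget for the current oversubscription state
        uint16_t system_total = oversub ? n_mode_total : n_plus_1_mode_total;

        // total GPU pcap is the lower of the system budget and the active limit
        uint16_t total = (system_total < active_psr_total) ? system_total
                                                           : active_psr_total;

        // divide the total equally across all GPUs (guard against divide by zero)
        return num_gpus ? (uint16_t)(total / num_gpus) : 0;
    }
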
diff --git a/src/common/gpe_err.h b/src/common/gpe_err.h index 8580012..3bb1fa0 100644 --- a/src/common/gpe_err.h +++ b/src/common/gpe_err.h @@ -51,6 +51,8 @@ #define GPE_RC_GET_NEST_DTS_FAILED 0x61 // Failed to collect nest DTS temperatures // GPU Errors -#define GPE_RC_NO_GPU_SUPPORT 0x8F // GPE1 image doesn't support GPUs - +#define GPE_RC_NO_GPU_SUPPORT 0x80 // GPE1 image doesn't support GPUs +#define GPE_RC_GPU_DRIVER_CHANGE 0x81 // GPU in transition or just completed phase change +#define GPE_RC_GPU_CMD_NOT_SUPPORTED 0x82 // GPU rejected command with no support +#define GPE_RC_GPU_CMD_FAILED 0x83 // An error occurred in the last GPU operation #endif //_GPE_ERR_H diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h index 7933adb..ba522e5 100644 --- a/src/common/gpu_structs.h +++ b/src/common/gpu_structs.h @@ -41,6 +41,12 @@ typedef enum { + GPU_CAP_MEM = 0x00, + GPU_CAP_CORE = 0x01 +} GPU_CAPABILITIES; + +typedef enum +{ ID_GPU0 = 0x00, ID_GPU1 = 0x01, ID_GPU2 = 0x02, @@ -64,12 +70,21 @@ typedef enum GPU_REQ_READ_MEM_TEMP_3 = 0x06, // mem temp step 3 GPU_REQ_READ_MEM_TEMP_FINISH = 0x07, // Get memory temp reading GPU_REQ_READ_CAPS_START = 0x08, // Start reading capabilities - GPU_REQ_READ_CAPS_2 = 0x09, // Start reading capabilities - GPU_REQ_READ_CAPS_3 = 0x0A, // Start reading capabilities - GPU_REQ_READ_CAPS_FINISH = 0x0B, - GPU_REQ_READ_PWR_LIMIT_START = 0x0C, // Start reading GPU information - GPU_REQ_READ_PWR_LIMIT_STOP = 0x0D, // Read GPU temp register - GPU_REQ_READ_PWR_LIMIT = 0x0E, // Start reading pwr limit + GPU_REQ_READ_CAPS_2 = 0x09, // Capabilities read step 2 + GPU_REQ_READ_CAPS_3 = 0x0A, // Capabilities read step 3 + GPU_REQ_READ_CAPS_FINISH = 0x0B, // get capabilities + GPU_REQ_READ_PWR_LIMIT_START = 0x10, // Start reading GPU power limit + GPU_REQ_READ_PWR_LIMIT_2 = 0x11, + GPU_REQ_READ_PWR_LIMIT_3 = 0x12, + GPU_REQ_READ_PWR_LIMIT_FINISH = 0x13, + GPU_REQ_SET_PWR_LIMIT_START = 0x20, // Start setting GPU power limit + GPU_REQ_SET_PWR_LIMIT_2 = 0x21, + GPU_REQ_SET_PWR_LIMIT_3 = 0x22, + GPU_REQ_SET_PWR_LIMIT_FINISH = 0x23, + GPU_REQ_CHECK_DRIVER_START = 0x31, // Start check driver loaded + GPU_REQ_CHECK_DRIVER_2 = 0x32, + GPU_REQ_CHECK_DRIVER_3 = 0x33, + GPU_REQ_CHECK_DRIVER_FINISH = 0x34, GPU_REQ_RESET = 0x60, // Reset } gpu_op_req_e; @@ -78,10 +93,10 @@ typedef struct { GpeErrorStruct error; uint8_t gpu_id; + uint8_t gpu_rc; uint8_t operation; - uint64_t data; + uint64_t data[3]; } gpu_sm_args_t; - #endif // _GPU_STRUCTS_H diff --git a/src/include/registers/ocb_firmware_registers.h b/src/include/registers/ocb_firmware_registers.h index 5b6705a..010ad02 100644 --- a/src/include/registers/ocb_firmware_registers.h +++ b/src/include/registers/ocb_firmware_registers.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2015,2016 */ +/* Contributors Listed Below - COPYRIGHT 2015,2017 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -1411,9 +1411,21 @@ typedef union ocb_occflg { uint32_t i2c_engine2_lock_occ : 1; uint32_t i2c_engine3_lock_host : 1; uint32_t i2c_engine3_lock_occ : 1; - uint32_t reserved_occ : 10; -#else - uint32_t reserved_occ : 10; + uint32_t gpu0_reset_status : 1; + uint32_t gpu1_reset_status : 1; + uint32_t gpu2_reset_status : 1; + uint32_t reserved_occ : 3; + uint32_t wof_hcode_mode : 2; + uint32_t active_quad_update : 1; + uint32_t request_occ_safe : 1; +#else + uint32_t request_occ_safe : 1; + uint32_t active_quad_update : 1; + uint32_t wof_hcode_mode : 2; + uint32_t reserved_occ : 3; + uint32_t gpu2_reset_status : 1; + uint32_t gpu1_reset_status : 1; + uint32_t gpu0_reset_status : 1; uint32_t i2c_engine3_lock_occ : 1; uint32_t i2c_engine3_lock_host : 1; uint32_t i2c_engine2_lock_occ : 1; diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c index 3a82bb3..4c553d6 100755 --- a/src/occ_405/amec/amec_data.c +++ b/src/occ_405/amec/amec_data.c @@ -458,12 +458,6 @@ void amec_data_write_pcap(void) g_amec->pcap.ovs_node_pcap = G_sysConfigData.pcap.hard_min_pcap; } - //Oversubscription pcap can NOT be higher than a customer set pcap. - if(g_amec->pcap.ovs_node_pcap > l_customer) - { - g_amec->pcap.ovs_node_pcap = l_customer; - } - //for all new pcap data setting: If KVM, update the OPAL dynamic data if(G_sysConfigData.system_type.kvm) { diff --git a/src/occ_405/amec/amec_pcap.c b/src/occ_405/amec/amec_pcap.c index 7584ddf..995324d 100755 --- a/src/occ_405/amec/amec_pcap.c +++ b/src/occ_405/amec/amec_pcap.c @@ -95,14 +95,16 @@ extern uint32_t G_first_num_gpus_sys; // Thread: Real Time Loop // // End Function Specification -void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) +void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t i_avail_power) { /*------------------------------------------------------------------------*/ /* Local Variables */ /*------------------------------------------------------------------------*/ uint8_t i = 0; uint32_t l_gpu_cap_mw = 0; + uint16_t l_system_gpu_total_pcap = 0; // total GPU pcap required by system based on if currently in oversub or not static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect + static uint16_t L_n_plus_1_mode_gpu_total_pcap = 0; // Total GPU pcap required for N+1 (not in oversubscription) static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU @@ -112,10 +114,12 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ - // If this is the first time running calculate the total GPU power cap for oversubscription + // If this is the first time running calculate the total GPU power cap for system power caps (N and N+1) if(L_first_run) { + // calculate total GPU power cap for oversubscription if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) + { // Take all non-GPU power away from the oversubscription power cap L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; @@ -157,6 +161,50 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) ERRL_CALLOUT_PRIORITY_HIGH); 
commitErrl(&l_err); } + + // calculate total GPU power cap for N+1 (not in oversubscription) + if(G_sysConfigData.pcap.system_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) + { + // Take all non-GPU power away from the N+1 power cap + L_n_plus_1_mode_gpu_total_pcap = G_sysConfigData.pcap.system_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; + // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs + L_n_plus_1_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; + } + else + { + // This should not happen, the total non GPU power should never be higher than the N+1 mode cap + // Log error and set GPUs to minimum power cap + L_n_plus_1_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap + + TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N+1 mode pwr limit %dW", + G_sysConfigData.total_non_gpu_max_pwr_watts, G_sysConfigData.pcap.system_pcap); + + /* @ + * @errortype + * @moduleid AMEC_GPU_PCAP_MID + * @reasoncode GPU_FAILURE + * @userdata1 N+1 mode Power Cap watts + * @userdata2 Total non-GPU power watts + * @userdata4 ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE + * @devdesc Total non-GPU power more than N+1 mode power cap + * + */ + errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, + GPU_FAILURE, + ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_sysConfigData.pcap.system_pcap, + G_sysConfigData.total_non_gpu_max_pwr_watts); + + //Callout firmware + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + commitErrl(&l_err); + } } // if first run // Calculate the total GPU power cap for the current active limit and PSR @@ -180,12 +228,23 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap); } - // Total GPU power cap is the lower of oversubscription and active power limit - // must always account for oversubscription to ensure when a power supply is lost the OCC - // can react fast enough, GPU power capping is too slow and must have GPU power cap already - // set to account for oversubscription case - L_total_gpu_pcap = (L_n_mode_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? - L_n_mode_gpu_total_pcap : L_active_psr_gpu_total_pcap; + // Total GPU power cap is the lower of system (N+1 or oversubscription depending on if in oversub) + // and the active power limit. We do not need to always account for oversubscription since + // the automatic hw power brake will assert to the GPUs if there is a problem when oversub is + // entered from the time OCC can set and GPUs react to a new power limit + if(i_oversubscription) + { + // system in oversubscription use N mode cap + l_system_gpu_total_pcap = L_n_mode_gpu_total_pcap; + } + else + { + // system is not in oversubscription use N+1 mode cap + l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap; + } + + L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? + l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap; // Divide the total equally across all GPUs in the system if(G_first_num_gpus_sys) @@ -282,8 +341,8 @@ void amec_pcap_calc(void) l_oversub_state = AMEC_INTF_GET_OVERSUBSCRIPTION(); // Determine the active power cap. 
norm_node_pcap is set as lowest - // between sys and user in amec_data_write_pcap() - // when in oversub only use oversub pcap if lower than norm_node_pcap + // between sys (N+1 mode) and user in amec_data_write_pcap() + // when in oversub (N mode) only use oversub pcap if lower than norm_node_pcap // to handle user set power cap lower than the oversub power cap if( (TRUE == l_oversub_state) && (g_amec->pcap.ovs_node_pcap < g_amec->pcap.norm_node_pcap) ) @@ -312,7 +371,7 @@ void amec_pcap_calc(void) // Determine GPU power cap if there are GPUs present if(G_first_proc_gpu_config) { - amec_gpu_pcap(l_active_pcap_changed, l_avail_power); + amec_gpu_pcap(l_oversub_state, l_active_pcap_changed, l_avail_power); } if(l_node_pwr != 0) diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index 803ca28..3f1d333 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -436,27 +436,35 @@ typedef struct //------------------------------------------------------------- typedef struct { - bool disabled; // GPU has been marked failed and no longer monitored - bool readOnce; // Comm has been established with GPU - bool overtempError; // Core OT error has been logged against GPU - bool memOvertempError; // Memory OT error has been logged against GPU + bool disabled; // GPU has been marked failed and no longer monitored + bool readOnce; // Comm has been established with GPU + bool commErrorLogged; // GPU has been called out due to comm error + bool overtempError; // Core OT error has been logged against GPU + bool memOvertempError; // Memory OT error has been logged against GPU bool checkDriverLoaded; // Indicates if need to check if driver is loaded - bool driverLoaded; // Indicates if GPU driver is loaded + bool driverLoaded; // Indicates if GPU driver is loaded bool checkMemTempSupport; // Indicates if need to check if mem monitoring is supported - bool memTempSupported; // Indicates if memory temperature monitoring is supported - uint8_t memErrorCount; // count of consecutive GPU mem temp read failures - uint8_t errorCount; // count of consecutive GPU core temp read failures + bool memTempSupported; // Indicates if memory temperature monitoring is supported + bool notReset; // '1' = GPU NOT in reset. Read from OCC FLAGS register + bool coreTempNotAvailable; // for fan control: '1' = core temp not available. (send 0 for fan control) + bool memTempNotAvailable; // for fan control: '1' = Mem temp not available. (send 0 for fan control) + bool coreTempFailure; // for fan control: '1' = timeout failure reading core temp (send 0xFF for fan control) + bool memTempFailure; // for fan control: '1' = timeout failure reading Mem temp (send 0xFF for fan control) + uint8_t memErrorCount; // count of consecutive GPU mem temp read failures when GPU not in reset + uint8_t errorCount; // count of consecutive GPU core temp read failures when GPU not in reset + uint8_t retryCount; // count of consecutive GPU core temp read failures before I2C reset } gpuStatus_t; typedef struct { - bool check_pwr_limit; // Indicates if need to read power limits from GPU - bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max - bool gpu_min_cap_required; // Indicates if power limits were read i.e. 
have min/max - uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU - uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU - uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set + bool check_pwr_limit; // Indicates if need to read power limits from GPU + bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max + bool set_failed; // Indicates if failed to set power limit + bool gpu_min_cap_required; // Indicates if GPU requires min cap + uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU + uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU + uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set uint32_t gpu_requested_pcap_mw; // Requested power cap in mW sent to GPU - uint32_t gpu_actual_pcap_mw; // Actual power cap in mW read back from the GPU + uint32_t gpu_default_pcap_mw; // Default power cap in mW read from the GPU } gpuPcap_t; diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 4606f6d..422dc38 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -348,7 +348,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) const sensor_t *vrfan = getSensorByGsid(VRMPROCOT); if (vrfan != NULL) { - l_tempSensorList[l_sensorHeader.count].id = G_sysConfigData.proc_huid; + l_tempSensorList[l_sensorHeader.count].id = 0; l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM; l_tempSensorList[l_sensorHeader.count].value = vrfan->sample & 0xFF; l_sensorHeader.count++; @@ -358,24 +358,46 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) // Add GPU temperatures for (k=0; k<MAX_NUM_GPU_PER_DOMAIN; k++) { - if(GPU_PRESENT(k)) // temp until GPU sensor IDs are sent make sensor ids "GPU"<gpu#> + if(GPU_PRESENT(k)) { // GPU core temperature - if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp - l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid; - else - l_tempSensorList[l_sensorHeader.count].id = 0xC6 + (9 * G_pbax_id.chip_id) + (k*3); // temp + l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid; l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU; - l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF; + if(g_amec->gpu[k].status.coreTempFailure) + { + // failed to read core temperature return 0xFF + l_tempSensorList[l_sensorHeader.count].value = 0xFF; + } + else if(g_amec->gpu[k].status.coreTempNotAvailable) + { + // core temperature not available return 0 + l_tempSensorList[l_sensorHeader.count].value = 0; + } + else + { + // have a good core temperature return the reading + l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF; + } l_sensorHeader.count++; // GPU memory temperature - if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp - l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid; - else - l_tempSensorList[l_sensorHeader.count].id = 0xC7 + (9 * G_pbax_id.chip_id) + (k*3); // temp + l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid; l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU_MEM; - l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF; + if(g_amec->gpu[k].status.memTempFailure) + { + // failed to read memory temperature return 0xFF + 
l_tempSensorList[l_sensorHeader.count].value = 0xFF; + } + else if(g_amec->gpu[k].status.memTempNotAvailable) + { + // memory temperature not available return 0 + l_tempSensorList[l_sensorHeader.count].value = 0; + } + else + { + // have a good memory temperature return the reading + l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF; + } l_sensorHeader.count++; } } diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 8666e12..1a27565 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -51,18 +51,25 @@ #define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) // Number calls with assumption called every other tick -// Time in seconds to ignore errors from the start of GPU SM -// Right now this time must include PRST and GPU init time -// this may be reduced after adding in OS interlock for PRST -#define GPU_COMM_ESTAB_TIMEOUT_SECONDS 600 +// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time +#define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds -#define MAX_CONSECUTIVE_GPU_RESETS 3 +#define MAX_CONSECUTIVE_GPU_RESETS 5 #define MAX_GPU_RESET_STATE_RETRY 3 #define MAX_RESET_STATE_NOT_DONE_COUNT 100 #define MAX_GPU_READ_ATTEMPT 3 +#define GPU_ERRORS_BEFORE_I2C_RESET 5 + +// consecutive error counts for GPU command failures before error is logged if GPU is not in reset +#define GPU_CHECK_DRIVER_ERROR_COUNT 5 +#define GPU_READ_MEM_CAP_ERROR_COUNT 5 +#define GPU_READ_PWR_LIMIT_ERROR_COUNT 5 +#define GPU_SET_PWR_LIMIT_ERROR_COUNT 5 + #define GPU_I2C_ENGINE PIB_I2C_ENGINE_C extern data_cnfg_t * G_data_cnfg; +extern PWR_READING_TYPE G_pwr_reading_type; // this is the global GPU task sm state each task within the GPU SM may have its own "state" // to allow several calls to complete the task @@ -71,7 +78,6 @@ gpuState_e G_gpu_state = GPU_STATE_IDLE; bool G_gpu_monitoring_allowed = FALSE; // Set to true if GPU is present bool G_gpu_i2c_reset_required = FALSE; uint32_t G_gpu_reset_cause = 0; -uint64_t G_gpu_sm_start_time = 0; // GPE Requests GpeRequest G_gpu_op_request; @@ -82,11 +88,6 @@ GPE_BUFFER(gpu_sm_args_t G_gpu_op_req_args); gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}}; uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed -bool G_gpu_read_issued = false; - -// Read OCC_MISC register to see if an I2C interrupt was generated for -// the specified engine. -bool check_for_i2c_interrupt(const uint8_t i_engine); // Find first present non-failed GPU. 
returns 0xFF if no GPUs present/functional uint8_t get_first_gpu(void) @@ -141,14 +142,47 @@ uint8_t gpu_id_need_driver_check(void) uint8_t gpu_id = 0xFF; // default none needs checking uint8_t i = 0; - for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) + // checking for driver loaded can be repeated until driver is loaded which may never happen + // to avoid infinite loop checking the same GPU over and over we will use a static and each call + // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs + static uint8_t L_current_gpu_id = 0; + + if(L_current_gpu_id == 0xFF) { - if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkDriverLoaded)) - { - gpu_id = i; - break; - } + // checked all GPUs once, do not check any this time and start over with GPU 0 on next call + L_current_gpu_id = 0; } + else + { + for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++) + { + // only check for driver after i2c comm (readOnce) has been established + if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) && + (g_amec->gpu[i].status.readOnce) && (g_amec->gpu[i].status.checkDriverLoaded)) + { + gpu_id = i; + break; + } + } + + // setup L_current_gpu_id for next call based on what is happening this time + if(gpu_id == 0xFF) + { + // no GPU needs checking start back at 0 next time + L_current_gpu_id = 0; + } + else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) ) + { + // last GPU is having driver checked do not check any next time + L_current_gpu_id = 0xFF; + } + else + { + // next time look at the next GPU ID first + L_current_gpu_id = gpu_id + 1; + } + } + return gpu_id; } @@ -157,13 +191,45 @@ uint8_t gpu_id_need_memory_temp_capability_check(void) uint8_t gpu_id = 0xFF; // default none needs checking uint8_t i = 0; - for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) + // checking for memory temp capability will be repeated until memory temp is capable which may never happen + // to avoid infinite loop checking the same GPU over and over we will use a static and each call + // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs + static uint8_t L_current_gpu_id = 0; + + if(L_current_gpu_id == 0xFF) { - if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkMemTempSupport)) - { - gpu_id = i; - break; - } + // checked all GPUs once, do not check any this time and start over with GPU 0 on next call + L_current_gpu_id = 0; + } + else + { + for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++) + { + // driver must be loaded for memory temp capability + if( (!g_amec->gpu[i].status.disabled) && (g_amec->gpu[i].status.driverLoaded) && + (g_amec->gpu[i].status.checkMemTempSupport) ) + { + gpu_id = i; + break; + } + } + + // setup L_current_gpu_id for next call based on what is happening this time + if(gpu_id == 0xFF) + { + // no GPU needs checking start back at 0 next time + L_current_gpu_id = 0; + } + else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) ) + { + // last GPU is having memory capability checked do not check any next time + L_current_gpu_id = 0xFF; + } + else + { + // next time look at the next GPU ID first + L_current_gpu_id = gpu_id + 1; + } } return gpu_id; } @@ -178,7 +244,7 @@ uint8_t get_first_mem_temp_capable_gpu(void) for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) { if( (!g_amec->gpu[i].status.disabled) && - (g_amec->gpu[i].status.memTempSupported) ) + (g_amec->gpu[i].status.memTempSupported) ) // memTempSupported implies that driver is loaded { first_gpu = i; break; @@ -202,7 +268,7 @@ uint8_t get_next_mem_temp_capable_gpu(void) 
next_gpu = 0; } if( (!g_amec->gpu[next_gpu].status.disabled) && - (g_amec->gpu[next_gpu].status.memTempSupported) ) + (g_amec->gpu[next_gpu].status.memTempSupported) ) // memTempSupported implies that driver is loaded { break; } @@ -213,6 +279,10 @@ uint8_t get_next_mem_temp_capable_gpu(void) { next_gpu = 0xFF; } + else if( (next_gpu != 0xFF) && (!g_amec->gpu[next_gpu].status.memTempSupported) ) + { + next_gpu = 0xFF; + } return next_gpu; } @@ -231,8 +301,16 @@ uint8_t gpu_id_need_power_limits(void) if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.check_pwr_limit)) { - gpu_id = i; - break; + // If there is no power capping support skip reading power limits + if(G_pwr_reading_type == PWR_READING_TYPE_NONE) + { + g_amec->gpu[i].pcap.check_pwr_limit = false; + } + else + { + gpu_id = i; + break; + } } } return gpu_id; @@ -247,8 +325,9 @@ uint8_t gpu_id_need_set_power_limit(void) for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) { - // to set power limit requires that the driver is loaded - if( (g_amec->gpu[i].status.driverLoaded) && + // to set power limit requires that the driver is loaded and power limits were read + if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) && + (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) ) { gpu_id = i; @@ -258,6 +337,97 @@ uint8_t gpu_id_need_set_power_limit(void) return gpu_id; } +// For the given GPU clear status/data that requires GPU driver to be loaded +void clear_gpu_driver_status(uint8_t i_gpu_num) +{ + g_amec->gpu[i_gpu_num].status.checkDriverLoaded = false; + g_amec->gpu[i_gpu_num].status.driverLoaded = false; + + // Reading memory temperature requires driver to be loaded. + g_amec->gpu[i_gpu_num].status.checkMemTempSupport = false; + g_amec->gpu[i_gpu_num].status.memTempSupported = false; + g_amec->gpu[i_gpu_num].status.memErrorCount = 0; + + // Power capping requires driver to be loaded. Clear GPU power limits + g_amec->gpu[i_gpu_num].pcap.check_pwr_limit = false; + g_amec->gpu[i_gpu_num].pcap.pwr_limits_read = false; + g_amec->gpu[i_gpu_num].pcap.set_failed = false; + g_amec->gpu[i_gpu_num].pcap.gpu_min_pcap_mw = 0; + g_amec->gpu[i_gpu_num].pcap.gpu_max_pcap_mw = 0; + g_amec->gpu[i_gpu_num].pcap.gpu_requested_pcap_mw = 0; + g_amec->gpu[i_gpu_num].pcap.gpu_default_pcap_mw = 0; + //amec will need to recalculate after power limits are read to handle any clipping with new GPU min/max + g_amec->gpu[i_gpu_num].pcap.gpu_desired_pcap_mw = 0; +} + +// Handles GPU not able to process request due to driver load or un-load +void handle_driver_change(void) +{ + // Clear out driver status while driver change completes and is determined if loaded/un-loaded + clear_gpu_driver_status(G_current_gpu_id); + + // memory temp only available when driver is loaded. clear error and set not available + g_amec->gpu[G_current_gpu_id].status.memTempFailure = false; + g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true; + + // when driver change is complete we must re-query to see if driver is loaded or not + g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true; +} + +// For all GPUs read GPU reset status and take action if reset status has changed +void update_gpu_reset_status(void) +{ + uint8_t gpu_num = 0; + + // GPU reset status is in the OCC FLAGS register and is updated by OPAL + // Read the current reset status for all GPUs. 
A reset status of '1' indicates NOT in reset + ocb_occflg_t occ_flags = {0}; + occ_flags.value = in32(OCB_OCCFLG); + bool not_in_reset[3] = {occ_flags.fields.gpu0_reset_status, + occ_flags.fields.gpu1_reset_status, + occ_flags.fields.gpu2_reset_status}; + + // reset status of '0' (IN reset) is the default + // the OCC will still try to read GPU when IN reset but will not log errors + // this is so we still communicate with the GPUs without OPAL support to indicate + // a GPU is not in reset. + // Full OCC support below for when OPAL starts updating the reset status + for (gpu_num=0; gpu_num<MAX_NUM_GPU_PER_DOMAIN; gpu_num++) + { + if(not_in_reset[gpu_num] != g_amec->gpu[gpu_num].status.notReset) + { + INTR_TRAC_IMP("update_gpu_reset_status: GPU%d NOT in reset is now = %d", + gpu_num, + not_in_reset[gpu_num]); + + // There has been a change to the reset status clear everything out except for errors logged so we don't log again + clear_gpu_driver_status(gpu_num); + g_amec->gpu[gpu_num].status.errorCount = 0; + g_amec->gpu[gpu_num].status.retryCount = 0; + + // readOnce of false will force comm to be established and once established then checkDriverLoaded will get set + g_amec->gpu[gpu_num].status.readOnce = false; + + if(not_in_reset[gpu_num]) + { + // GPU was taken out of reset clear disabled to allow communication again + g_amec->gpu[gpu_num].status.disabled = false; + } + else + { + // GPU was put in reset. Clear temperature sensor errors and set to not available + g_amec->gpu[gpu_num].status.coreTempFailure = false; + g_amec->gpu[gpu_num].status.coreTempNotAvailable = true; + g_amec->gpu[gpu_num].status.memTempFailure = false; + g_amec->gpu[gpu_num].status.memTempNotAvailable = true; + } + + g_amec->gpu[gpu_num].status.notReset = not_in_reset[gpu_num]; + + } // if GPU reset status changed + } // for each GPU +} // end update_gpu_reset_status() + // Disable GPU monitoring for all GPUs void disable_all_gpus(void) { @@ -327,95 +497,103 @@ void gpu_ipc_init() } } -// Called after a failure for a specified GPU. The error will +// Called after a failure reading core temp for a specified GPU. 
The error will // be counted and if threshold is reached, an error will be created with -// the GPU as a callout and then set flag to force reset +// the GPU as a callout if the GPU is not in reset void mark_gpu_failed(const gpu_sm_args_t *i_arg) { uint32_t gpu_id = i_arg->gpu_id; do { - // ignore all errors if haven't reached timeout for comm established - if( (false == g_amec->gpu[gpu_id].status.readOnce) && - (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) ) - { - // do nothing but reset at this time - break; - } if((false == g_amec->gpu[gpu_id].status.disabled) && (true == g_amec->gpu[gpu_id].status.readOnce)) { - INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X " - "(ffdc 0x%08X%08X)", - gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount, - WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc)); + GPU_DBG("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X " + "(ffdc 0x%08X%08X)", + gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount, + WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc)); } - if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) && - (false == g_amec->gpu[gpu_id].status.disabled) && - (true == g_amec->gpu[gpu_id].status.readOnce)) + // Always inc retry count for I2C reset regardless of if GPU is in reset or not + g_amec->gpu[gpu_id].status.retryCount++; + + // Only inc error count if it is known that GPU is NOT in reset + // NOTE: Default is IN reset so this will only be true when OPAL/OS supports telling the OCC reset status + // if OS never tells the OCC reset status the OCC will never disable or log a comm error + if(g_amec->gpu[gpu_id].status.notReset) { - G_gpu_state = GPU_STATE_IDLE; - - // Something has gone wrong and it may be that OPAL has put - // the GPU into reset. For now, if this happens we will just - // continue polling the GPU until it comes back. 
- g_amec->gpu[gpu_id].status.readOnce = false; - g_amec->gpu[gpu_id].status.checkDriverLoaded = true; - g_amec->gpu[gpu_id].status.driverLoaded = false; - g_amec->gpu[gpu_id].status.checkMemTempSupport = true; - g_amec->gpu[gpu_id].status.memTempSupported = false; - g_amec->gpu[gpu_id].status.memErrorCount = 0; - g_amec->gpu[gpu_id].status.errorCount = 0; - -// This code can be used if an interlock with OPAL is ever introduced -#if 0 - // Disable this GPU, collect FFDC and log error - g_amec->gpu[gpu_id].status.disabled = true; - - INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)", - gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation); - errlHndl_t l_err = NULL; - /* - * @errortype - * @moduleid GPU_MID_MARK_GPU_FAILED - * @reasoncode GPU_FAILURE - * @userdata1 GPE returned rc code - * @userdata4 ERC_GPU_COMPLETE_FAILURE - * @devdesc GPU failure - */ - l_err = createErrl(GPU_MID_MARK_GPU_FAILED, - GPU_FAILURE, - ERC_GPU_COMPLETE_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - i_arg->error.rc, - 0); - addUsrDtlsToErrl(l_err, - (uint8_t*)&i_arg->error.ffdc, - sizeof(i_arg->error.ffdc), - ERRL_STRUCT_VERSION_1, - ERRL_USR_DTL_BINARY_DATA); - - // Callout the GPU if have sensor ID for it - if(G_sysConfigData.gpu_sensor_ids[gpu_id]) + // INC count and check if reached error threshold + if( ++g_amec->gpu[gpu_id].status.errorCount > GPU_INIT_ERROR_COUNT) { - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_GPU_ID, - G_sysConfigData.gpu_sensor_ids[gpu_id], - ERRL_CALLOUT_PRIORITY_MED); - } + // set that GPU temperature readings failed + g_amec->gpu[gpu_id].status.memTempFailure = true; + g_amec->gpu[gpu_id].status.memTempNotAvailable = true; + g_amec->gpu[gpu_id].status.coreTempFailure = true; + g_amec->gpu[gpu_id].status.coreTempNotAvailable = true; + + // Disable this GPU. 
GPU will get re-enabled if detected that GPU is put in reset and then taken out + g_amec->gpu[gpu_id].status.disabled = true; + + INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)", + gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation); + + if(g_amec->gpu[gpu_id].status.commErrorLogged == false) + { + + errlHndl_t l_err = NULL; + /* + * @errortype + * @moduleid GPU_MID_MARK_GPU_FAILED + * @reasoncode GPU_FAILURE + * @userdata1 GPE returned rc code + * @userdata4 ERC_GPU_COMPLETE_FAILURE + * @devdesc GPU failure + */ + l_err = createErrl(GPU_MID_MARK_GPU_FAILED, + GPU_FAILURE, + ERC_GPU_COMPLETE_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + i_arg->error.rc, + 0); + addUsrDtlsToErrl(l_err, + (uint8_t*)&i_arg->error.ffdc, + sizeof(i_arg->error.ffdc), + ERRL_STRUCT_VERSION_1, + ERRL_USR_DTL_BINARY_DATA); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + commitErrl(&l_err); + g_amec->gpu[gpu_id].status.commErrorLogged = true; + + } // if !commErrorLogged + + } // if errorCount > threshold + + } // if notReset - commitErrl(&l_err); -#endif - } } while(0); - // Reset GPU - G_gpu_i2c_reset_required = true; - G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF); + // Do an I2C reset if reached retry count + // don't want to do I2C reset every time since could be that this GPU really is in reset and + // while resetting I2C we are unable to read other GPUs that may not be in reset + if( g_amec->gpu[gpu_id].status.retryCount > GPU_ERRORS_BEFORE_I2C_RESET) + { + g_amec->gpu[gpu_id].status.retryCount = 0; + G_gpu_i2c_reset_required = true; + G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF); + } + } // end mark_gpu_failed() // Schedule a GPE request for GPU operation @@ -461,6 +639,27 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args) case GPU_REQ_READ_MEM_TEMP_FINISH: break; + // Check if driver is loaded + case GPU_REQ_CHECK_DRIVER_START: + case GPU_REQ_CHECK_DRIVER_2: + case GPU_REQ_CHECK_DRIVER_3: + case GPU_REQ_CHECK_DRIVER_FINISH: + break; + + // Read GPU Power Limit + case GPU_REQ_READ_PWR_LIMIT_START: + case GPU_REQ_READ_PWR_LIMIT_2: + case GPU_REQ_READ_PWR_LIMIT_3: + case GPU_REQ_READ_PWR_LIMIT_FINISH: + break; + + // Set GPU Power Limit + case GPU_REQ_SET_PWR_LIMIT_START: + case GPU_REQ_SET_PWR_LIMIT_2: + case GPU_REQ_SET_PWR_LIMIT_3: + case GPU_REQ_SET_PWR_LIMIT_FINISH: + break; + // I2C reset case GPU_REQ_RESET: break; @@ -498,6 +697,7 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args) { // Clear errors and init common arguments for GPE G_gpu_op_req_args.error.error = 0; + G_gpu_op_req_args.gpu_rc = 0; G_gpu_op_req_args.operation = i_operation; G_gpu_op_req_args.gpu_id = G_current_gpu_id; @@ -574,14 +774,14 @@ bool gpu_reset_sm() /* * @errortype * @moduleid GPU_MID_GPU_RESET_SM - * @reasoncode GPU_FAILURE + * @reasoncode GPU_NO_GPE_SUPPORT * @userdata1 0 * @userdata2 0 * @userdata4 ERC_GPU_NO_GPE_SUPPORT * @devdesc GPE1 image doesn't support GPU communication */ errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM, - GPU_FAILURE, + GPU_NO_GPE_SUPPORT, ERC_GPU_NO_GPE_SUPPORT, ERRL_SEV_UNRECOVERABLE, NULL, @@ -606,9 +806,7 @@ bool gpu_reset_sm() else // this reset attempt failed { // Stop trying if reached max resets - if( (L_consec_reset_failure_count > 
MAX_CONSECUTIVE_GPU_RESETS) && - (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) >= - GPU_COMM_ESTAB_TIMEOUT_SECONDS)) + if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS) { INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X", L_reset_state); @@ -662,12 +860,12 @@ bool gpu_reset_sm() break; case GPU_RESET_STATE_RESET_MASTER: - G_new_gpu_req_args.data = GPU_RESET_REQ_MASTER; + G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER; L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args); break; case GPU_RESET_STATE_RESET_SLAVE: - G_new_gpu_req_args.data = GPU_RESET_REQ_SLV; + G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV; L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args); break; @@ -677,7 +875,7 @@ bool gpu_reset_sm() break; case GPU_RESET_STATE_RESET_SLAVE_COMPLETE: - G_new_gpu_req_args.data = GPU_RESET_REQ_SLV_COMPLETE; + G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE; L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args); break; @@ -718,6 +916,555 @@ bool gpu_reset_sm() // Function Specification // +// Name: gpu_check_driver_loaded_sm +// +// Description: Called from gpu_task_sm to check if driver is loaded for G_current_gpu_id +// This function should only return that complete is TRUE when the check +// is complete (or determined failed) and ready for a different GPU +// +// Pre-Req: Caller must have G_current_gpu_id set for GPU to check +// +// End Function Specification +bool gpu_check_driver_loaded_sm() +{ + bool l_complete = FALSE; // only return TRUE when the read is complete or failed + bool l_new_driver_loaded = FALSE; + static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled + static uint8_t L_check_driver_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint8_t L_state_failure_count = 0; + static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + + if (async_request_is_idle(&G_gpu_op_request.request)) + { + // If not starting a new read then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_check_driver_state != GPU_STATE_CHECK_DRIVER_LOADED_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Check if failure was due to driver change + if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_check_driver_failure_count[G_current_gpu_id] = 0; // clear driver failure count since there's a driver change + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + return TRUE; // Done with this GPU, let GPU SM move to next + } + + // If reached state retry count give up on this read + else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) + { + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + if(++L_check_driver_failure_count[G_current_gpu_id] > GPU_CHECK_DRIVER_ERROR_COUNT) + { + INTR_TRAC_ERR("gpu_check_driver_loaded: Failed to check driver loaded for GPU%d RC: 0x%02X", + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up checking driver loaded for this GPU + // It will be retried if detected that GPU is put in reset and then taken out + 
g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false; + L_check_driver_failure_count[G_current_gpu_id] = 0; + + // without driver loaded cannot read memory temp, mark memory temp as failed + g_amec->gpu[G_current_gpu_id].status.memTempFailure = true; + g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true; + + // log one time error that driver loaded couldn't be determined + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_CHECK_DRIVER_LOADED + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 0 + * @userdata4 ERC_GPU_CHECK_DRIVER_LOADED_FAILURE + * @devdesc Failure to check GPU driver loaded + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_CHECK_DRIVER_LOADED, + GPU_FAILURE, + ERC_GPU_CHECK_DRIVER_LOADED_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + 0); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + L_state_failure_count = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached state retry count + else + { + // INC failure count and retry current state + L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_check_driver_state++; + } + + L_scheduled = FALSE; // default nothing scheduled + + switch (L_check_driver_state) + { + case GPU_STATE_CHECK_DRIVER_LOADED_START: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_2: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_2, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_3: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_3, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_READ: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE: + // Update driver loaded + l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01; + if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded) + { + // Driver loaded status changed + INTR_TRAC_IMP("gpu_check_driver_loaded: GPU%d driver loaded changed to %d", + G_current_gpu_id, + l_new_driver_loaded); + + if(l_new_driver_loaded) + { + // Driver is now loaded do checking that required driver to be loaded + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = true; + g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true; + // done checking for driver to be loaded + g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false; + } + else + { + // Driver is no longer loaded + clear_gpu_driver_status(G_current_gpu_id); + + // memory temp only available when driver is loaded + // clear error and set not available + g_amec->gpu[G_current_gpu_id].status.memTempFailure = false; + g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true; + + // Need to keep query for driver loaded to detect when driver is loaded + g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true; + } + + 
g_amec->gpu[G_current_gpu_id].status.driverLoaded = l_new_driver_loaded; + } + + // Done with this GPU ready to move to new one + L_check_driver_failure_count[G_current_gpu_id] = 0; + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_check_driver_loaded: INVALID STATE: 0x%02X", L_check_driver_state); + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + l_complete = TRUE; + break; + } // switch L_check_driver_state + + if(L_scheduled) + { + GPU_DBG("gpu_check_driver_loaded: Scheduled check driver loaded state 0x%02X at tick %d", + L_check_driver_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_check_driver_loaded: failed to schedule state 0x%02X", L_check_driver_state); + } + + } // if async_request_is_idle + else + { + INTR_TRAC_ERR("gpu_check_driver_loaded: NOT idle for state 0x%02X", L_check_driver_state); + } + + return l_complete; +} // end gpu_check_driver_loaded_sm() + +// Function Specification +// +// Name: gpu_read_pwr_limit_sm +// +// Description: Called from gpu_task_sm to read GPU power limits for G_current_gpu_id +// This function should only return that complete is TRUE when the read +// is complete (or determined failed) and ready for a different GPU +// +// Pre-Req: Caller must have G_current_gpu_id set for GPU to read +// +// End Function Specification +bool gpu_read_pwr_limit_sm() +{ + bool l_complete = FALSE; // only return TRUE when the read is complete or failed + static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled + static uint8_t L_read_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint8_t L_state_failure_count = 0; + static gpuReadPwrLimitState_e L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + + if (async_request_is_idle(&G_gpu_op_request.request)) + { + // If not starting a new read then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Check if failure was due to driver change + if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + return TRUE; // Done with this GPU, let GPU SM move to next + } + + // If reached retry count give up on this read + else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) + { + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT) + { + INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X", + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up trying to read power limits for this GPU + // It will be retried if detected that GPU is put in reset and then taken out + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false; + L_read_pwr_limit_failure_count[G_current_gpu_id] = 
0; + + // log one time error that power limits could not be read + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_READ_PWR_LIMIT + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 0 + * @userdata4 ERC_GPU_READ_PWR_LIMIT_FAILURE + * @devdesc Failure to read GPU power limits + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT, + GPU_FAILURE, + ERC_GPU_READ_PWR_LIMIT_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + 0); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + L_state_failure_count = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached retry count + else + { + // INC failure count and retry current state + L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_read_pwr_limit_state++; + } + + L_scheduled = FALSE; // default nothing scheduled + + switch (L_read_pwr_limit_state) + { + case GPU_STATE_READ_PWR_LIMIT_START: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_START, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_2: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_2, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_3: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_3, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_READ: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_COMPLETE: + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE; + // Update power limits + g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE; + g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; + g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw = (uint32_t) G_gpu_op_req_args.data[1]; + g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw = (uint32_t) G_gpu_op_req_args.data[2]; + + // Done with this GPU ready to move to new one + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state); + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + } // switch L_read_pwr_limit_state + + if(L_scheduled) + { + GPU_DBG("gpu_read_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", + L_read_pwr_limit_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state); + } + + } // if async_request_is_idle + else + { + INTR_TRAC_ERR("gpu_read_pwr_limit: NOT idle for state 0x%02X", L_read_pwr_limit_state); + } + + return l_complete; +} // end gpu_read_pwr_limit_sm() + +// Function Specification +// +// Name: gpu_set_pwr_limit_sm +// +// Description: Called from gpu_task_sm to set GPU power limit for G_current_gpu_id 
+// This function should only return that complete is TRUE when the set +// is complete (or determined failed) and ready for a different GPU +// +// Pre-Req: Caller must have G_current_gpu_id set for GPU to read +// +// End Function Specification +bool gpu_set_pwr_limit_sm() +{ + bool l_complete = FALSE; // only return TRUE when complete or failed + static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled + static uint8_t L_state_failure_count = 0; + static uint8_t L_set_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static gpuSetPwrLimitState_e L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + + if (async_request_is_idle(&G_gpu_op_request.request)) + { + // If not starting a new set limit then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Check if failure was due to driver change + if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + return TRUE; // Done with this GPU, let GPU SM move to next + } + + // If reached retry count give up on this read + else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) + { + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT) + { + INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X", + G_gpu_op_req_args.data[0], + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up trying to set power limit for this GPU + // It will be retried if detected that GPU is put in reset and then taken out or driver change + g_amec->gpu[G_current_gpu_id].pcap.set_failed = true; + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + + // log error that power limit could not be set + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_SET_PWR_LIMIT + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 0 + * @userdata4 ERC_GPU_SET_PWR_LIMIT_FAILURE + * @devdesc Failure to set GPU power limit + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT, + GPU_FAILURE, + ERC_GPU_SET_PWR_LIMIT_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + 0); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + L_state_failure_count = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached retry count + else + { + // INC failure count and retry current state + 
L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_set_pwr_limit_state++; + } + + L_scheduled = FALSE; // default nothing scheduled + + switch (L_set_pwr_limit_state) + { + case GPU_STATE_SET_PWR_LIMIT_START: + // send the desired GPU power cap to the GPE to send to GPU + G_new_gpu_req_args.data[0] = g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw; + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_START, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_2: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_3: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_READ: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_COMPLETE: + // Update the requested power limit since it was successfully sent + // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC + // has caluclated a new desired pcap while this one was already in process of being set + g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; + + // Done with this GPU ready to move to new one + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state); + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + } // switch L_set_pwr_limit_state + + if(L_scheduled) + { + GPU_DBG("gpu_set_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", + L_set_pwr_limit_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state); + } + + } // if async_request_is_idle + else + { + INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state); + } + + return l_complete; +} // end gpu_set_pwr_limit_sm() + +// Function Specification +// // Name: gpu_read_temp_sm // // Description: Called from gpu_task_sm to read GPU core temperature of G_current_gpu_id @@ -750,6 +1497,7 @@ bool gpu_read_temp_sm() mark_gpu_failed(&G_gpu_op_req_args); L_read_temp_state = GPU_STATE_READ_TEMP_NEW; + L_read_failure_count = 0; return TRUE; // Done with this GPU, let GPU SM move to next } else @@ -778,7 +1526,7 @@ bool gpu_read_temp_sm() case GPU_STATE_READ_TEMP_COMPLETE: if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) && - (0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp? 
@@ -750,6 +1497,7 @@ bool gpu_read_temp_sm()
             mark_gpu_failed(&G_gpu_op_req_args);
 
             L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+            L_read_failure_count = 0;
             return TRUE; // Done with this GPU, let GPU SM move to next
         }
         else
@@ -778,7 +1526,7 @@ bool gpu_read_temp_sm()
 
         case GPU_STATE_READ_TEMP_COMPLETE:
             if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
-                (0 != G_gpu_op_req_args.data) )   // TODO: check for valid temp?
+                (0 != G_gpu_op_req_args.data[0]) )
             {
                 g_amec->gpu[G_current_gpu_id].status.readOnce = true;
@@ -791,15 +1539,17 @@ bool gpu_read_temp_sm()
                 }
 
                 // comm is now established update for capability checking to take place
-                g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
                 g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
             }
 
             // Update sensor
-            l_temp = G_gpu_op_req_args.data;
+            l_temp = G_gpu_op_req_args.data[0];
             sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
 
             // Clear all past errors
+            g_amec->gpu[G_current_gpu_id].status.coreTempFailure = false;
+            g_amec->gpu[G_current_gpu_id].status.coreTempNotAvailable = false;
             g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
+            g_amec->gpu[G_current_gpu_id].status.retryCount = 0;
 
             // check if there is an overtemp that hasn't been reported
             if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
@@ -897,8 +1647,10 @@ bool gpu_read_mem_temp_capability_sm()
 {
     bool l_complete = FALSE; // only return TRUE when the read is complete or failed
     static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
-    static uint8_t L_read_failure_count = 0;
+    static uint8_t L_read_mem_cap_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+    static uint8_t L_state_failure_count = 0;
     static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+    static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
 
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
@@ -907,57 +1659,91 @@ bool gpu_read_mem_temp_capability_sm()
         if( (L_read_cap_state != GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW) &&
             (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
         {
-            // If reached retry count give up on this read
-            if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+            // Check if failure was due to driver change
+            if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
             {
-               // log error that memory temp capability couldn't be determined
-               // memory temp support will be left as not supported
-               INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d", G_current_gpu_id);
-
-               // Log error
-               /* @
-                * @errortype
-                * @moduleid    GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
-                * @reasoncode  GPU_FAILURE
-                * @userdata1   GPU ID
-                * @userdata2   0
-                * @userdata4   ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
-                * @devdesc     Failure to read GPU memory temp capability
-                *
-                */
-               errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
-                                             GPU_FAILURE,
-                                             ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
-                                             ERRL_SEV_PREDICTIVE,
-                                             NULL,
-                                             DEFAULT_TRACE_SIZE,
-                                             G_current_gpu_id,
-                                             0);
-
-               // Callout the GPU if have sensor ID for it
-               if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+                handle_driver_change();
+                // Request can't be processed by GPU at this time so we are done with this GPU
+                // setup to start new request
+                L_state_failure_count = 0;
+                L_read_mem_cap_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+                L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+                return TRUE; // Done with this GPU, let GPU SM move to next
+            }
+            // If reached state retry count give up on this read
+            else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+            {
+                // if GPU is not in reset then INC error count and check if reached threshold
+                if(g_amec->gpu[G_current_gpu_id].status.notReset)
                 {
-                   addCalloutToErrl(l_err,
-                                    ERRL_CALLOUT_TYPE_GPU_ID,
-                                    G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
-                                    ERRL_CALLOUT_PRIORITY_MED);
-               }
+                    if(++L_read_mem_cap_failure_count[G_current_gpu_id] >
+                                                     GPU_READ_MEM_CAP_ERROR_COUNT)
+                    {
+                        INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d RC: 0x%02X",
+                                      G_current_gpu_id,
+                                      G_gpu_op_req_args.gpu_rc);
 
-               // Commit Error
-               commitErrl(&l_err);
+                        // give up trying to read mem temp capability for this GPU
+                        // It will be retried if detected that GPU driver is re-loaded
+                        g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+                        L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
+
+                        // cannot determine memory temp capability, mark memory temp as failed
+                        g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+                        g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+                        // log one time error that memory temp capability couldn't be determined
+                        if(!L_error_logged[G_current_gpu_id])
+                        {
+                            L_error_logged[G_current_gpu_id] = TRUE;
+
+                            // Log error
+                            /* @
+                             * @errortype
+                             * @moduleid    GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
+                             * @reasoncode  GPU_FAILURE
+                             * @userdata1   GPU ID
+                             * @userdata2   0
+                             * @userdata4   ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
+                             * @devdesc     Failure to read memory temp capability
+                             *
+                             */
+                            errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
+                                                          GPU_FAILURE,
+                                                          ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
+                                                          ERRL_SEV_PREDICTIVE,
+                                                          NULL,
+                                                          DEFAULT_TRACE_SIZE,
+                                                          G_current_gpu_id,
+                                                          0);
+
+                            // Callout the GPU if have sensor ID for it
+                            if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+                            {
+                                addCalloutToErrl(l_err,
+                                                 ERRL_CALLOUT_TYPE_GPU_ID,
+                                                 G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+                                                 ERRL_CALLOUT_PRIORITY_MED);
+                            }
+
+                            // Commit Error
+                            commitErrl(&l_err);
+                        } // if error not logged
+                    } // if reached error count
+                } // if notReset
 
                 L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+                L_state_failure_count = 0;
                 return TRUE; // Done with this GPU, let GPU SM move to next
-            }
+            } // if reached state retry count
             else
             {
                 // INC failure count and retry current state
-                L_read_failure_count++;
+                L_state_failure_count++;
             }
         }
         else // success on last state go to next state and process it
         {
-            L_read_failure_count = 0;
+            L_state_failure_count = 0;
             L_read_cap_state++;
         }
 
@@ -983,8 +1769,21 @@ bool gpu_read_mem_temp_capability_sm()
 
         case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
             // Update capability
-            g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data & 0x01;
+            g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
+
+            if(g_amec->gpu[G_current_gpu_id].status.memTempSupported)
+            {
+                // mem temp is supported no need to re-check capability
+                g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+            }
+            else
+            {
+                // Need to keep querying mem temp capability to detect if it ever changes to capable
+                g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
+            }
+
+            // Done with this GPU, ready to move to a new one
+            L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
             L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
             l_complete = TRUE;
             break;
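// The GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE case above latches bit 0 of
// the returned capability word and then decides whether the query must be
// repeated.  The same decision, isolated into a stand-alone helper sketch
// (gpu_status_t and its field names are illustrative, not the AMEC types):

#include <stdbool.h>
#include <stdint.h>

typedef struct
{
    bool memTempSupported;    // GPU reports a readable memory temperature
    bool checkMemTempSupport; // capability query still needs to run
} gpu_status_t;

static void update_mem_temp_capability(gpu_status_t *status, uint64_t cap_word)
{
    status->memTempSupported = (cap_word & 0x01) ? true : false;

    // A "supported" answer is trusted and never re-asked; an "unsupported"
    // answer keeps the query alive in case the capability appears later
    // (for example after a driver change).
    status->checkMemTempSupport = !status->memTempSupported;
}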
@@ -1041,60 +1840,95 @@ bool gpu_read_memory_temp_sm()
         if( (L_read_temp_state != GPU_STATE_READ_MEM_TEMP_NEW) &&
             (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
         {
-            // If reached retry count give up on this read
-            if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+            // Check if failure was due to driver change
+            if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
             {
-                // INC memory error count and check if reached timeout threshold for new mem temp
-                uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
-                g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
-                if((max_read_timeout) && (max_read_timeout != 0xFF) &&
-                   (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+                handle_driver_change();
+                // Request can't be processed by GPU at this time so we are done with this GPU
+                // setup to start new request
+                L_read_failure_count = 0;
+                g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
+                L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+                return TRUE; // Done with this GPU, let GPU SM move to next
+            }
+
+            // If reached retry count or GPU indicated cmd not supported then give up on this read
+            else if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
+                     (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
+            {
+                // if GPU is not in reset or the GPU responded with command not supported then
+                // INC memory error count and check if reached timeout for new mem temp
+                if( (g_amec->gpu[G_current_gpu_id].status.notReset) ||
+                    (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
                 {
-                    // Disable memory temp reading for this GPU and log error
-                    g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
-                    // so BMC knows there is an error for fan control set sensor to 0xFF
-                    sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), 0xFFFF);
-
-                    INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
-                                  G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
-
-                    // Log error
-                    /* @
-                     * @errortype
-                     * @moduleid    GPU_MID_GPU_READ_MEM_TEMP
-                     * @reasoncode  GPU_FAILURE
-                     * @userdata1   GPU ID
-                     * @userdata2   number consecutive read mem temp failures
-                     * @userdata4   ERC_GPU_READ_MEM_TEMP_TIMEOUT
-                     * @devdesc     Timeout reading new GPU memory temperature
-                     *
-                     */
-                    errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
-                                                  GPU_FAILURE,
-                                                  ERC_GPU_READ_MEM_TEMP_TIMEOUT,
-                                                  ERRL_SEV_PREDICTIVE,
-                                                  NULL,
-                                                  DEFAULT_TRACE_SIZE,
-                                                  G_current_gpu_id,
-                                                  g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+                    g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
 
-                    // Callout the GPU if have sensor ID for it
-                    if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
-                    {
-                        addCalloutToErrl(l_err,
-                                         ERRL_CALLOUT_TYPE_GPU_ID,
-                                         G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
-                                         ERRL_CALLOUT_PRIORITY_MED);
-                    }
+                    uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
+                    if((max_read_timeout) && (max_read_timeout != 0xFF) &&
+                       (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+                    {
+                        // Disable memory temp reading for this GPU and log error
+                        g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
+                        // so BMC knows there is an error for fan control set failure
+                        g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+                        g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
 
-                    // Commit Error
-                    commitErrl(&l_err);
+                        INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
-                } // if timeout error
+                                      G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+                        if(g_amec->gpu[G_current_gpu_id].status.commErrorLogged == false)
+                        {
+                            INTR_TRAC_ERR("notReset: %d rc: 0x%0X", g_amec->gpu[G_current_gpu_id].status.notReset,
+                                          G_gpu_op_req_args.error.rc);
+                            // Log error
+                            /* @
+                             * @errortype
+                             * @moduleid    GPU_MID_GPU_READ_MEM_TEMP
+                             * @reasoncode  GPU_FAILURE
+                             * @userdata1   GPU ID
+                             * @userdata2   GPU RC
+                             * @userdata4   ERC_GPU_READ_MEM_TEMP_TIMEOUT
+                             * @devdesc     Timeout reading new GPU memory temperature
+                             *
+                             */
+                            errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
+                                                          GPU_FAILURE,
+                                                          ERC_GPU_READ_MEM_TEMP_TIMEOUT,
+                                                          ERRL_SEV_PREDICTIVE,
+                                                          NULL,
+                                                          DEFAULT_TRACE_SIZE,
+                                                          G_gpu_op_req_args.gpu_rc,
+                                                          g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+                            // Callout the GPU if have sensor ID for it
+                            if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+                            {
+                                addCalloutToErrl(l_err,
+                                                 ERRL_CALLOUT_TYPE_GPU_ID,
+                                                 G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+                                                 ERRL_CALLOUT_PRIORITY_MED);
+                            }
+
+                            // Commit Error
+                            commitErrl(&l_err);
+                            g_amec->gpu[G_current_gpu_id].status.commErrorLogged = true;
+                        } // if !commErrorLogged
+                    } // if timeout error
+                    else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED)
+                    {
+                        // GPU indicated command not supported, re-check mem temp capability
+                        // if we try to read mem temp again that means mem temp was reported capable
+                        // and if this continues to fail eventually an error will be logged above at timeout
+                        g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
+                        g_amec->gpu[G_current_gpu_id].status.memTempSupported = false;
+                    }
+                } // if notReset or command not supported
+
+                // setup to start new request
+                L_read_failure_count = 0;
                 L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
                 return TRUE; // Done with this GPU, let GPU SM move to next
-            }
+            } // else if failure count exceeded or command not supported
             else
             {
                 // INC failure count and retry current state
@@ -1115,20 +1949,26 @@ bool gpu_read_memory_temp_sm()
             L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
             break;
 
-        case GPU_STATE_READ_MEM_TEMP_STOP:
+        case GPU_STATE_READ_MEM_TEMP_2:
             L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_2, G_new_gpu_req_args);
             break;
 
-        case GPU_STATE_READ_MEM_TEMP_READ:
+        case GPU_STATE_READ_MEM_TEMP_3:
             L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_3, G_new_gpu_req_args);
             break;
 
+        case GPU_STATE_READ_MEM_TEMP_READ:
+            L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_FINISH, G_new_gpu_req_args);
+            break;
+
         case GPU_STATE_READ_MEM_TEMP_COMPLETE:
             // Update sensor
-            l_temp = G_gpu_op_req_args.data;
+            l_temp = G_gpu_op_req_args.data[0];
             sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
 
             // Clear past errors
+            g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+            g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = false;
             g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
 
             // check if there is an overtemp that hasn't been reported
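// The escalation above only fires for a bounded, enabled timeout: the
// memory-temperature error counter may grow without consequence when
// max_read_timeout is 0 or 0xFF.  That guard, isolated as a sketch
// (the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>

static bool mem_temp_timeout_reached(uint8_t error_count, uint8_t max_read_timeout)
{
    // 0 means no timeout is configured and 0xFF is treated as "never";
    // any other value is the number of consecutive failed reads tolerated
    // before memory temperature reporting is disabled for the GPU.
    return (max_read_timeout != 0) &&
           (max_read_timeout != 0xFF) &&
           (error_count >= max_read_timeout);
}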
@@ -1253,9 +2093,30 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
             break;
         }
 
-//TODO: Enable when functional
-#if 0
-        // 3. Need to check if driver is loaded?
+        // 3. Time to start new temperature reads?
+        if(i_read_temp_start_needed)
+        {
+            // Start reading core temp from first present and functional GPU
+            l_gpu_id = get_first_gpu();
+            if(l_gpu_id != 0xFF)
+            {
+                // Read core temp for this GPU
+                G_current_gpu_id = l_gpu_id;
+                G_gpu_state = GPU_STATE_READ_TEMP;
+                l_new_state = TRUE;
+                break;
+            }
+            else // no functional GPUs
+            {
+                // release I2C lock to the host for this engine and stop monitoring
+                occ_i2c_lock_release(GPU_I2C_ENGINE);
+                G_gpu_state = GPU_STATE_NO_LOCK;
+                G_gpu_monitoring_allowed = FALSE;
+                l_new_state = FALSE; // No new state for GPU communication
+                break;
+            }
+        }
+
+        // 4. Need to check if driver is loaded?
         l_gpu_id = gpu_id_need_driver_check();
         if(l_gpu_id != 0xFF)
         {
@@ -1266,7 +2127,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
             break;
         }
 
-        // 4. Need to read power limits?
+//TODO: Enable when functional
+#if 0
+        // 5. Need to read power limits?
         l_gpu_id = gpu_id_need_power_limits();
         if(l_gpu_id != 0xFF)
         {
@@ -1276,8 +2139,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
             l_new_state = TRUE;
             break;
         }
+#endif
 
-        // 5. Need to read memory temps?
+        // 6. Need to read memory temps?
         if(i_mem_temp_needed)
         {
             // first check if there is a GPU that needs memory temp capability checked
@@ -1304,31 +2168,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
                 }
             }
         }
-#endif
-
-        // 6. Time to start new temperature reads?
-        if(i_read_temp_start_needed)
-        {
-            // Start reading core temp from first present and functional GPU
-            l_gpu_id = get_first_gpu();
-            if(l_gpu_id != 0xFF)
-            {
-                // Read core temp for this GPU
-                G_current_gpu_id = l_gpu_id;
-                G_gpu_state = GPU_STATE_READ_TEMP;
-                l_new_state = TRUE;
-                break;
-            }
-            else // no functional GPUs
-            {
-                // release I2C lock to the host for this engine and stop monitoring
-                occ_i2c_lock_release(GPU_I2C_ENGINE);
-                G_gpu_state = GPU_STATE_NO_LOCK;
-                G_gpu_monitoring_allowed = FALSE;
-                l_new_state = FALSE; // No new state for GPU communication
-                break;
-            }
-        }
 
         // Else nothing stay idle
     }while(0);
@@ -1361,12 +2200,13 @@ void task_gpu_sm(struct task *i_self)
     // are functional or GPU I2C interface is broken
     if(G_gpu_monitoring_allowed)
     {
+        // Read and update reset status for all GPUs
+        update_gpu_reset_status();
         // Initialize the IPC commands if this is our first run
         if(L_gpu_first_run)
         {
             gpu_ipc_init();
-            G_gpu_sm_start_time = ssx_timebase_get(); // used for timeout establishing comm
 
             L_gpu_first_run = FALSE;
         }
@@ -1379,6 +2219,7 @@ void task_gpu_sm(struct task *i_self)
         if(L_numCallsForTempRead >= GPU_TEMP_READ_1S)
         {
             L_read_temp_start_needed = TRUE;
+            L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
         }
     }
@@ -1430,6 +2271,7 @@ void task_gpu_sm(struct task *i_self)
             // Start first with reading core temp of first functional GPU
             L_numCallsForTempRead = 0; // to track start of next temp reading in 1s
             L_read_temp_start_needed = FALSE; // start is no longer needed
+            L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
             l_gpu_id = get_first_gpu();
             if(l_gpu_id != 0xFF)
             {
@@ -1506,7 +2348,6 @@ void task_gpu_sm(struct task *i_self)
                 {
                     // Capability check complete for this GPU, go to IDLE state
                     // to let IDLE SM decide what to do next
-                    g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
                     G_gpu_state = GPU_STATE_IDLE;
                     l_start_next_state = TRUE;
                 }
@@ -1514,17 +2355,14 @@ void task_gpu_sm(struct task *i_self)
 
             case GPU_STATE_CHECK_DRIVER_LOADED:
                 // Check if driver is loaded for current GPU
-                if(1) // TODO
+                if(gpu_check_driver_loaded_sm())
                 {
-                    // Driver check complete for this GPU, go to IDLE state
-                    // to let IDLE SM decide what to do next
-                    g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE;
-                    g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE;
-                    if(g_amec->gpu[G_current_gpu_id].status.driverLoaded)
-                    {
-                        // Driver is loaded, read the power limits so we can start GPU power capping
-                        g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = TRUE;
-                    }
+                    // Driver check complete for this GPU.
+                    // NOTE: Do not set status.checkDriverLoaded to FALSE here; if the driver is
+                    // not loaded we must keep checking for it, and that decision is made
+                    // inside gpu_check_driver_loaded_sm()
+
+                    // go to IDLE state to let IDLE SM decide what to do next
                    G_gpu_state = GPU_STATE_IDLE;
                    l_start_next_state = TRUE;
                 }
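// In the dispatcher above, the boolean returned by gpu_check_driver_loaded_sm()
// (and by the other gpu_*_sm() helpers) means "done with this GPU for now",
// not "the operation succeeded"; per-GPU flags such as checkDriverLoaded
// record whether the same work must be repeated on a later pass.  A sketch of
// that calling convention (gpu_sm_fn and next_gpu() are hypothetical):

#include <stdbool.h>
#include <stdint.h>

typedef bool (*gpu_sm_fn)(void);          // returns true when done with this GPU

extern uint8_t next_gpu(uint8_t current); // pick the next GPU to service

static uint8_t service_one_gpu(gpu_sm_fn sm, uint8_t current_gpu)
{
    if (sm())
    {
        // The helper finished (or gave up) on this GPU; whether the same
        // operation runs again later is driven by the status flags it left set.
        return next_gpu(current_gpu);
    }
    return current_gpu;                   // still mid-operation, stay on this GPU
}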
@@ -1532,11 +2370,10 @@ void task_gpu_sm(struct task *i_self)
 
             case GPU_STATE_READ_PWR_LIMIT:
                 // Read power limits for current GPU
-                if(1) // TODO read and set min/max GPU limit and set pwr_limits_read to TRUE if capping supported
+                if(gpu_read_pwr_limit_sm())
                 {
                     // Read power limits complete for this GPU, go to IDLE state
                     // to let IDLE SM decide what to do next
-                    g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
                     G_gpu_state = GPU_STATE_IDLE;
                     l_start_next_state = TRUE;
                 }
@@ -1544,7 +2381,7 @@ void task_gpu_sm(struct task *i_self)
 
             case GPU_STATE_SET_PWR_LIMIT:
                 // Set power limit on current GPU
-                if(1) // TODO
+                if(gpu_set_pwr_limit_sm())
                 {
                     // Set power limit complete for this GPU, go to IDLE state
                     // to let IDLE SM decide what to do next
@@ -1596,6 +2433,7 @@ void task_gpu_sm(struct task *i_self)
                 // new state to read core temp reset temperature reading timer
                 L_numCallsForTempRead = 0;
                 L_read_temp_start_needed = FALSE; // start no longer needed
+                L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
             }
             else if(G_gpu_state == GPU_STATE_READ_MEMORY_TEMP)
             {
@@ -1606,6 +2444,4 @@ void task_gpu_sm(struct task *i_self)
             }
         }while((l_start_next_state) && (!l_next_state));
     } // GPU monitoring enabled
-
-
 } // end task_gpu_sm()
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index c8f13ff..a26c9c0 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -74,9 +74,10 @@ typedef enum
 {
     GPU_STATE_READ_MEM_TEMP_NEW      = 0x21,
     GPU_STATE_READ_MEM_TEMP_START    = 0x22,
-    GPU_STATE_READ_MEM_TEMP_STOP     = 0x23,
-    GPU_STATE_READ_MEM_TEMP_READ     = 0x24,
-    GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x25,
+    GPU_STATE_READ_MEM_TEMP_2        = 0x23,
+    GPU_STATE_READ_MEM_TEMP_3        = 0x24,
+    GPU_STATE_READ_MEM_TEMP_READ     = 0x25,
+    GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x26,
 } gpuReadMemTempState_e;
 
 // States for checking GPU memory temperature capability (gpu_read_mem_temp_capability_sm)
@@ -90,6 +91,39 @@ typedef enum
     GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE = 0x36,
 } gpuReadMemTempCapableState_e;
 
+// States for checking if GPU driver is loaded (gpu_check_driver_loaded_sm)
+typedef enum
+{
+    GPU_STATE_CHECK_DRIVER_LOADED_NEW      = 0x41,
+    GPU_STATE_CHECK_DRIVER_LOADED_START    = 0x42,
+    GPU_STATE_CHECK_DRIVER_LOADED_2        = 0x43,
+    GPU_STATE_CHECK_DRIVER_LOADED_3        = 0x44,
+    GPU_STATE_CHECK_DRIVER_LOADED_READ     = 0x45,
+    GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE = 0x46,
+} gpuCheckDriverLoadedState_e;
+
+// States for reading GPU power limits (gpu_read_pwr_limit_sm)
+typedef enum
+{
+    GPU_STATE_READ_PWR_LIMIT_NEW      = 0x51,
+    GPU_STATE_READ_PWR_LIMIT_START    = 0x52,
+    GPU_STATE_READ_PWR_LIMIT_2        = 0x53,
+    GPU_STATE_READ_PWR_LIMIT_3        = 0x54,
+    GPU_STATE_READ_PWR_LIMIT_READ     = 0x55,
+    GPU_STATE_READ_PWR_LIMIT_COMPLETE = 0x56,
+} gpuReadPwrLimitState_e;
+
+// States for setting GPU power limit (gpu_set_pwr_limit_sm)
+typedef enum
+{
+    GPU_STATE_SET_PWR_LIMIT_NEW      = 0x61,
+    GPU_STATE_SET_PWR_LIMIT_START    = 0x62,
+    GPU_STATE_SET_PWR_LIMIT_2        = 0x63,
+    GPU_STATE_SET_PWR_LIMIT_3        = 0x64,
+    GPU_STATE_SET_PWR_LIMIT_READ     = 0x65,
+    GPU_STATE_SET_PWR_LIMIT_COMPLETE = 0x66,
+} gpuSetPwrLimitState_e;
+
 // GPU IPC initialization
 void gpu_ipc_init();
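// The new state enums above keep each state machine in its own high nibble
// (0x2x memory temp, 0x3x capability, 0x4x driver check, 0x5x read limit,
// 0x6x set limit), so a single state byte in a trace identifies both the
// machine and the step.  A hypothetical decoder built on that convention:

#include <stdint.h>

static const char *gpu_sm_name(uint8_t state)
{
    switch (state >> 4)
    {
        case 0x2: return "READ_MEM_TEMP";
        case 0x3: return "READ_MEM_TEMP_CAPABLE";
        case 0x4: return "CHECK_DRIVER_LOADED";
        case 0x5: return "READ_PWR_LIMIT";
        case 0x6: return "SET_PWR_LIMIT";
        default:  return "UNKNOWN";
    }
}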
diff --git a/src/occ_405/gpu/gpu_service_codes.h b/src/occ_405/gpu/gpu_service_codes.h
index 41cb3f9..4ea7c6f 100755
--- a/src/occ_405/gpu/gpu_service_codes.h
+++ b/src/occ_405/gpu/gpu_service_codes.h
@@ -39,6 +39,9 @@ enum gpuModuleId
     GPU_MID_GPU_READ_TEMP             = GPU_COMP_ID | 0x06,
     GPU_MID_GPU_READ_MEM_TEMP         = GPU_COMP_ID | 0x07,
     GPU_MID_GPU_READ_MEM_TEMP_CAPABLE = GPU_COMP_ID | 0x08,
+    GPU_MID_GPU_CHECK_DRIVER_LOADED   = GPU_COMP_ID | 0x09,
+    GPU_MID_GPU_READ_PWR_LIMIT        = GPU_COMP_ID | 0x0A,
+    GPU_MID_GPU_SET_PWR_LIMIT         = GPU_COMP_ID | 0x0B,
 };
 
 #endif /* #ifndef _GPU_SERVICE_CODES_H_ */
diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h
index a379f87..c740c05 100644
--- a/src/occ_405/occ_service_codes.h
+++ b/src/occ_405/occ_service_codes.h
@@ -87,12 +87,14 @@ enum occReasonCode
     /// Firmware Failure: equivalent to assertion failures
     INTERNAL_FW_FAILURE = 0xA0,
 
+    /// Build problem, gpe1 image doesn't support GPU interface
+    GPU_NO_GPE_SUPPORT = 0xA1,
     /// Error with GPU tasks
-    GPU_FAILURE = 0xA1,
+    GPU_FAILURE = 0xA2,
     /// GPU core reached error threshold
-    GPU_ERROR_TEMP = 0xA2,
+    GPU_ERROR_TEMP = 0xA3,
     /// GPU memory reached error threshold
-    GPU_MEMORY_ERROR_TEMP = 0xA3,
+    GPU_MEMORY_ERROR_TEMP = 0xA4,
 
     /// Failure within the OCC Complex of the processor
     INTERNAL_HW_FAILURE = 0xB0,
@@ -289,7 +291,11 @@ enum occExtReasonCode
     ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE   = 0x00F6,
     ERC_GPU_INVALID_GPU_OPERATION           = 0x00F7,
     ERC_GPU_N_MODE_PCAP_CALC_FAILURE        = 0x00F8,
+    ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE = 0x00F9,
     ERC_GPU_NO_GPE_SUPPORT                  = 0x00FF,
+    ERC_GPU_CHECK_DRIVER_LOADED_FAILURE     = 0x0100,
+    ERC_GPU_READ_PWR_LIMIT_FAILURE          = 0x0101,
+    ERC_GPU_SET_PWR_LIMIT_FAILURE           = 0x0102,
 
     ERC_STATE_FROM_ALL_TO_STB_FAILURE       = 0x0123,
     ERC_STATE_FROM_ACT_TO_CHR_FAILURE       = 0x0124,
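// Because GPU_NO_GPE_SUPPORT is inserted at 0xA1, the three pre-existing GPU
// reason codes each shift up by one value; anything decoding committed error
// logs must use the table from the matching firmware level.  A hypothetical
// decoder for the new numbering:

#include <stdint.h>

static const char *gpu_reason_str(uint8_t rc)
{
    switch (rc)
    {
        case 0xA1: return "GPU_NO_GPE_SUPPORT";    // new at this level
        case 0xA2: return "GPU_FAILURE";           // was 0xA1
        case 0xA3: return "GPU_ERROR_TEMP";        // was 0xA2
        case 0xA4: return "GPU_MEMORY_ERROR_TEMP"; // was 0xA3
        default:   return "other";
    }
}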