Diffstat (limited to 'src/occ_405/gpu/gpu.c')
-rwxr-xr-x  src/occ_405/gpu/gpu.c  1332
1 file changed, 1084 insertions, 248 deletions
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 8666e12..1a27565 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -51,18 +51,25 @@
#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) // Number of calls, assuming this is called every other tick
-// Time in seconds to ignore errors from the start of GPU SM
-// Right now this time must include PRST and GPU init time
-// this may be reduced after adding in OS interlock for PRST
-#define GPU_COMM_ESTAB_TIMEOUT_SECONDS 600
+// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time
+#define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds
-#define MAX_CONSECUTIVE_GPU_RESETS 3
+#define MAX_CONSECUTIVE_GPU_RESETS 5
#define MAX_GPU_RESET_STATE_RETRY 3
#define MAX_RESET_STATE_NOT_DONE_COUNT 100
#define MAX_GPU_READ_ATTEMPT 3
+#define GPU_ERRORS_BEFORE_I2C_RESET 5
+
+// Consecutive error counts for GPU command failures before an error is logged, if the GPU is not in reset
+#define GPU_CHECK_DRIVER_ERROR_COUNT 5
+#define GPU_READ_MEM_CAP_ERROR_COUNT 5
+#define GPU_READ_PWR_LIMIT_ERROR_COUNT 5
+#define GPU_SET_PWR_LIMIT_ERROR_COUNT 5
+
#define GPU_I2C_ENGINE PIB_I2C_ENGINE_C
extern data_cnfg_t * G_data_cnfg;
+extern PWR_READING_TYPE G_pwr_reading_type;
// This is the global GPU task SM state; each task within the GPU SM may have its own "state"
// to allow several calls to complete the task
@@ -71,7 +78,6 @@ gpuState_e G_gpu_state = GPU_STATE_IDLE;
bool G_gpu_monitoring_allowed = FALSE; // Set to true if GPU is present
bool G_gpu_i2c_reset_required = FALSE;
uint32_t G_gpu_reset_cause = 0;
-uint64_t G_gpu_sm_start_time = 0;
// GPE Requests
GpeRequest G_gpu_op_request;
@@ -82,11 +88,6 @@ GPE_BUFFER(gpu_sm_args_t G_gpu_op_req_args);
gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};
uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed
-bool G_gpu_read_issued = false;
-
-// Read OCC_MISC register to see if an I2C interrupt was generated for
-// the specified engine.
-bool check_for_i2c_interrupt(const uint8_t i_engine);
// Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
uint8_t get_first_gpu(void)
@@ -141,14 +142,47 @@ uint8_t gpu_id_need_driver_check(void)
uint8_t gpu_id = 0xFF; // default none needs checking
uint8_t i = 0;
- for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ // Checking for driver loaded can be repeated until the driver is loaded, which may never happen.
+ // To avoid an infinite loop checking the same GPU over and over, a static is used and each call
+ // starts looking at the next GPU; after all GPUs are checked, allow none before re-checking all GPUs
+ static uint8_t L_current_gpu_id = 0;
+
+ if(L_current_gpu_id == 0xFF)
{
- if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkDriverLoaded))
- {
- gpu_id = i;
- break;
- }
+ // checked all GPUs once, do not check any this time and start over with GPU 0 on next call
+ L_current_gpu_id = 0;
}
+ else
+ {
+ for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ // only check for driver after i2c comm (readOnce) has been established
+ if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) &&
+ (g_amec->gpu[i].status.readOnce) && (g_amec->gpu[i].status.checkDriverLoaded))
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+
+ // setup L_current_gpu_id for next call based on what is happening this time
+ if(gpu_id == 0xFF)
+ {
+ // no GPU needs checking start back at 0 next time
+ L_current_gpu_id = 0;
+ }
+ else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) )
+ {
+ // last GPU is having driver checked do not check any next time
+ L_current_gpu_id = 0xFF;
+ }
+ else
+ {
+ // next time look at the next GPU ID first
+ L_current_gpu_id = gpu_id + 1;
+ }
+ }
+
return gpu_id;
}
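
The rotation added here is easier to see in isolation: a static cursor resumes scanning after the GPU returned last time, and the 0xFF sentinel makes the function skip one full pass after the last GPU before starting over at GPU 0, so a GPU whose driver never loads cannot starve the others. A minimal sketch of the same pattern, with a hypothetical needs_check[] flag standing in for the real GPU_PRESENT/disabled/readOnce/checkDriverLoaded tests:

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_GPUS 3          // stand-in for MAX_NUM_GPU_PER_DOMAIN
    #define NO_GPU   0xFF       // "none needs checking" / "skip one pass" sentinel

    // Hypothetical per-GPU flag standing in for the real status checks.
    static bool needs_check[MAX_GPUS];

    // Returns the next GPU id needing a check, or NO_GPU.
    // Each call resumes after the previously returned id so one GPU that
    // always "needs checking" cannot monopolize the state machine.
    static uint8_t next_gpu_to_check(void)
    {
        static uint8_t cursor = 0;   // where the next scan starts
        uint8_t found = NO_GPU;

        if (cursor == NO_GPU)
        {
            // All GPUs were visited on the previous pass: skip this call,
            // then start over at GPU 0 next time.
            cursor = 0;
            return NO_GPU;
        }

        for (uint8_t i = cursor; i < MAX_GPUS; i++)
        {
            if (needs_check[i])
            {
                found = i;
                break;
            }
        }

        if (found == NO_GPU)
            cursor = 0;                 // nothing found: restart at 0
        else if (found == MAX_GPUS - 1)
            cursor = NO_GPU;            // last GPU returned: skip one pass
        else
            cursor = found + 1;         // resume after the GPU just returned

        return found;
    }

The memory-temp-capability selector below follows the identical scheme, only with a different set of per-GPU conditions.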
@@ -157,13 +191,45 @@ uint8_t gpu_id_need_memory_temp_capability_check(void)
uint8_t gpu_id = 0xFF; // default none needs checking
uint8_t i = 0;
- for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ // Checking for memory temp capability will be repeated until memory temp is capable, which may never happen.
+ // To avoid an infinite loop checking the same GPU over and over, a static is used and each call
+ // starts looking at the next GPU; after all GPUs are checked, allow none before re-checking all GPUs
+ static uint8_t L_current_gpu_id = 0;
+
+ if(L_current_gpu_id == 0xFF)
{
- if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkMemTempSupport))
- {
- gpu_id = i;
- break;
- }
+ // checked all GPUs once, do not check any this time and start over with GPU 0 on next call
+ L_current_gpu_id = 0;
+ }
+ else
+ {
+ for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ // driver must be loaded for memory temp capability
+ if( (!g_amec->gpu[i].status.disabled) && (g_amec->gpu[i].status.driverLoaded) &&
+ (g_amec->gpu[i].status.checkMemTempSupport) )
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+
+ // setup L_current_gpu_id for next call based on what is happening this time
+ if(gpu_id == 0xFF)
+ {
+ // no GPU needs checking start back at 0 next time
+ L_current_gpu_id = 0;
+ }
+ else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) )
+ {
+ // last GPU is having memory capability checked do not check any next time
+ L_current_gpu_id = 0xFF;
+ }
+ else
+ {
+ // next time look at the next GPU ID first
+ L_current_gpu_id = gpu_id + 1;
+ }
}
return gpu_id;
}
@@ -178,7 +244,7 @@ uint8_t get_first_mem_temp_capable_gpu(void)
for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
{
if( (!g_amec->gpu[i].status.disabled) &&
- (g_amec->gpu[i].status.memTempSupported) )
+ (g_amec->gpu[i].status.memTempSupported) ) // memTempSupported implies that driver is loaded
{
first_gpu = i;
break;
@@ -202,7 +268,7 @@ uint8_t get_next_mem_temp_capable_gpu(void)
next_gpu = 0;
}
if( (!g_amec->gpu[next_gpu].status.disabled) &&
- (g_amec->gpu[next_gpu].status.memTempSupported) )
+ (g_amec->gpu[next_gpu].status.memTempSupported) ) // memTempSupported implies that driver is loaded
{
break;
}
@@ -213,6 +279,10 @@ uint8_t get_next_mem_temp_capable_gpu(void)
{
next_gpu = 0xFF;
}
+ else if( (next_gpu != 0xFF) && (!g_amec->gpu[next_gpu].status.memTempSupported) )
+ {
+ next_gpu = 0xFF;
+ }
return next_gpu;
}
@@ -231,8 +301,16 @@ uint8_t gpu_id_need_power_limits(void)
if( (g_amec->gpu[i].status.driverLoaded) &&
(g_amec->gpu[i].pcap.check_pwr_limit))
{
- gpu_id = i;
- break;
+ // If there is no power capping support skip reading power limits
+ if(G_pwr_reading_type == PWR_READING_TYPE_NONE)
+ {
+ g_amec->gpu[i].pcap.check_pwr_limit = false;
+ }
+ else
+ {
+ gpu_id = i;
+ break;
+ }
}
}
return gpu_id;
@@ -247,8 +325,9 @@ uint8_t gpu_id_need_set_power_limit(void)
for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
{
- // to set power limit requires that the driver is loaded
- if( (g_amec->gpu[i].status.driverLoaded) &&
+ // to set power limit requires that the driver is loaded and power limits were read
+ if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) &&
+ (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) &&
(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) )
{
gpu_id = i;
@@ -258,6 +337,97 @@ uint8_t gpu_id_need_set_power_limit(void)
return gpu_id;
}
+// For the given GPU clear status/data that requires GPU driver to be loaded
+void clear_gpu_driver_status(uint8_t i_gpu_num)
+{
+ g_amec->gpu[i_gpu_num].status.checkDriverLoaded = false;
+ g_amec->gpu[i_gpu_num].status.driverLoaded = false;
+
+ // Reading memory temperature requires driver to be loaded.
+ g_amec->gpu[i_gpu_num].status.checkMemTempSupport = false;
+ g_amec->gpu[i_gpu_num].status.memTempSupported = false;
+ g_amec->gpu[i_gpu_num].status.memErrorCount = 0;
+
+ // Power capping requires driver to be loaded. Clear GPU power limits
+ g_amec->gpu[i_gpu_num].pcap.check_pwr_limit = false;
+ g_amec->gpu[i_gpu_num].pcap.pwr_limits_read = false;
+ g_amec->gpu[i_gpu_num].pcap.set_failed = false;
+ g_amec->gpu[i_gpu_num].pcap.gpu_min_pcap_mw = 0;
+ g_amec->gpu[i_gpu_num].pcap.gpu_max_pcap_mw = 0;
+ g_amec->gpu[i_gpu_num].pcap.gpu_requested_pcap_mw = 0;
+ g_amec->gpu[i_gpu_num].pcap.gpu_default_pcap_mw = 0;
+ //amec will need to recalculate after power limits are read to handle any clipping with new GPU min/max
+ g_amec->gpu[i_gpu_num].pcap.gpu_desired_pcap_mw = 0;
+}
+
+// Handles GPU not able to process request due to driver load or un-load
+void handle_driver_change(void)
+{
+ // Clear out driver status while driver change completes and is determined if loaded/un-loaded
+ clear_gpu_driver_status(G_current_gpu_id);
+
+ // memory temp only available when driver is loaded. clear error and set not available
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // when driver change is complete we must re-query to see if driver is loaded or not
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true;
+}
+
+// For all GPUs read GPU reset status and take action if reset status has changed
+void update_gpu_reset_status(void)
+{
+ uint8_t gpu_num = 0;
+
+ // GPU reset status is in the OCC FLAGS register and is updated by OPAL
+ // Read the current reset status for all GPUs. A reset status of '1' indicates NOT in reset
+ ocb_occflg_t occ_flags = {0};
+ occ_flags.value = in32(OCB_OCCFLG);
+ bool not_in_reset[3] = {occ_flags.fields.gpu0_reset_status,
+ occ_flags.fields.gpu1_reset_status,
+ occ_flags.fields.gpu2_reset_status};
+
+ // Reset status of '0' (IN reset) is the default.
+ // The OCC will still try to read a GPU when it is IN reset, but will not log errors;
+ // this is so we still communicate with the GPUs even without OPAL support for indicating
+ // that a GPU is not in reset.
+ // Full OCC support below is for when OPAL starts updating the reset status
+ for (gpu_num=0; gpu_num<MAX_NUM_GPU_PER_DOMAIN; gpu_num++)
+ {
+ if(not_in_reset[gpu_num] != g_amec->gpu[gpu_num].status.notReset)
+ {
+ INTR_TRAC_IMP("update_gpu_reset_status: GPU%d NOT in reset is now = %d",
+ gpu_num,
+ not_in_reset[gpu_num]);
+
+ // There has been a change to the reset status; clear everything out except for errors logged so we don't log again
+ clear_gpu_driver_status(gpu_num);
+ g_amec->gpu[gpu_num].status.errorCount = 0;
+ g_amec->gpu[gpu_num].status.retryCount = 0;
+
+ // readOnce of false will force comm to be established and once established then checkDriverLoaded will get set
+ g_amec->gpu[gpu_num].status.readOnce = false;
+
+ if(not_in_reset[gpu_num])
+ {
+ // GPU was taken out of reset clear disabled to allow communication again
+ g_amec->gpu[gpu_num].status.disabled = false;
+ }
+ else
+ {
+ // GPU was put in reset. Clear temperature sensor errors and set to not available
+ g_amec->gpu[gpu_num].status.coreTempFailure = false;
+ g_amec->gpu[gpu_num].status.coreTempNotAvailable = true;
+ g_amec->gpu[gpu_num].status.memTempFailure = false;
+ g_amec->gpu[gpu_num].status.memTempNotAvailable = true;
+ }
+
+ g_amec->gpu[gpu_num].status.notReset = not_in_reset[gpu_num];
+
+ } // if GPU reset status changed
+ } // for each GPU
+} // end update_gpu_reset_status()
+
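
The function above is essentially edge detection on the OCC FLAGS register: OPAL owns the per-GPU "not in reset" bits, the OCC polls them every pass, and per-GPU state is torn down only when a bit actually changes. A rough sketch of that flow, assuming a hypothetical read_not_in_reset_bits() in place of the in32(OCB_OCCFLG) bit-field read and a trimmed-down status struct:

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_GPUS 3

    typedef struct
    {
        bool    notReset;              // cached "GPU is NOT in reset" state
        bool    disabled;
        bool    coreTempNotAvailable;
        bool    memTempNotAvailable;
        uint8_t errorCount;
        uint8_t retryCount;
    } gpu_status_t;

    static gpu_status_t gpu_status[MAX_GPUS];

    // Hypothetical: returns one bool per GPU, true = NOT in reset
    // (in the real code these come from OCB_OCCFLG fields written by OPAL).
    extern void read_not_in_reset_bits(bool o_bits[MAX_GPUS]);

    static void poll_reset_status(void)
    {
        bool not_in_reset[MAX_GPUS];
        read_not_in_reset_bits(not_in_reset);

        for (uint8_t i = 0; i < MAX_GPUS; i++)
        {
            if (not_in_reset[i] == gpu_status[i].notReset)
                continue;                  // no edge: nothing to do

            // Status flipped: drop accumulated errors so old failures are
            // not counted against the new reset state.
            gpu_status[i].errorCount = 0;
            gpu_status[i].retryCount = 0;

            if (not_in_reset[i])
            {
                // Taken out of reset: allow communication again.
                gpu_status[i].disabled = false;
            }
            else
            {
                // Put into reset: temperatures are unavailable, not failed.
                gpu_status[i].coreTempNotAvailable = true;
                gpu_status[i].memTempNotAvailable  = true;
            }

            gpu_status[i].notReset = not_in_reset[i];
        }
    }

This is illustrative only; the real code additionally clears driver and power-cap state via clear_gpu_driver_status() and forces readOnce back to false so communication is re-established.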
// Disable GPU monitoring for all GPUs
void disable_all_gpus(void)
{
@@ -327,95 +497,103 @@ void gpu_ipc_init()
}
}
-// Called after a failure for a specified GPU. The error will
+// Called after a failure reading core temp for a specified GPU. The error will
// be counted and if threshold is reached, an error will be created with
-// the GPU as a callout and then set flag to force reset
+// the GPU as a callout if the GPU is not in reset
void mark_gpu_failed(const gpu_sm_args_t *i_arg)
{
uint32_t gpu_id = i_arg->gpu_id;
do
{
- // ignore all errors if haven't reached timeout for comm established
- if( (false == g_amec->gpu[gpu_id].status.readOnce) &&
- (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) )
- {
- // do nothing but reset at this time
- break;
- }
if((false == g_amec->gpu[gpu_id].status.disabled) &&
(true == g_amec->gpu[gpu_id].status.readOnce))
{
- INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
- "(ffdc 0x%08X%08X)",
- gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
- WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
+ GPU_DBG("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
+ "(ffdc 0x%08X%08X)",
+ gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
+ WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
}
- if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) &&
- (false == g_amec->gpu[gpu_id].status.disabled) &&
- (true == g_amec->gpu[gpu_id].status.readOnce))
+ // Always inc retry count for I2C reset regardless of if GPU is in reset or not
+ g_amec->gpu[gpu_id].status.retryCount++;
+
+ // Only inc error count if it is known that the GPU is NOT in reset
+ // NOTE: Default is IN reset, so this will only be true when OPAL/OS supports telling the OCC the reset status;
+ // if the OS never tells the OCC the reset status, the OCC will never disable the GPU or log a comm error
+ if(g_amec->gpu[gpu_id].status.notReset)
{
- G_gpu_state = GPU_STATE_IDLE;
-
- // Something has gone wrong and it may be that OPAL has put
- // the GPU into reset. For now, if this happens we will just
- // continue polling the GPU until it comes back.
- g_amec->gpu[gpu_id].status.readOnce = false;
- g_amec->gpu[gpu_id].status.checkDriverLoaded = true;
- g_amec->gpu[gpu_id].status.driverLoaded = false;
- g_amec->gpu[gpu_id].status.checkMemTempSupport = true;
- g_amec->gpu[gpu_id].status.memTempSupported = false;
- g_amec->gpu[gpu_id].status.memErrorCount = 0;
- g_amec->gpu[gpu_id].status.errorCount = 0;
-
-// This code can be used if an interlock with OPAL is ever introduced
-#if 0
- // Disable this GPU, collect FFDC and log error
- g_amec->gpu[gpu_id].status.disabled = true;
-
- INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
- gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
- errlHndl_t l_err = NULL;
- /*
- * @errortype
- * @moduleid GPU_MID_MARK_GPU_FAILED
- * @reasoncode GPU_FAILURE
- * @userdata1 GPE returned rc code
- * @userdata4 ERC_GPU_COMPLETE_FAILURE
- * @devdesc GPU failure
- */
- l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
- GPU_FAILURE,
- ERC_GPU_COMPLETE_FAILURE,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- i_arg->error.rc,
- 0);
- addUsrDtlsToErrl(l_err,
- (uint8_t*)&i_arg->error.ffdc,
- sizeof(i_arg->error.ffdc),
- ERRL_STRUCT_VERSION_1,
- ERRL_USR_DTL_BINARY_DATA);
-
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ // INC count and check if reached error threshold
+ if( ++g_amec->gpu[gpu_id].status.errorCount > GPU_INIT_ERROR_COUNT)
{
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_GPU_ID,
- G_sysConfigData.gpu_sensor_ids[gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ // set that GPU temperature readings failed
+ g_amec->gpu[gpu_id].status.memTempFailure = true;
+ g_amec->gpu[gpu_id].status.memTempNotAvailable = true;
+ g_amec->gpu[gpu_id].status.coreTempFailure = true;
+ g_amec->gpu[gpu_id].status.coreTempNotAvailable = true;
+
+ // Disable this GPU. GPU will get re-enabled if detected that GPU is put in reset and then taken out
+ g_amec->gpu[gpu_id].status.disabled = true;
+
+ INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
+ gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
+
+ if(g_amec->gpu[gpu_id].status.commErrorLogged == false)
+ {
+
+ errlHndl_t l_err = NULL;
+ /*
+ * @errortype
+ * @moduleid GPU_MID_MARK_GPU_FAILED
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPE returned rc code
+ * @userdata4 ERC_GPU_COMPLETE_FAILURE
+ * @devdesc GPU failure
+ */
+ l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
+ GPU_FAILURE,
+ ERC_GPU_COMPLETE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ i_arg->error.rc,
+ 0);
+ addUsrDtlsToErrl(l_err,
+ (uint8_t*)&i_arg->error.ffdc,
+ sizeof(i_arg->error.ffdc),
+ ERRL_STRUCT_VERSION_1,
+ ERRL_USR_DTL_BINARY_DATA);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ commitErrl(&l_err);
+ g_amec->gpu[gpu_id].status.commErrorLogged = true;
+
+ } // if !commErrorLogged
+
+ } // if errorCount > threshold
+
+ } // if notReset
- commitErrl(&l_err);
-#endif
- }
} while(0);
- // Reset GPU
- G_gpu_i2c_reset_required = true;
- G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF);
+ // Do an I2C reset if the retry count was reached.
+ // We don't want to do an I2C reset every time since this GPU may really be in reset, and
+ // while resetting I2C we are unable to read other GPUs that may not be in reset
+ if( g_amec->gpu[gpu_id].status.retryCount > GPU_ERRORS_BEFORE_I2C_RESET)
+ {
+ g_amec->gpu[gpu_id].status.retryCount = 0;
+ G_gpu_i2c_reset_required = true;
+ G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF);
+ }
+
} // end mark_gpu_failed()
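
The reworked failure handling splits two counters: retryCount always advances and only triggers an occasional I2C engine reset, while errorCount advances only when OPAL reports the GPU out of reset and eventually disables the GPU and logs a single error. A compact sketch of that split, using hypothetical request_i2c_reset() / log_gpu_error() hooks in place of the real G_gpu_i2c_reset_required flag and createErrl()/commitErrl() plumbing:

    #include <stdbool.h>
    #include <stdint.h>

    #define ERRORS_BEFORE_I2C_RESET 5     // stand-in for GPU_ERRORS_BEFORE_I2C_RESET
    #define INIT_ERROR_COUNT        300   // stand-in for GPU_INIT_ERROR_COUNT

    typedef struct
    {
        bool     notReset;          // true once OPAL reports the GPU out of reset
        bool     disabled;
        bool     commErrorLogged;   // ensures only one error log per incident
        uint16_t errorCount;
        uint16_t retryCount;
    } gpu_status_t;

    // Hypothetical hooks standing in for the reset flag and error-log code.
    extern void request_i2c_reset(uint8_t gpu_id);
    extern void log_gpu_error(uint8_t gpu_id);

    static void mark_failed(gpu_status_t *s, uint8_t gpu_id)
    {
        // Retry counter always advances; an I2C reset is only requested
        // periodically so other (non-reset) GPUs stay readable in between.
        if (++s->retryCount > ERRORS_BEFORE_I2C_RESET)
        {
            s->retryCount = 0;
            request_i2c_reset(gpu_id);
        }

        // Error counter only advances when the GPU is known to be out of
        // reset; without that OS/OPAL indication the GPU is never disabled
        // or called out.
        if (s->notReset && (++s->errorCount > INIT_ERROR_COUNT))
        {
            s->disabled = true;
            if (!s->commErrorLogged)
            {
                log_gpu_error(gpu_id);     // one predictive error per incident
                s->commErrorLogged = true;
            }
        }
    }
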
// Schedule a GPE request for GPU operation
@@ -461,6 +639,27 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
case GPU_REQ_READ_MEM_TEMP_FINISH:
break;
+ // Check if driver is loaded
+ case GPU_REQ_CHECK_DRIVER_START:
+ case GPU_REQ_CHECK_DRIVER_2:
+ case GPU_REQ_CHECK_DRIVER_3:
+ case GPU_REQ_CHECK_DRIVER_FINISH:
+ break;
+
+ // Read GPU Power Limit
+ case GPU_REQ_READ_PWR_LIMIT_START:
+ case GPU_REQ_READ_PWR_LIMIT_2:
+ case GPU_REQ_READ_PWR_LIMIT_3:
+ case GPU_REQ_READ_PWR_LIMIT_FINISH:
+ break;
+
+ // Set GPU Power Limit
+ case GPU_REQ_SET_PWR_LIMIT_START:
+ case GPU_REQ_SET_PWR_LIMIT_2:
+ case GPU_REQ_SET_PWR_LIMIT_3:
+ case GPU_REQ_SET_PWR_LIMIT_FINISH:
+ break;
+
// I2C reset
case GPU_REQ_RESET:
break;
@@ -498,6 +697,7 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
{
// Clear errors and init common arguments for GPE
G_gpu_op_req_args.error.error = 0;
+ G_gpu_op_req_args.gpu_rc = 0;
G_gpu_op_req_args.operation = i_operation;
G_gpu_op_req_args.gpu_id = G_current_gpu_id;
@@ -574,14 +774,14 @@ bool gpu_reset_sm()
/*
* @errortype
* @moduleid GPU_MID_GPU_RESET_SM
- * @reasoncode GPU_FAILURE
+ * @reasoncode GPU_NO_GPE_SUPPORT
* @userdata1 0
* @userdata2 0
* @userdata4 ERC_GPU_NO_GPE_SUPPORT
* @devdesc GPE1 image doesn't support GPU communication
*/
errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
- GPU_FAILURE,
+ GPU_NO_GPE_SUPPORT,
ERC_GPU_NO_GPE_SUPPORT,
ERRL_SEV_UNRECOVERABLE,
NULL,
@@ -606,9 +806,7 @@ bool gpu_reset_sm()
else // this reset attempt failed
{
// Stop trying if reached max resets
- if( (L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS) &&
- (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) >=
- GPU_COMM_ESTAB_TIMEOUT_SECONDS))
+ if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS)
{
INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X",
L_reset_state);
@@ -662,12 +860,12 @@ bool gpu_reset_sm()
break;
case GPU_RESET_STATE_RESET_MASTER:
- G_new_gpu_req_args.data = GPU_RESET_REQ_MASTER;
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER;
L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
break;
case GPU_RESET_STATE_RESET_SLAVE:
- G_new_gpu_req_args.data = GPU_RESET_REQ_SLV;
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV;
L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
break;
@@ -677,7 +875,7 @@ bool gpu_reset_sm()
break;
case GPU_RESET_STATE_RESET_SLAVE_COMPLETE:
- G_new_gpu_req_args.data = GPU_RESET_REQ_SLV_COMPLETE;
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE;
L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
break;
@@ -718,6 +916,555 @@ bool gpu_reset_sm()
// Function Specification
//
+// Name: gpu_check_driver_loaded_sm
+//
+// Description: Called from gpu_task_sm to check if driver is loaded for G_current_gpu_id
+// This function should only return that complete is TRUE when the check
+// is complete (or determined failed) and ready for a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to check
+//
+// End Function Specification
+bool gpu_check_driver_loaded_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ bool l_new_driver_loaded = FALSE;
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_check_driver_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint8_t L_state_failure_count = 0;
+ static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_check_driver_state != GPU_STATE_CHECK_DRIVER_LOADED_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
+ {
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_check_driver_failure_count[G_current_gpu_id] = 0; // clear driver failure count since there's a driver change
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached state retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
+ {
+ if(++L_check_driver_failure_count[G_current_gpu_id] > GPU_CHECK_DRIVER_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_check_driver_loaded: Failed to check driver loaded for GPU%d RC: 0x%02X",
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
+
+ // give up checking driver loaded for this GPU
+ // It will be retried if detected that GPU is put in reset and then taken out
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false;
+ L_check_driver_failure_count[G_current_gpu_id] = 0;
+
+ // without driver loaded cannot read memory temp, mark memory temp as failed
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // log one time error that driver loaded couldn't be determined
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_CHECK_DRIVER_LOADED
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_CHECK_DRIVER_LOADED_FAILURE
+ * @devdesc Failure to check GPU driver loaded
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_CHECK_DRIVER_LOADED,
+ GPU_FAILURE,
+ ERC_GPU_CHECK_DRIVER_LOADED_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
+
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ L_state_failure_count = 0;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ } // if reached state retry count
+ else
+ {
+ // INC failure count and retry current state
+ L_state_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_state_failure_count = 0;
+ L_check_driver_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_check_driver_state)
+ {
+ case GPU_STATE_CHECK_DRIVER_LOADED_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_3, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_FINISH, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE:
+ // Update driver loaded
+ l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01;
+ if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded)
+ {
+ // Driver loaded status changed
+ INTR_TRAC_IMP("gpu_check_driver_loaded: GPU%d driver loaded changed to %d",
+ G_current_gpu_id,
+ l_new_driver_loaded);
+
+ if(l_new_driver_loaded)
+ {
+ // Driver is now loaded do checking that required driver to be loaded
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = true;
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
+ // done checking for driver to be loaded
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false;
+ }
+ else
+ {
+ // Driver is no longer loaded
+ clear_gpu_driver_status(G_current_gpu_id);
+
+ // memory temp only available when driver is loaded
+ // clear error and set not available
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // Need to keep query for driver loaded to detect when driver is loaded
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true;
+ }
+
+ g_amec->gpu[G_current_gpu_id].status.driverLoaded = l_new_driver_loaded;
+ }
+
+ // Done with this GPU ready to move to new one
+ L_check_driver_failure_count[G_current_gpu_id] = 0;
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_check_driver_loaded: INVALID STATE: 0x%02X", L_check_driver_state);
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_check_driver_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_check_driver_loaded: Scheduled check driver loaded state 0x%02X at tick %d",
+ L_check_driver_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_check_driver_loaded: failed to schedule state 0x%02X", L_check_driver_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_check_driver_loaded: NOT idle for state 0x%02X", L_check_driver_state);
+ }
+
+ return l_complete;
+} // end gpu_check_driver_loaded_sm()
+
+// Function Specification
+//
+// Name: gpu_read_pwr_limit_sm
+//
+// Description: Called from gpu_task_sm to read GPU power limits for G_current_gpu_id
+// This function should only return that complete is TRUE when the read
+// is complete (or determined failed) and ready for a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read
+//
+// End Function Specification
+bool gpu_read_pwr_limit_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_read_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint8_t L_state_failure_count = 0;
+ static gpuReadPwrLimitState_e L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
+ {
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
+ {
+ if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X",
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
+
+ // give up trying to read power limits for this GPU
+ // It will be retried if detected that GPU is put in reset and then taken out
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false;
+ L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;
+
+ // log one time error that power limits could not be read
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_PWR_LIMIT
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_READ_PWR_LIMIT_FAILURE
+ * @devdesc Failure to read GPU power limits
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT,
+ GPU_FAILURE,
+ ERC_GPU_READ_PWR_LIMIT_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
+
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ L_state_failure_count = 0;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ } // if reached retry count
+ else
+ {
+ // INC failure count and retry current state
+ L_state_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_state_failure_count = 0;
+ L_read_pwr_limit_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_read_pwr_limit_state)
+ {
+ case GPU_STATE_READ_PWR_LIMIT_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_3, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_FINISH, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_COMPLETE:
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
+ // Update power limits
+ g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE;
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0];
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw = (uint32_t) G_gpu_op_req_args.data[1];
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw = (uint32_t) G_gpu_op_req_args.data[2];
+
+ // Done with this GPU ready to move to new one
+ L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state);
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_read_pwr_limit_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_read_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d",
+ L_read_pwr_limit_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_read_pwr_limit: NOT idle for state 0x%02X", L_read_pwr_limit_state);
+ }
+
+ return l_complete;
+} // end gpu_read_pwr_limit_sm()
+
+// Function Specification
+//
+// Name: gpu_set_pwr_limit_sm
+//
+// Description: Called from gpu_task_sm to set GPU power limit for G_current_gpu_id
+// This function should only return that complete is TRUE when the set
+// is complete (or determined failed) and ready for a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for the GPU to set the limit on
+//
+// End Function Specification
+bool gpu_set_pwr_limit_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when complete or failed
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_state_failure_count = 0;
+ static uint8_t L_set_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static gpuSetPwrLimitState_e L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new set limit then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
+ {
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
+ {
+ if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X",
+ G_gpu_op_req_args.data[0],
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
+
+ // give up trying to set power limit for this GPU
+ // It will be retried if detected that GPU is put in reset and then taken out or driver change
+ g_amec->gpu[G_current_gpu_id].pcap.set_failed = true;
+ L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
+
+ // log error that power limit could not be set
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_SET_PWR_LIMIT
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_SET_PWR_LIMIT_FAILURE
+ * @devdesc Failure to set GPU power limit
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT,
+ GPU_FAILURE,
+ ERC_GPU_SET_PWR_LIMIT_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
+
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ L_state_failure_count = 0;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ } // if reached retry count
+ else
+ {
+ // INC failure count and retry current state
+ L_state_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_state_failure_count = 0;
+ L_set_pwr_limit_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_set_pwr_limit_state)
+ {
+ case GPU_STATE_SET_PWR_LIMIT_START:
+ // send the desired GPU power cap to the GPE to send to GPU
+ G_new_gpu_req_args.data[0] = g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw;
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_FINISH, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_COMPLETE:
+ // Update the requested power limit since it was successfully sent
+ // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC
+ // has calculated a new desired pcap while this one was already in the process of being set
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0];
+
+ // Done with this GPU ready to move to new one
+ L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state);
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_set_pwr_limit_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_set_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d",
+ L_set_pwr_limit_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state);
+ }
+
+ return l_complete;
+} // end gpu_set_pwr_limit_sm()
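
All three new state machines above (check driver loaded, read power limit, set power limit) share one skeleton: do nothing unless the GPE request is idle, evaluate the previous state's result before advancing, treat GPE_RC_GPU_DRIVER_CHANGE as "done with this GPU", retry a failing state up to MAX_GPU_READ_ATTEMPT times, and only count the failure against the GPU (eventually logging once) when the GPU is known to be out of reset. A condensed, hypothetical skeleton of that pattern, with simplified enums and stand-in helpers in place of the real GPE request plumbing:

    #include <stdbool.h>
    #include <stdint.h>

    #define MAX_GPUS          3
    #define MAX_STATE_RETRIES 3    // stand-in for MAX_GPU_READ_ATTEMPT
    #define MAX_GPU_ERRORS    5    // stand-in for the GPU_*_ERROR_COUNT thresholds

    typedef enum { STEP_NEW, STEP_START, STEP_MIDDLE, STEP_FINISH, STEP_COMPLETE } step_t;
    typedef enum { RC_SUCCESS, RC_DRIVER_CHANGE, RC_FAIL } rc_t;

    // Hypothetical stand-ins for async_request_is_idle(), the GPE result and
    // schedule_gpu_req(); the real code also tracks whether the schedule worked.
    extern bool gpe_idle(void);
    extern rc_t last_result(void);
    extern void schedule_step(step_t step, uint8_t gpu_id);
    extern bool gpu_not_in_reset(uint8_t gpu_id);
    extern void give_up_on_gpu(uint8_t gpu_id);   // clear the "check" flag, log once

    // Returns true when this GPU is finished (result harvested or gave up),
    // telling the caller it may move on to another GPU.
    static bool run_sm(uint8_t gpu_id)
    {
        static step_t  step = STEP_NEW;
        static uint8_t state_retries = 0;
        static uint8_t gpu_errors[MAX_GPUS] = {0};

        if (!gpe_idle())
            return false;                          // previous request still running

        if ((step != STEP_NEW) && (last_result() != RC_SUCCESS))
        {
            bool driver_change = (last_result() == RC_DRIVER_CHANGE);

            if (driver_change || (state_retries > MAX_STATE_RETRIES))
            {
                // Errors only count against the GPU when it is known to be out
                // of reset; a driver change is not treated as an error at all.
                if (!driver_change && gpu_not_in_reset(gpu_id) &&
                    (++gpu_errors[gpu_id] > MAX_GPU_ERRORS))
                {
                    give_up_on_gpu(gpu_id);
                    gpu_errors[gpu_id] = 0;
                }
                step = STEP_NEW;
                state_retries = 0;
                return true;                       // done with this GPU for now
            }
            state_retries++;                       // retry the same step below
        }
        else
        {
            state_retries = 0;
            step++;                                // previous step succeeded: advance
        }

        if (step == STEP_COMPLETE)
        {
            // The real SMs harvest the result here (driver flag, limits, ...).
            gpu_errors[gpu_id] = 0;
            step = STEP_NEW;
            return true;
        }

        schedule_step(step, gpu_id);               // kick off (or retry) this step
        return false;
    }
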
+
+// Function Specification
+//
// Name: gpu_read_temp_sm
//
// Description: Called from gpu_task_sm to read GPU core temperature of G_current_gpu_id
@@ -750,6 +1497,7 @@ bool gpu_read_temp_sm()
mark_gpu_failed(&G_gpu_op_req_args);
L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ L_read_failure_count = 0;
return TRUE; // Done with this GPU, let GPU SM move to next
}
else
@@ -778,7 +1526,7 @@ bool gpu_read_temp_sm()
case GPU_STATE_READ_TEMP_COMPLETE:
if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
- (0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp?
+ (0 != G_gpu_op_req_args.data[0]) )
{
g_amec->gpu[G_current_gpu_id].status.readOnce = true;
@@ -791,15 +1539,17 @@ bool gpu_read_temp_sm()
}
// comm is now established update for capability checking to take place
- g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
}
// Update sensor
- l_temp = G_gpu_op_req_args.data;
+ l_temp = G_gpu_op_req_args.data[0];
sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
// Clear all past errors
+ g_amec->gpu[G_current_gpu_id].status.coreTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.coreTempNotAvailable = false;
g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
+ g_amec->gpu[G_current_gpu_id].status.retryCount = 0;
// check if there is an overtemp that hasn't been reported
if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
@@ -897,8 +1647,10 @@ bool gpu_read_mem_temp_capability_sm()
{
bool l_complete = FALSE; // only return TRUE when the read is complete or failed
static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
- static uint8_t L_read_failure_count = 0;
+ static uint8_t L_read_mem_cap_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint8_t L_state_failure_count = 0;
static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
if (async_request_is_idle(&G_gpu_op_request.request))
{
@@ -907,57 +1659,91 @@ bool gpu_read_mem_temp_capability_sm()
if( (L_read_cap_state != GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW) &&
(!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
{
- // If reached retry count give up on this read
- if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
- // log error that memory temp capability couldn't be determined
- // memory temp support will be left as not supported
- INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d", G_current_gpu_id);
-
- // Log error
- /* @
- * @errortype
- * @moduleid GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
- * @reasoncode GPU_FAILURE
- * @userdata1 GPU ID
- * @userdata2 0
- * @userdata4 ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
- * @devdesc Failure to read GPU memory temp capability
- *
- */
- errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
- GPU_FAILURE,
- ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- G_current_gpu_id,
- 0);
-
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_read_mem_cap_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+ L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+ // If reached state retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
{
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_GPU_ID,
- G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ if(++L_read_mem_cap_failure_count[G_current_gpu_id] > GPU_READ_MEM_CAP_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d RC: 0x%02X",
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
- // Commit Error
- commitErrl(&l_err);
+ // give up trying to read mem temp capability for this GPU
+ // It will be retried if detected that GPU driver is re-loaded
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+ L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
+
+ // cannot determine memory temp capability, mark memory temp as failed
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // log one time error that memory temp capability couldn't be determined
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
+ * @devdesc Failure to read memory temp capability
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
+ GPU_FAILURE,
+ ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ L_state_failure_count = 0;
return TRUE; // Done with this GPU, let GPU SM move to next
- }
+ } // if reached state retry count
else
{
// INC failure count and retry current state
- L_read_failure_count++;
+ L_state_failure_count++;
}
}
else // success on last state go to next state and process it
{
- L_read_failure_count = 0;
+ L_state_failure_count = 0;
L_read_cap_state++;
}
@@ -983,8 +1769,21 @@ bool gpu_read_mem_temp_capability_sm()
case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
// Update capability
- g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data & 0x01;
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
+
+ if(g_amec->gpu[G_current_gpu_id].status.memTempSupported)
+ {
+ // mem temp is supported no need to re-check capability
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+ }
+ else
+ {
+ // Need to keep query for mem temp capability to detect if ever changes to capable
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
+ }
+
// Done with this GPU ready to move to new one
+ L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
l_complete = TRUE;
break;
@@ -1041,60 +1840,95 @@ bool gpu_read_memory_temp_sm()
if( (L_read_temp_state != GPU_STATE_READ_MEM_TEMP_NEW) &&
(!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
{
- // If reached retry count give up on this read
- if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
- // INC memory error count and check if reached timeout threshold for new mem temp
- uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
- g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
- if((max_read_timeout) && (max_read_timeout != 0xFF) &&
- (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_read_failure_count = 0;
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
+ L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached retry count or GPU indicated cmd not supported then give up on this read
+ else if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
+ (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
+ {
+ // if GPU is not in reset or the GPU responded with command not supported then
+ // INC memory error count and check if reached timeout for new mem temp
+ if( (g_amec->gpu[G_current_gpu_id].status.notReset) ||
+ (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
{
- // Disable memory temp reading for this GPU and log error
- g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
- // so BMC knows there is an error for fan control set sensor to 0xFF
- sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), 0xFFFF);
-
- INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
- G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
-
- // Log error
- /* @
- * @errortype
- * @moduleid GPU_MID_GPU_READ_MEM_TEMP
- * @reasoncode GPU_FAILURE
- * @userdata1 GPU ID
- * @userdata2 number consecutive read mem temp failures
- * @userdata4 ERC_GPU_READ_MEM_TEMP_TIMEOUT
- * @devdesc Timeout reading new GPU memory temperature
- *
- */
- errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
- GPU_FAILURE,
- ERC_GPU_READ_MEM_TEMP_TIMEOUT,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- G_current_gpu_id,
- g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
- {
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_GPU_ID,
- G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
+ if((max_read_timeout) && (max_read_timeout != 0xFF) &&
+ (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+ {
+ // Disable memory temp reading for this GPU and log error
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
+ // set failure status so the BMC knows there is an error for fan control
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
- // Commit Error
- commitErrl(&l_err);
+ INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
+ G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
- } // if timeout error
+ if(g_amec->gpu[G_current_gpu_id].status.commErrorLogged == false)
+ {
+ INTR_TRAC_ERR("notReset: %d rc: 0x%0X", g_amec->gpu[G_current_gpu_id].status.notReset,
+ G_gpu_op_req_args.error.rc);
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 GPU RC
+ * @userdata4 ERC_GPU_READ_MEM_TEMP_TIMEOUT
+ * @devdesc Timeout reading new GPU memory temperature
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
+ GPU_FAILURE,
+ ERC_GPU_READ_MEM_TEMP_TIMEOUT,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_gpu_op_req_args.gpu_rc,
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ g_amec->gpu[G_current_gpu_id].status.commErrorLogged = true;
+ } // if !commErrorLogged
+ } // if timeout error
+ else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED)
+ {
+ // GPU indicated command not supported, re-check mem temp capability
+ // if we try to read mem temp again that means mem temp was reported capable
+ // and if this continues to fail eventually an error will be logged above at timeout
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = false;
+ }
+ } // if notReset or command not supported
+ // setup to start new request
+ L_read_failure_count = 0;
L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
return TRUE; // Done with this GPU, let GPU SM move to next
- }
+ } // else if failure count exceeded or command not supported
else
{
// INC failure count and retry current state
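
The memory-temperature failure path above has two distinct outcomes: consecutive failures (counted only when the GPU is out of reset, or when the GPU explicitly answers "command not supported") eventually hit the max_read_timeout from the thermal-threshold config and mark the reading failed, while a "command not supported" answer by itself just forces the capability query to run again. A small sketch of that decision, with hypothetical rc codes and a trimmed status struct:

    #include <stdbool.h>
    #include <stdint.h>

    typedef enum { RC_OK, RC_CMD_NOT_SUPPORTED, RC_OTHER } gpu_rc_t;

    typedef struct
    {
        bool    notReset;
        bool    memTempSupported;
        bool    memTempFailure;
        bool    memTempNotAvailable;
        bool    checkMemTempSupport;
        uint8_t memErrorCount;
    } gpu_status_t;

    // Hypothetical hook standing in for the one-time createErrl()/commitErrl() call.
    extern void log_mem_temp_timeout(uint8_t gpu_id);

    // max_read_timeout comes from the DATA_FRU_GPU_MEM thermal threshold config;
    // 0 or 0xFF means "never time out".
    static void handle_mem_temp_failure(gpu_status_t *s, uint8_t gpu_id,
                                        gpu_rc_t rc, uint8_t max_read_timeout)
    {
        // Only count failures that the GPU could reasonably have answered:
        // either it is out of reset, or it answered "command not supported".
        if (!s->notReset && (rc != RC_CMD_NOT_SUPPORTED))
            return;

        s->memErrorCount++;

        if (max_read_timeout && (max_read_timeout != 0xFF) &&
            (s->memErrorCount >= max_read_timeout))
        {
            // Too long without a fresh reading: stop reading and flag the failure.
            s->memTempSupported    = false;
            s->memTempFailure      = true;
            s->memTempNotAvailable = true;
            log_mem_temp_timeout(gpu_id);
        }
        else if (rc == RC_CMD_NOT_SUPPORTED)
        {
            // GPU says it can't do this: re-query capability before trying again.
            s->checkMemTempSupport = true;
            s->memTempSupported    = false;
        }
    }
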
@@ -1115,20 +1949,26 @@ bool gpu_read_memory_temp_sm()
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
break;
- case GPU_STATE_READ_MEM_TEMP_STOP:
+ case GPU_STATE_READ_MEM_TEMP_2:
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_2, G_new_gpu_req_args);
break;
- case GPU_STATE_READ_MEM_TEMP_READ:
+ case GPU_STATE_READ_MEM_TEMP_3:
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_3, G_new_gpu_req_args);
break;
+ case GPU_STATE_READ_MEM_TEMP_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_FINISH, G_new_gpu_req_args);
+ break;
+
case GPU_STATE_READ_MEM_TEMP_COMPLETE:
// Update sensor
- l_temp = G_gpu_op_req_args.data;
+ l_temp = G_gpu_op_req_args.data[0];
sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
// Clear past errors
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = false;
g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
// check if there is an overtemp that hasn't been reported
@@ -1253,9 +2093,30 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
break;
}
-//TODO: Enable when functional
-#if 0
- // 3. Need to check if driver is loaded?
+ // 3. Time to start new temperature reads?
+ if(i_read_temp_start_needed)
+ {
+ // Start reading core temp from first present and functional GPU
+ l_gpu_id = get_first_gpu();
+ if(l_gpu_id != 0xFF)
+ {
+ // Read core temp for this GPU
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_READ_TEMP;
+ l_new_state = TRUE;
+ break;
+ }
+ else // no functional GPUs
+ {
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ G_gpu_monitoring_allowed = FALSE;
+ l_new_state = FALSE; // No new state for GPU communication
+ break;
+ }
+ }
+ // 4. Need to check if driver is loaded?
l_gpu_id = gpu_id_need_driver_check();
if(l_gpu_id != 0xFF)
{
@@ -1266,7 +2127,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
break;
}
- // 4. Need to read power limits?
+//TODO: Enable when functional
+#if 0
+ // 5. Need to read power limits?
l_gpu_id = gpu_id_need_power_limits();
if(l_gpu_id != 0xFF)
{
@@ -1276,8 +2139,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
l_new_state = TRUE;
break;
}
+#endif
- // 5. Need to read memory temps?
+ // 6. Need to read memory temps?
if(i_mem_temp_needed)
{
// first check if there is a GPU that needs memory temp capability checked
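
With this change the idle handler prioritizes starting the 1-second core-temperature pass (step 3) ahead of the driver check (step 4), the power-limit work (step 5, still compiled out), and the memory-temperature work (step 6). A simplified sketch of that dispatch order, with hypothetical selector helpers standing in for get_first_gpu(), gpu_id_need_driver_check() and the rest (the steps before step 3 are outside this diff and omitted):

    #include <stdbool.h>
    #include <stdint.h>

    #define NO_GPU 0xFF

    typedef enum { ST_IDLE, ST_READ_TEMP, ST_CHECK_DRIVER,
                   ST_READ_MEM_CAP, ST_READ_MEM_TEMP } gpu_state_t;

    // Hypothetical selectors mirroring get_first_gpu(), gpu_id_need_driver_check(),
    // gpu_id_need_memory_temp_capability_check(), get_first_mem_temp_capable_gpu().
    extern uint8_t first_gpu(void);
    extern uint8_t gpu_needing_driver_check(void);
    extern uint8_t gpu_needing_mem_cap_check(void);
    extern uint8_t first_mem_temp_capable_gpu(void);

    static gpu_state_t pick_next_work(bool temp_read_due, bool mem_temp_due,
                                      uint8_t *o_gpu)
    {
        uint8_t id;

        // 3. Core-temperature pass comes first once its 1s timer expires.
        if (temp_read_due && ((id = first_gpu()) != NO_GPU))
        {
            *o_gpu = id;
            return ST_READ_TEMP;
        }

        // 4. Then see whether any GPU needs its driver-loaded query.
        if ((id = gpu_needing_driver_check()) != NO_GPU)
        {
            *o_gpu = id;
            return ST_CHECK_DRIVER;
        }

        // (5. power-limit read/set would slot in here; compiled out in this commit)

        // 6. Finally, memory-temp capability checks, then the reads themselves.
        if (mem_temp_due)
        {
            if ((id = gpu_needing_mem_cap_check()) != NO_GPU)
            {
                *o_gpu = id;
                return ST_READ_MEM_CAP;
            }
            if ((id = first_mem_temp_capable_gpu()) != NO_GPU)
            {
                *o_gpu = id;
                return ST_READ_MEM_TEMP;
            }
        }

        return ST_IDLE;    // nothing to do: stay idle
    }

This sketch omits the "no functional GPUs" branch, where the real code releases the I2C lock to the host and stops monitoring.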
@@ -1304,31 +2168,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
}
}
}
-#endif
-
- // 6. Time to start new temperature reads?
- if(i_read_temp_start_needed)
- {
- // Start reading core temp from first present and functional GPU
- l_gpu_id = get_first_gpu();
- if(l_gpu_id != 0xFF)
- {
- // Read core temp for this GPU
- G_current_gpu_id = l_gpu_id;
- G_gpu_state = GPU_STATE_READ_TEMP;
- l_new_state = TRUE;
- break;
- }
- else // no functional GPUs
- {
- // release I2C lock to the host for this engine and stop monitoring
- occ_i2c_lock_release(GPU_I2C_ENGINE);
- G_gpu_state = GPU_STATE_NO_LOCK;
- G_gpu_monitoring_allowed = FALSE;
- l_new_state = FALSE; // No new state for GPU communication
- break;
- }
- }
// Else nothing stay idle
}while(0);
@@ -1361,12 +2200,13 @@ void task_gpu_sm(struct task *i_self)
// are functional or GPU I2C interface is broken
if(G_gpu_monitoring_allowed)
{
+ // Read and update reset status for all GPUs
+ update_gpu_reset_status();
// Initialize the IPC commands if this is our first run
if(L_gpu_first_run)
{
gpu_ipc_init();
- G_gpu_sm_start_time = ssx_timebase_get(); // used for timeout establishing comm
L_gpu_first_run = FALSE;
}
@@ -1379,6 +2219,7 @@ void task_gpu_sm(struct task *i_self)
if(L_numCallsForTempRead >= GPU_TEMP_READ_1S)
{
L_read_temp_start_needed = TRUE;
+ L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
}
}
@@ -1430,6 +2271,7 @@ void task_gpu_sm(struct task *i_self)
// Start first with reading core temp of first functional GPU
L_numCallsForTempRead = 0; // to track start of next temp reading in 1s
L_read_temp_start_needed = FALSE; // start is no longer needed
+ L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
l_gpu_id = get_first_gpu();
if(l_gpu_id != 0xFF)
{
@@ -1506,7 +2348,6 @@ void task_gpu_sm(struct task *i_self)
{
// Capability check complete for this GPU, go to IDLE state
// to let IDLE SM decide what to do next
- g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
G_gpu_state = GPU_STATE_IDLE;
l_start_next_state = TRUE;
}
@@ -1514,17 +2355,14 @@ void task_gpu_sm(struct task *i_self)
case GPU_STATE_CHECK_DRIVER_LOADED:
// Check if driver is loaded for current GPU
- if(1) // TODO
+ if(gpu_check_driver_loaded_sm())
{
- // Driver check complete for this GPU, go to IDLE state
- // to let IDLE SM decide what to do next
- g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE;
- g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE;
- if(g_amec->gpu[G_current_gpu_id].status.driverLoaded)
- {
- // Driver is loaded, read the power limits so we can start GPU power capping
- g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = TRUE;
- }
+ // Driver check complete for this GPU,
+ // NOTE: Do not set status.checkDriverLoaded to false here, if driver is
+ // not loaded we need to keep checking for driver to be loaded this is decided
+ // inside gpu_check_driver_loaded_sm()
+
+ // go to IDLE state to let IDLE SM decide what to do next
G_gpu_state = GPU_STATE_IDLE;
l_start_next_state = TRUE;
}
@@ -1532,11 +2370,10 @@ void task_gpu_sm(struct task *i_self)
case GPU_STATE_READ_PWR_LIMIT:
// Read power limits for current GPU
- if(1) // TODO read and set min/max GPU limit and set pwr_limits_read to TRUE if capping supported
+ if(gpu_read_pwr_limit_sm())
{
// Read power limits complete for this GPU, go to IDLE state
// to let IDLE SM decide what to do next
- g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
G_gpu_state = GPU_STATE_IDLE;
l_start_next_state = TRUE;
}
@@ -1544,7 +2381,7 @@ void task_gpu_sm(struct task *i_self)
case GPU_STATE_SET_PWR_LIMIT:
// Set power limit on current GPU
- if(1) // TODO
+ if(gpu_set_pwr_limit_sm())
{
// Set power limit complete for this GPU, go to IDLE state
// to let IDLE SM decide what to do next
@@ -1596,6 +2433,7 @@ void task_gpu_sm(struct task *i_self)
// new state to read core temp reset temperature reading timer
L_numCallsForTempRead = 0;
L_read_temp_start_needed = FALSE; // start no longer needed
+ L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
}
else if(G_gpu_state == GPU_STATE_READ_MEMORY_TEMP)
{
@@ -1606,6 +2444,4 @@ void task_gpu_sm(struct task *i_self)
}
}while((l_start_next_state) && (!l_next_state));
} // GPU monitoring enabled
-
-
} // end task_gpu_sm()