path: root/src/occ_405/gpu
author      William Bryan <wilbryan@us.ibm.com>      2017-08-22 11:50:01 -0500
committer   William A. Bryan <wilbryan@us.ibm.com>   2017-08-25 12:02:53 -0400
commit      3f57751abd8ca0308e3938dc86d5a313b7599ebc (patch)
tree        eb5b20878c292a0e1773a049247e25fc3ac77f09 /src/occ_405/gpu
parent      8e2f38a98ba958b3fe520836cb1f509fa0432030 (diff)
download    talos-occ-3f57751abd8ca0308e3938dc86d5a313b7599ebc.tar.gz
            talos-occ-3f57751abd8ca0308e3938dc86d5a313b7599ebc.zip
405 Side GPU Core Temp Collection
Change-Id: Ia1b10f5208c49ba168dcf338f0cbeb2c4ab46971
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44982
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Diffstat (limited to 'src/occ_405/gpu')
-rwxr-xr-x  src/occ_405/gpu/gpu.c  647
-rw-r--r--  src/occ_405/gpu/gpu.h   24
2 files changed, 341 insertions, 330 deletions
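
Note on the restructuring below: the reworked mark_gpu_failed() wraps its checks in a do { ... } while(0) block so that an early exit (the break taken before the comm-established timeout) still falls through to the GPU I2C reset request at the bottom of the function. A minimal sketch of that idiom only, with illustrative names and limits rather than the real OCC ones:

    /* do/while(0) early-exit sketch: every path, including the early
     * break, reaches the cleanup/reset step after the loop. */
    static void mark_failed_sketch(int comm_established, int error_count)
    {
        do
        {
            if (!comm_established)
            {
                /* not counted as an error yet; skip straight to the reset */
                break;
            }
            if (error_count > 3 /* illustrative retry limit */)
            {
                /* disable the device, build and commit an error log ... */
            }
        } while (0);

        /* always executed: request the I2C/GPU reset here */
    }
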
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 2fd2b82..522ea84 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -23,7 +23,6 @@
/* */
/* IBM_PROLOG_END_TAG */
-//#define GPU_DEBUG
#ifdef GPU_DEBUG
#define GPU_DBG(frmt,args...) DBG_PRINT(frmt,##args)
#else
@@ -335,67 +334,69 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg)
{
uint32_t gpu_id = i_arg->gpu_id;
- // ignore all errors if haven't reached timeout for comm established
- if( (false == g_amec->gpu[gpu_id].status.readOnce) &&
- (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) )
- {
- // do nothing at this time
- return;
- }
- if((false == g_amec->gpu[gpu_id].status.disabled) &&
- (true == g_amec->gpu[gpu_id].status.readOnce))
- {
- INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
- "(ffdc 0x%08X%08X)",
- gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
- WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
- }
-
- if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) &&
- (false == g_amec->gpu[gpu_id].status.disabled) &&
- (true == g_amec->gpu[gpu_id].status.readOnce))
+ do
{
- G_gpu_state = GPU_STATE_IDLE;
- // Disable this GPU, collect FFDC and log error
- g_amec->gpu[gpu_id].status.disabled = true;
-
- INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
- gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
- errlHndl_t l_err = NULL;
- /*
- * @errortype
- * @moduleid GPU_MID_MARK_GPU_FAILED
- * @reasoncode GPU_FAILURE
- * @userdata1 GPE returned rc code
- * @userdata4 ERC_GPU_COMPLETE_FAILURE
- * @devdesc GPU failure
- */
- l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
- GPU_FAILURE,
- ERC_GPU_COMPLETE_FAILURE,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- i_arg->error.rc,
- 0);
-
- addUsrDtlsToErrl(l_err,
- (uint8_t*)&i_arg->error.ffdc,
- sizeof(i_arg->error.ffdc),
- ERRL_STRUCT_VERSION_1,
- ERRL_USR_DTL_BINARY_DATA);
-
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ // ignore all errors if haven't reached timeout for comm established
+ if( (false == g_amec->gpu[gpu_id].status.readOnce) &&
+ (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) )
{
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_HUID,
- G_sysConfigData.gpu_sensor_ids[gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
+ // do nothing but reset at this time
+ break;
+ }
+ if((false == g_amec->gpu[gpu_id].status.disabled) &&
+ (true == g_amec->gpu[gpu_id].status.readOnce))
+ {
+ INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
+ "(ffdc 0x%08X%08X)",
+ gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
+ WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
}
- commitErrl(&l_err);
- }
+ if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) &&
+ (false == g_amec->gpu[gpu_id].status.disabled) &&
+ (true == g_amec->gpu[gpu_id].status.readOnce))
+ {
+ G_gpu_state = GPU_STATE_IDLE;
+ // Disable this GPU, collect FFDC and log error
+ g_amec->gpu[gpu_id].status.disabled = true;
+
+ INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
+ gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
+ errlHndl_t l_err = NULL;
+ /*
+ * @errortype
+ * @moduleid GPU_MID_MARK_GPU_FAILED
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPE returned rc code
+ * @userdata4 ERC_GPU_COMPLETE_FAILURE
+ * @devdesc GPU failure
+ */
+ l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
+ GPU_FAILURE,
+ ERC_GPU_COMPLETE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ i_arg->error.rc,
+ 0);
+ addUsrDtlsToErrl(l_err,
+ (uint8_t*)&i_arg->error.ffdc,
+ sizeof(i_arg->error.ffdc),
+ ERRL_STRUCT_VERSION_1,
+ ERRL_USR_DTL_BINARY_DATA);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ commitErrl(&l_err);
+ }
+ } while(0);
// Reset GPU
G_gpu_i2c_reset_required = true;
@@ -428,20 +429,21 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
// Read GPU memory temp capability
case GPU_REQ_READ_CAPS_START:
- case GPU_REQ_READ_CAPS_STOP:
- case GPU_REQ_READ_CAPS:
+ case GPU_REQ_READ_CAPS_2:
+ case GPU_REQ_READ_CAPS_3:
+ case GPU_REQ_READ_CAPS_FINISH:
break;
- // Read GPU memory temp
+ // Read GPU core temp
case GPU_REQ_READ_TEMP_START:
- case GPU_REQ_READ_TEMP_STOP:
- case GPU_REQ_READ_TEMP:
+ case GPU_REQ_READ_TEMP_FINISH:
break;
- // Read GPU core temp
- case GPU_REQ_READ_TEMP_SIMPLE_START:
- case GPU_REQ_READ_TEMP_SIMPLE_STOP:
- case GPU_REQ_READ_TEMP_SIMPLE:
+ // Read GPU memory temp
+ case GPU_REQ_READ_MEM_TEMP_START:
+ case GPU_REQ_READ_MEM_TEMP_2:
+ case GPU_REQ_READ_MEM_TEMP_3:
+ case GPU_REQ_READ_MEM_TEMP_FINISH:
break;
// I2C reset
@@ -543,27 +545,27 @@ bool gpu_reset_sm()
if (async_request_is_idle(&G_gpu_op_request.request))
{
- // check if the previous state was successfully scheduled and success/done
- if( (L_reset_state != GPU_RESET_STATE_NEW) &&
- (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) &&
- (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
- {
- // Check if failure was due to GPE image not having GPU support
- if(G_gpu_op_req_args.error.rc == GPE_RC_NO_GPU_SUPPORT)
- {
- // No GPU Support, log error and disable all GPUs
- INTR_TRAC_ERR("gpu_reset_sm: GPE image doesn't support GPUs!");
+ // check if the previous state was successfully scheduled and success/done
+ if( (L_reset_state != GPU_RESET_STATE_NEW) &&
+ (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to GPE image not having GPU support
+ if(G_gpu_op_req_args.error.rc == GPE_RC_NO_GPU_SUPPORT)
+ {
+ // No GPU Support, log error and disable all GPUs
+ INTR_TRAC_ERR("gpu_reset_sm: GPE image doesn't support GPUs!");
- /*
- * @errortype
- * @moduleid GPU_MID_GPU_RESET_SM
- * @reasoncode GPU_FAILURE
- * @userdata1 0
- * @userdata2 0
- * @userdata4 ERC_GPU_NO_GPE_SUPPORT
- * @devdesc GPE1 image doesn't support GPU communication
- */
- errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
+ /*
+ * @errortype
+ * @moduleid GPU_MID_GPU_RESET_SM
+ * @reasoncode GPU_FAILURE
+ * @userdata1 0
+ * @userdata2 0
+ * @userdata4 ERC_GPU_NO_GPE_SUPPORT
+ * @devdesc GPE1 image doesn't support GPU communication
+ */
+ errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
GPU_FAILURE,
ERC_GPU_NO_GPE_SUPPORT,
ERRL_SEV_UNRECOVERABLE,
@@ -571,39 +573,41 @@ bool gpu_reset_sm()
DEFAULT_TRACE_SIZE,
0,
0);
- commitErrl(&err);
+ commitErrl(&err);
- disable_all_gpus();
+ disable_all_gpus();
- L_reset_state = GPU_RESET_STATE_NEW;
- return FALSE; // GPUs are not ready for communication
- }
- else
- {
- // Stay in current state if haven't reached state retry count
- if(L_state_retry_count < MAX_GPU_RESET_STATE_RETRY)
- {
- // INC state retry count and retry current state
- L_state_retry_count++;
- }
- else // this reset attempt failed
- {
- // Stop trying if reached max resets
- if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS)
+ L_reset_state = GPU_RESET_STATE_NEW;
+ return FALSE; // GPUs are not ready for communication
+ }
+ else
+ {
+ // Stay in current state if haven't reached state retry count
+ if(L_state_retry_count < MAX_GPU_RESET_STATE_RETRY)
{
- INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X",
- L_reset_state);
-
- /*
- * @errortype
- * @moduleid GPU_MID_GPU_RESET_SM
- * @reasoncode GPU_FAILURE
- * @userdata1 GPU reset state
- * @userdata2 0
- * @userdata4 ERC_GPU_RESET_FAILURE
- * @devdesc Failure resetting GPU interface
- */
- errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
+ // INC state retry count and retry current state
+ L_state_retry_count++;
+ }
+ else // this reset attempt failed
+ {
+ // Stop trying if reached max resets
+ if( (L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS) &&
+ (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) >=
+ GPU_COMM_ESTAB_TIMEOUT_SECONDS))
+ {
+ INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X",
+ L_reset_state);
+
+ /*
+ * @errortype
+ * @moduleid GPU_MID_GPU_RESET_SM
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU reset state
+ * @userdata2 0
+ * @userdata4 ERC_GPU_RESET_FAILURE
+ * @devdesc Failure resetting GPU interface
+ */
+ errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
GPU_FAILURE,
ERC_GPU_RESET_FAILURE,
ERRL_SEV_UNRECOVERABLE,
@@ -611,68 +615,64 @@ bool gpu_reset_sm()
DEFAULT_TRACE_SIZE,
L_reset_state,
0);
- commitErrl(&err);
-
- disable_all_gpus();
-
- L_reset_state = GPU_RESET_STATE_NEW;
- return FALSE; // GPUs are not ready for communication
- }
- else // try the reset again from the beginning
- {
- L_consec_reset_failure_count++;
- L_state_retry_count = 0;
- L_reset_state = GPU_RESET_STATE_RESET_MASTER;
- }
- } // else reset attempt failed
- } // else GPE supports GPU
- }// if previous state failed
- else // success on last state go to next state and process it
- {
- L_state_retry_count = 0;
- L_reset_state++;
- }
+ commitErrl(&err);
+
+ disable_all_gpus();
+
+ L_reset_state = GPU_RESET_STATE_NEW;
+ return FALSE; // GPUs are not ready for communication
+ }
+ else // try the reset again from the beginning
+ {
+ L_consec_reset_failure_count++;
+ L_state_retry_count = 0;
+ L_reset_state = GPU_RESET_STATE_RESET_MASTER;
+ }
+ } // else reset attempt failed
+ } // else GPE supports GPU
+ }// if previous state failed
+ else // success on last state go to next state and process it
+ {
+ L_state_retry_count = 0;
+ L_reset_state++;
+ }
- L_scheduled = FALSE; // default nothing scheduled
+ L_scheduled = FALSE; // default nothing scheduled
- switch (L_reset_state)
- {
- case GPU_RESET_STATE_RESET_MASTER:
- G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER;
- L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
- break;
+ switch (L_reset_state)
+ {
+ case GPU_RESET_STATE_INIT_BUS:
+ // Setup I2C Interrupt Mask Register
+ L_scheduled = schedule_gpu_req(GPU_REQ_INIT, G_new_gpu_req_args);
+ break;
- case GPU_RESET_STATE_RESET_SLAVE:
- G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV;
- L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
- break;
+ case GPU_RESET_STATE_RESET_MASTER:
+ G_new_gpu_req_args.data = GPU_RESET_REQ_MASTER;
+ L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
+ break;
- case GPU_RESET_STATE_RESET_SLAVE_WAIT:
- // Delay to allow reset to complete
- GPU_DBG("gpu_reset_sm: waiting during slave port 4 reset");
- break;
+ case GPU_RESET_STATE_RESET_SLAVE:
+ G_new_gpu_req_args.data = GPU_RESET_REQ_SLV;
+ L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
+ break;
- case GPU_RESET_STATE_RESET_SLAVE_COMPLETE:
- G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE;
- L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
- break;
+ case GPU_RESET_STATE_RESET_SLAVE_WAIT:
+ // Delay to allow reset to complete
+ GPU_DBG("gpu_reset_sm: waiting during slave port 4 reset");
+ break;
- case GPU_RESET_STATE_INIT:
- // Notify GPE which GPUs are present
- G_new_gpu_req_args.data[0] = (GPU_PRESENT(ID_GPU0)) ? GPU_STATE_PRESENT : 0;
- G_new_gpu_req_args.data[1] = (GPU_PRESENT(ID_GPU1)) ? GPU_STATE_PRESENT : 0;
- G_new_gpu_req_args.data[2] = (GPU_PRESENT(ID_GPU2)) ? GPU_STATE_PRESENT : 0;
- // Setup I2C Interrupt Mask Register and Mode
- L_scheduled = schedule_gpu_req(GPU_REQ_INIT, G_new_gpu_req_args);
- break;
+ case GPU_RESET_STATE_RESET_SLAVE_COMPLETE:
+ G_new_gpu_req_args.data = GPU_RESET_REQ_SLV_COMPLETE;
+ L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
+ break;
- case GPU_RESET_STATE_INIT_COMPLETE:
- // Reset and init is complete ready to start sending commands to the GPUs
- l_complete = TRUE;
- L_consec_reset_failure_count = 0;
- // next time this is called will be to start a new reset
- L_reset_state = GPU_RESET_STATE_NEW;
- break;
+ case GPU_RESET_STATE_RESET_FINISH:
+ // Reset and init is complete ready to start sending commands to the GPUs
+ l_complete = TRUE;
+ L_consec_reset_failure_count = 0;
+ // next time this is called will be to start a new reset
+ L_reset_state = GPU_RESET_STATE_NEW;
+ break;
default:
INTR_TRAC_ERR("gpu_reset_sm: INVALID STATE: 0x%02X when reset is required", L_reset_state);
@@ -680,22 +680,22 @@ bool gpu_reset_sm()
break;
} // switch L_reset_state
- if(L_scheduled)
- {
- GPU_DBG("gpu_reset_sm: Scheduled reset state 0x%02X", L_reset_state);
- }
- // check if the state was expected to have a schedule. Only new and slave wait
- // don't schedule for all other states the schedule must have failed
- else if( (L_reset_state != GPU_RESET_STATE_NEW) &&
- (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) )
- {
- INTR_TRAC_ERR("gpu_reset_sm: failed to schedule state 0x%02X", L_reset_state);
- }
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_reset_sm: Scheduled reset state 0x%02X", L_reset_state);
+ }
+ // check if the state was expected to have a schedule. Only new and slave wait
+ // don't schedule for all other states the schedule must have failed
+ else if( (L_reset_state != GPU_RESET_STATE_NEW) &&
+ (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) )
+ {
+ INTR_TRAC_ERR("gpu_reset_sm: failed to schedule state 0x%02X", L_reset_state);
+ }
} // if async_request_is_idle
else
{
- INTR_TRAC_ERR("gpu_reset_sm: NOT idle for state 0x%02X", L_reset_state);
+ INTR_TRAC_ERR("gpu_reset_sm: NOT idle for state 0x%02X", L_reset_state);
}
return l_complete;
@@ -709,154 +709,152 @@ bool gpu_reset_sm()
// This function should only return that complete is TRUE when the temperature
// read is complete (or determined failed) and ready to start reading a different GPU
//
-// Pre-Req: Caller must have G_current_gpu_id set for GPU to read and
-// verified G_gpu_op_request is idle to allow scheduling
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read and
+// verified G_gpu_op_request is idle to allow scheduling
// End Function Specification
bool gpu_read_temp_sm()
{
bool l_complete = FALSE; // only return TRUE when the read is complete or failed
uint16_t l_temp = 0;
static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
- static uint8_t L_read_failure_count = 0;
+ static uint8_t L_read_failure_count = 0; // Used for I2C errors
+
static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp
if (async_request_is_idle(&G_gpu_op_request.request))
{
- // If not starting a new read then need to check status of current state before moving on
- // stay in current state if the schedule failed or the state isn't finished/failed
- if( (L_read_temp_state != GPU_STATE_READ_TEMP_NEW) &&
- (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
- {
- // If reached retry count give up on this GPU
- if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
- {
- mark_gpu_failed(&G_gpu_op_req_args);
-
- L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
- return TRUE; // Done with this GPU, let GPU SM move to next
- }
- else
- {
- // INC failure count and retry current state
- L_read_failure_count++;
- }
- }
- else // success on last state go to next state and process it
- {
- L_read_failure_count = 0;
- L_read_temp_state++;
- }
-
- L_scheduled = FALSE; // default nothing scheduled
-
- switch (L_read_temp_state)
- {
- case GPU_STATE_READ_TEMP_START:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_SIMPLE_START, G_new_gpu_req_args);
- break;
-
- case GPU_STATE_READ_TEMP_STOP:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_SIMPLE_STOP, G_new_gpu_req_args);
- break;
-
- case GPU_STATE_READ_TEMP_READ:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_SIMPLE, G_new_gpu_req_args);
- break;
-
- case GPU_STATE_READ_TEMP_COMPLETE:
- if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
- (0 != G_gpu_op_req_args.data[0]) ) // TODO: check for valid temp?
- {
- g_amec->gpu[G_current_gpu_id].status.readOnce = true;
- TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d",
- G_current_gpu_id, CURRENT_TICK);
- // comm is now established update for capability checking to take place
- g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
- g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
- }
- // Update sensor
- l_temp = G_gpu_op_req_args.data[0] >> 24;
- sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
-
- // Clear all past errors
- g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
-
- // check if there is an overtemp that hasn't been reported
- if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
- (l_temp > G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
- (!g_amec->gpu[G_current_gpu_id].status.overtempError) )
- {
- g_amec->gpu[G_current_gpu_id].status.overtempError = TRUE;
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_read_temp_state != GPU_STATE_READ_TEMP_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // If reached retry count give up on this GPU
+ if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
+ (GPE_RC_I2C_ERROR == G_gpu_op_req_args.error.rc) )
+ {
+ mark_gpu_failed(&G_gpu_op_req_args);
- INTR_TRAC_ERR("gpu_read_temp: GPU%d OT! temp[%d]",
- G_current_gpu_id, l_temp);
+ L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+ else
+ {
+ // INC failure count and retry current state
+ L_read_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_read_failure_count = 0;
+ L_read_temp_state++;
+ }
- // Log an OT error
- /* @
- * @errortype
- * @moduleid GPU_MID_GPU_READ_TEMP
- * @reasoncode GPU_ERROR_TEMP
- * @userdata1 GPU ID
- * @userdata2 GPU memory temperature
- * @userdata4 OCC_NO_EXTENDED_RC
- * @devdesc GPU memory has reached error temperature
- *
- */
- errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_TEMP,
- GPU_ERROR_TEMP,
- OCC_NO_EXTENDED_RC,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- G_current_gpu_id,
- l_temp);
+ L_scheduled = FALSE; // default nothing scheduled
- // Callout the over temperature procedure
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_COMPONENT_ID,
- ERRL_COMPONENT_ID_OVER_TEMPERATURE,
- ERRL_CALLOUT_PRIORITY_HIGH);
+ switch (L_read_temp_state)
+ {
+ case GPU_STATE_READ_TEMP_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
+ break;
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
- {
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_HUID,
- G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ case GPU_STATE_READ_TEMP_FINISH:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_FINISH, G_new_gpu_req_args);
+ break;
- // Commit Error
- commitErrl(&l_err);
+ case GPU_STATE_READ_TEMP_COMPLETE:
+ if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
+ (0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp?
+ {
+ g_amec->gpu[G_current_gpu_id].status.readOnce = true;
+ TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d",
+ G_current_gpu_id, CURRENT_TICK);
+ // comm is now established update for capability checking to take place
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
+ }
+ // Update sensor
+ l_temp = G_gpu_op_req_args.data;
+ sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
- } // if OT error
+ // Clear all past errors
+ g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
- // Done with this GPU ready to move to new one
- L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
- l_complete = TRUE;
- break;
+ // check if there is an overtemp that hasn't been reported
+ if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
+ (l_temp > G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
+ (!g_amec->gpu[G_current_gpu_id].status.overtempError) )
+ {
+ g_amec->gpu[G_current_gpu_id].status.overtempError = TRUE;
+
+ INTR_TRAC_ERR("gpu_read_temp: GPU%d OT! temp[%d]",
+ G_current_gpu_id, l_temp);
+
+ // Log an OT error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_TEMP
+ * @reasoncode GPU_ERROR_TEMP
+ * @userdata1 GPU ID
+ * @userdata2 GPU memory temperature
+ * @userdata4 OCC_NO_EXTENDED_RC
+ * @devdesc GPU memory has reached error temperature
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_TEMP,
+ GPU_ERROR_TEMP,
+ OCC_NO_EXTENDED_RC,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ l_temp);
+
+ // Callout the over temperature procedure
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_COMPONENT_ID,
+ ERRL_COMPONENT_ID_OVER_TEMPERATURE,
+ ERRL_CALLOUT_PRIORITY_HIGH);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+
+ } // if OT error
+
+ // Done with this GPU ready to move to new one
+ L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ l_complete = TRUE;
+ break;
- default:
- INTR_TRAC_ERR("gpu_read_temp_sm: INVALID STATE: 0x%02X", L_read_temp_state);
- L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
- l_complete = TRUE;
+ default:
+ INTR_TRAC_ERR("gpu_read_temp_sm: INVALID STATE: 0x%02X", L_read_temp_state);
+ L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ l_complete = TRUE;
break;
- } // switch L_read_temp_state
+ } // switch L_read_temp_state
- if(L_scheduled)
- {
- GPU_DBG("gpu_read_temp_sm: Scheduled read temp state 0x%02X at tick %d",
- L_read_temp_state, GPU_TICK);
- }
- else if(!l_complete) // if not complete there must have been a failure on the schedule
- {
- INTR_TRAC_ERR("gpu_read_temp_sm: failed to schedule state 0x%02X", L_read_temp_state);
- }
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_read_temp_sm: Scheduled read temp state 0x%02X at tick %d",
+ L_read_temp_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_read_temp_sm: failed to schedule state 0x%02X", L_read_temp_state);
+ }
} // if async_request_is_idle
else
{
- INTR_TRAC_ERR("gpu_read_temp_sm: NOT idle for state 0x%02X", L_read_temp_state);
+ INTR_TRAC_ERR("gpu_read_temp_sm: NOT idle for state 0x%02X", L_read_temp_state);
}
return l_complete;
@@ -949,18 +947,21 @@ bool gpu_read_mem_temp_capability_sm()
L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args);
break;
- case GPU_STATE_READ_MEM_TEMP_CAPABLE_STOP:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_STOP, G_new_gpu_req_args);
+ case GPU_STATE_READ_MEM_TEMP_CAPABLE_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_CAPABLE_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_3, G_new_gpu_req_args);
break;
case GPU_STATE_READ_MEM_TEMP_CAPABLE_READ:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS, G_new_gpu_req_args);
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_FINISH, G_new_gpu_req_args);
break;
case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
// Update capability
- g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
-
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data & 0x01;
// Done with this GPU ready to move to new one
L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
l_complete = TRUE;
@@ -1089,20 +1090,20 @@ bool gpu_read_memory_temp_sm()
switch (L_read_temp_state)
{
case GPU_STATE_READ_MEM_TEMP_START:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
break;
case GPU_STATE_READ_MEM_TEMP_STOP:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_STOP, G_new_gpu_req_args);
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_2, G_new_gpu_req_args);
break;
case GPU_STATE_READ_MEM_TEMP_READ:
- L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP, G_new_gpu_req_args);
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_3, G_new_gpu_req_args);
break;
case GPU_STATE_READ_MEM_TEMP_COMPLETE:
// Update sensor
- l_temp = G_gpu_op_req_args.data[0] >> 24;
+ l_temp = G_gpu_op_req_args.data;
sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
// Clear past errors
@@ -1206,6 +1207,8 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
{
// Check for next state in order of priority
+//TODO: Enable when functional
+#if 0
// 1. Need to set a power limit on a GPU?
l_gpu_id = gpu_id_need_set_power_limit();
if(l_gpu_id != 0xFF)
@@ -1216,6 +1219,7 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
l_new_state = TRUE;
break;
}
+#endif
// 2. check if Host needs lock
if (!check_and_update_i2c_lock(GPU_I2C_ENGINE))
@@ -1227,6 +1231,8 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
break;
}
+//TODO: Enable when functional
+#if 0
// 3. Need to check if driver is loaded?
l_gpu_id = gpu_id_need_driver_check();
if(l_gpu_id != 0xFF)
@@ -1276,6 +1282,7 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
}
}
}
+#endif
// 6. Time to start new temperature reads?
if(i_read_temp_start_needed)
@@ -1332,6 +1339,7 @@ void task_gpu_sm(struct task *i_self)
// are functional or GPU I2C interface is broken
if(G_gpu_monitoring_allowed)
{
+
// Initialize the IPC commands if this is our first run
if(L_gpu_first_run)
{
@@ -1341,7 +1349,7 @@ void task_gpu_sm(struct task *i_self)
}
// Check if time to start reading temperatures
- // GPU tempertures (core and memory) are only used for fan control which happens every 1s
+ // GPU temperatures (core and memory) are only used for fan control which happens every 1s
// so there is no need to read the GPU temperatures any faster than every 1s
if(!L_read_temp_start_needed)
{
@@ -1355,6 +1363,7 @@ void task_gpu_sm(struct task *i_self)
// make sure OCC owns the lock in order to send commands to the GPU
if( (L_occ_owns_lock == FALSE) || (G_gpu_state == GPU_STATE_NO_LOCK) )
{
+
// Check if host gave up the I2C lock
L_occ_owns_lock = check_and_update_i2c_lock(GPU_I2C_ENGINE);
if (L_occ_owns_lock)
@@ -1557,4 +1566,6 @@ void task_gpu_sm(struct task *i_self)
}
}while((l_start_next_state) && (!l_next_state));
} // GPU monitoring enabled
+
+
} // end task_gpu_sm()
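
As the gpu.h hunk below shows, the core-temp read now steps through just READ_TEMP_START, READ_TEMP_FINISH, and READ_TEMP_COMPLETE: advance on success, retry the current state on failure, and hand the GPU to mark_gpu_failed() once the retry count or an I2C error is hit. A rough sketch of that advance/retry shape (simplified, illustrative names and limits where they differ from the diff):

    typedef enum { TEMP_NEW, TEMP_START, TEMP_FINISH, TEMP_COMPLETE } temp_state_t;

    /* Returns 1 when done with this GPU (read finished or given up). */
    static int read_temp_step_sketch(int last_op_ok, int *retries)
    {
        static temp_state_t state = TEMP_NEW;

        if ((state != TEMP_NEW) && !last_op_ok)
        {
            if (++(*retries) > 3 /* illustrative MAX_GPU_READ_ATTEMPT */)
            {
                state = TEMP_NEW;          /* give up; caller marks GPU failed */
                return 1;
            }
            /* otherwise stay in the current state and retry it below */
        }
        else
        {
            *retries = 0;
            state++;                       /* success: advance to next state */
        }

        switch (state)
        {
            case TEMP_START:    /* schedule the first I2C operation */   break;
            case TEMP_FINISH:   /* schedule the read of the result  */   break;
            case TEMP_COMPLETE: /* update the sensor, then start over */
            default:
                state = TEMP_NEW;
                return 1;
        }
        return 0;
    }
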
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index 91d081b..c8f13ff 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -52,12 +52,12 @@ typedef enum
typedef enum
{
GPU_RESET_STATE_NEW = 0x01, // new reset attempt
- GPU_RESET_STATE_RESET_MASTER = 0x02, // Reset master
- GPU_RESET_STATE_RESET_SLAVE = 0x03, // Start of slave port 4 reset
- GPU_RESET_STATE_RESET_SLAVE_WAIT = 0x04,
- GPU_RESET_STATE_RESET_SLAVE_COMPLETE = 0x05,
- GPU_RESET_STATE_INIT = 0x06,
- GPU_RESET_STATE_INIT_COMPLETE = 0x07,
+ GPU_RESET_STATE_INIT_BUS = 0x02,
+ GPU_RESET_STATE_RESET_MASTER = 0x03, // Reset master
+ GPU_RESET_STATE_RESET_SLAVE = 0x04, // Start of slave port 4 reset
+ GPU_RESET_STATE_RESET_SLAVE_WAIT = 0x05,
+ GPU_RESET_STATE_RESET_SLAVE_COMPLETE = 0x06,
+ GPU_RESET_STATE_RESET_FINISH = 0x07,
} gpuResetState_e;
// States for reading GPU core temperature (gpu_read_temp_sm)
@@ -65,9 +65,8 @@ typedef enum
{
GPU_STATE_READ_TEMP_NEW = 0x11, // new temp read
GPU_STATE_READ_TEMP_START = 0x12, // start write temp reg
- GPU_STATE_READ_TEMP_STOP = 0x13, // stop write/begin read
- GPU_STATE_READ_TEMP_READ = 0x14, // read temperature
- GPU_STATE_READ_TEMP_COMPLETE = 0x15, // store temperature read
+ GPU_STATE_READ_TEMP_FINISH = 0x13, // read temperature
+ GPU_STATE_READ_TEMP_COMPLETE = 0x14, // store temperature read
} gpuReadTempState_e;
// States for reading GPU memory temperature (gpu_read_mem_temp_sm)
@@ -85,9 +84,10 @@ typedef enum
{
GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW = 0x31,
GPU_STATE_READ_MEM_TEMP_CAPABLE_START = 0x32,
- GPU_STATE_READ_MEM_TEMP_CAPABLE_STOP = 0x33,
- GPU_STATE_READ_MEM_TEMP_CAPABLE_READ = 0x34,
- GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE = 0x35,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_2 = 0x33,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_3 = 0x34,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_READ = 0x35,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE = 0x36,
} gpuReadMemTempCapableState_e;
// GPU IPC initialization