author    William Bryan <wilbryan@us.ibm.com>     2017-09-28 13:32:29 -0500
committer William A. Bryan <wilbryan@us.ibm.com>  2017-10-03 16:03:05 -0400
commit    74f721c90235a18821b97782d98349cf51e0f12d (patch)
tree      1f2fd59b41db514c0273632dd2dd7926e25a2030
parent    76b91d0038d59b30de14108e908bc78c6d988796 (diff)
download  talos-occ-74f721c90235a18821b97782d98349cf51e0f12d.tar.gz
          talos-occ-74f721c90235a18821b97782d98349cf51e0f12d.zip
GPU 405 Enable Memory Temperatures
Change-Id: Id50d12a50a05b8b3a6a6f1ce3ce4512d3299caa7
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46882
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r--  src/common/gpe_err.h                                 6
-rw-r--r--  src/common/gpu_structs.h                            31
-rw-r--r--  src/include/registers/ocb_firmware_registers.h      20
-rwxr-xr-x  src/occ_405/amec/amec_data.c                         6
-rwxr-xr-x  src/occ_405/amec/amec_pcap.c                        81
-rwxr-xr-x  src/occ_405/amec/amec_sys.h                         38
-rwxr-xr-x  src/occ_405/cmdh/cmdh_fsp_cmds.c                    46
-rwxr-xr-x  src/occ_405/gpu/gpu.c                             1332
-rw-r--r--  src/occ_405/gpu/gpu.h                               40
-rwxr-xr-x  src/occ_405/gpu/gpu_service_codes.h                  3
-rw-r--r--  src/occ_405/occ_service_codes.h                     12
11 files changed, 1303 insertions, 312 deletions
diff --git a/src/common/gpe_err.h b/src/common/gpe_err.h
index 8580012..3bb1fa0 100644
--- a/src/common/gpe_err.h
+++ b/src/common/gpe_err.h
@@ -51,6 +51,8 @@
#define GPE_RC_GET_NEST_DTS_FAILED 0x61 // Failed to collect nest DTS temperatures
// GPU Errors
-#define GPE_RC_NO_GPU_SUPPORT 0x8F // GPE1 image doesn't support GPUs
-
+#define GPE_RC_NO_GPU_SUPPORT 0x80 // GPE1 image doesn't support GPUs
+#define GPE_RC_GPU_DRIVER_CHANGE 0x81 // GPU in transition or just completed phase change
+#define GPE_RC_GPU_CMD_NOT_SUPPORTED 0x82 // GPU rejected command with no support
+#define GPE_RC_GPU_CMD_FAILED 0x83 // An error occurred in the last GPU operation
#endif //_GPE_ERR_H
diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h
index 7933adb..ba522e5 100644
--- a/src/common/gpu_structs.h
+++ b/src/common/gpu_structs.h
@@ -41,6 +41,12 @@
typedef enum
{
+ GPU_CAP_MEM = 0x00,
+ GPU_CAP_CORE = 0x01
+} GPU_CAPABILITIES;
+
+typedef enum
+{
ID_GPU0 = 0x00,
ID_GPU1 = 0x01,
ID_GPU2 = 0x02,
@@ -64,12 +70,21 @@ typedef enum
GPU_REQ_READ_MEM_TEMP_3 = 0x06, // mem temp step 3
GPU_REQ_READ_MEM_TEMP_FINISH = 0x07, // Get memory temp reading
GPU_REQ_READ_CAPS_START = 0x08, // Start reading capabilities
- GPU_REQ_READ_CAPS_2 = 0x09, // Start reading capabilities
- GPU_REQ_READ_CAPS_3 = 0x0A, // Start reading capabilities
- GPU_REQ_READ_CAPS_FINISH = 0x0B,
- GPU_REQ_READ_PWR_LIMIT_START = 0x0C, // Start reading GPU information
- GPU_REQ_READ_PWR_LIMIT_STOP = 0x0D, // Read GPU temp register
- GPU_REQ_READ_PWR_LIMIT = 0x0E, // Start reading pwr limit
+ GPU_REQ_READ_CAPS_2 = 0x09, // Capabilities read step 2
+ GPU_REQ_READ_CAPS_3 = 0x0A, // Capabilities read step 3
+ GPU_REQ_READ_CAPS_FINISH = 0x0B, // get capabilities
+ GPU_REQ_READ_PWR_LIMIT_START = 0x10, // Start reading GPU power limit
+ GPU_REQ_READ_PWR_LIMIT_2 = 0x11,
+ GPU_REQ_READ_PWR_LIMIT_3 = 0x12,
+ GPU_REQ_READ_PWR_LIMIT_FINISH = 0x13,
+ GPU_REQ_SET_PWR_LIMIT_START = 0x20, // Start setting GPU power limit
+ GPU_REQ_SET_PWR_LIMIT_2 = 0x21,
+ GPU_REQ_SET_PWR_LIMIT_3 = 0x22,
+ GPU_REQ_SET_PWR_LIMIT_FINISH = 0x23,
+ GPU_REQ_CHECK_DRIVER_START = 0x31, // Start check driver loaded
+ GPU_REQ_CHECK_DRIVER_2 = 0x32,
+ GPU_REQ_CHECK_DRIVER_3 = 0x33,
+ GPU_REQ_CHECK_DRIVER_FINISH = 0x34,
GPU_REQ_RESET = 0x60, // Reset
} gpu_op_req_e;
@@ -78,10 +93,10 @@ typedef struct
{
GpeErrorStruct error;
uint8_t gpu_id;
+ uint8_t gpu_rc;
uint8_t operation;
- uint64_t data;
+ uint64_t data[3];
} gpu_sm_args_t;
-
#endif // _GPU_STRUCTS_H
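For reference, a minimal sketch (not part of the commit; the type and helper names are illustrative) of how the widened data[3] payload above is consumed by the power-limit read later in this diff, where data[0], data[1] and data[2] carry the GPU minimum, maximum and default power caps in mW:

    #include <stdint.h>

    /* Illustrative stand-in for GpeErrorStruct. */
    typedef struct { uint32_t error; } gpe_error_sketch_t;

    /* Mirrors the updated gpu_sm_args_t layout from the hunk above. */
    typedef struct
    {
        gpe_error_sketch_t error;
        uint8_t  gpu_id;
        uint8_t  gpu_rc;      /* new: GPU-level return code */
        uint8_t  operation;
        uint64_t data[3];     /* was a single uint64_t data */
    } gpu_sm_args_sketch_t;

    /* Unpack the power-limit words the way gpu_read_pwr_limit_sm() does in the
     * gpu.c hunk below: data[0]=min, data[1]=max, data[2]=default (all mW). */
    static void unpack_pwr_limits_sketch(const gpu_sm_args_sketch_t *args,
                                         uint32_t *min_mw,
                                         uint32_t *max_mw,
                                         uint32_t *default_mw)
    {
        *min_mw     = (uint32_t)args->data[0];
        *max_mw     = (uint32_t)args->data[1];
        *default_mw = (uint32_t)args->data[2];
    }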
diff --git a/src/include/registers/ocb_firmware_registers.h b/src/include/registers/ocb_firmware_registers.h
index 5b6705a..010ad02 100644
--- a/src/include/registers/ocb_firmware_registers.h
+++ b/src/include/registers/ocb_firmware_registers.h
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2015,2016 */
+/* Contributors Listed Below - COPYRIGHT 2015,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -1411,9 +1411,21 @@ typedef union ocb_occflg {
uint32_t i2c_engine2_lock_occ : 1;
uint32_t i2c_engine3_lock_host : 1;
uint32_t i2c_engine3_lock_occ : 1;
- uint32_t reserved_occ : 10;
-#else
- uint32_t reserved_occ : 10;
+ uint32_t gpu0_reset_status : 1;
+ uint32_t gpu1_reset_status : 1;
+ uint32_t gpu2_reset_status : 1;
+ uint32_t reserved_occ : 3;
+ uint32_t wof_hcode_mode : 2;
+ uint32_t active_quad_update : 1;
+ uint32_t request_occ_safe : 1;
+#else
+ uint32_t request_occ_safe : 1;
+ uint32_t active_quad_update : 1;
+ uint32_t wof_hcode_mode : 2;
+ uint32_t reserved_occ : 3;
+ uint32_t gpu2_reset_status : 1;
+ uint32_t gpu1_reset_status : 1;
+ uint32_t gpu0_reset_status : 1;
uint32_t i2c_engine3_lock_occ : 1;
uint32_t i2c_engine3_lock_host : 1;
uint32_t i2c_engine2_lock_occ : 1;
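For reference, a minimal sketch (not part of the commit; the struct and helper names are illustrative) of how the three new gpuN_reset_status bits are consumed: update_gpu_reset_status() in the gpu.c hunk below reads OCB_OCCFLG and treats a bit value of '1' as the GPU being NOT in reset, with OPAL owning the update of these bits:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative mirror of just the new reset-status bits; the real layout is the
     * full ocb_occflg_t bitfield shown above. '1' means the GPU is NOT in reset. */
    typedef struct
    {
        uint32_t gpu0_reset_status : 1;
        uint32_t gpu1_reset_status : 1;
        uint32_t gpu2_reset_status : 1;
    } occflg_gpu_bits_sketch_t;

    /* Collect per-GPU "not in reset" flags the way update_gpu_reset_status() does
     * after reading the OCC FLAGS register. */
    static void collect_not_in_reset_sketch(const occflg_gpu_bits_sketch_t *flags,
                                            bool not_in_reset[3])
    {
        not_in_reset[0] = flags->gpu0_reset_status;
        not_in_reset[1] = flags->gpu1_reset_status;
        not_in_reset[2] = flags->gpu2_reset_status;
    }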
diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c
index 3a82bb3..4c553d6 100755
--- a/src/occ_405/amec/amec_data.c
+++ b/src/occ_405/amec/amec_data.c
@@ -458,12 +458,6 @@ void amec_data_write_pcap(void)
g_amec->pcap.ovs_node_pcap = G_sysConfigData.pcap.hard_min_pcap;
}
- //Oversubscription pcap can NOT be higher than a customer set pcap.
- if(g_amec->pcap.ovs_node_pcap > l_customer)
- {
- g_amec->pcap.ovs_node_pcap = l_customer;
- }
-
//for all new pcap data setting: If KVM, update the OPAL dynamic data
if(G_sysConfigData.system_type.kvm)
{
diff --git a/src/occ_405/amec/amec_pcap.c b/src/occ_405/amec/amec_pcap.c
index 7584ddf..995324d 100755
--- a/src/occ_405/amec/amec_pcap.c
+++ b/src/occ_405/amec/amec_pcap.c
@@ -95,14 +95,16 @@ extern uint32_t G_first_num_gpus_sys;
// Thread: Real Time Loop
//
// End Function Specification
-void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power)
+void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t i_avail_power)
{
/*------------------------------------------------------------------------*/
/* Local Variables */
/*------------------------------------------------------------------------*/
uint8_t i = 0;
uint32_t l_gpu_cap_mw = 0;
+ uint16_t l_system_gpu_total_pcap = 0; // total GPU pcap required by system based on if currently in oversub or not
static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect
+ static uint16_t L_n_plus_1_mode_gpu_total_pcap = 0; // Total GPU pcap required for N+1 (not in oversubscription)
static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription
static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR
static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU
@@ -112,10 +114,12 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power)
/*------------------------------------------------------------------------*/
/* Code */
/*------------------------------------------------------------------------*/
- // If this is the first time running calculate the total GPU power cap for oversubscription
+ // If this is the first time running calculate the total GPU power cap for system power caps (N and N+1)
if(L_first_run)
{
+ // calculate total GPU power cap for oversubscription
if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
+
{
// Take all non-GPU power away from the oversubscription power cap
L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;
@@ -157,6 +161,50 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power)
ERRL_CALLOUT_PRIORITY_HIGH);
commitErrl(&l_err);
}
+
+ // calculate total GPU power cap for N+1 (not in oversubscription)
+ if(G_sysConfigData.pcap.system_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
+ {
+ // Take all non-GPU power away from the N+1 power cap
+ L_n_plus_1_mode_gpu_total_pcap = G_sysConfigData.pcap.system_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;
+ // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs
+ L_n_plus_1_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts;
+ }
+ else
+ {
+ // This should not happen, the total non GPU power should never be higher than the N+1 mode cap
+ // Log error and set GPUs to minimum power cap
+ L_n_plus_1_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap
+
+ TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N+1 mode pwr limit %dW",
+ G_sysConfigData.total_non_gpu_max_pwr_watts, G_sysConfigData.pcap.system_pcap);
+
+ /* @
+ * @errortype
+ * @moduleid AMEC_GPU_PCAP_MID
+ * @reasoncode GPU_FAILURE
+ * @userdata1 N+1 mode Power Cap watts
+ * @userdata2 Total non-GPU power watts
+ * @userdata4 ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE
+ * @devdesc Total non-GPU power more than N+1 mode power cap
+ *
+ */
+ errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID,
+ GPU_FAILURE,
+ ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_sysConfigData.pcap.system_pcap,
+ G_sysConfigData.total_non_gpu_max_pwr_watts);
+
+ //Callout firmware
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_COMPONENT_ID,
+ ERRL_COMPONENT_ID_FIRMWARE,
+ ERRL_CALLOUT_PRIORITY_HIGH);
+ commitErrl(&l_err);
+ }
} // if first run
// Calculate the total GPU power cap for the current active limit and PSR
@@ -180,12 +228,23 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power)
G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap);
}
- // Total GPU power cap is the lower of oversubscription and active power limit
- // must always account for oversubscription to ensure when a power supply is lost the OCC
- // can react fast enough, GPU power capping is too slow and must have GPU power cap already
- // set to account for oversubscription case
- L_total_gpu_pcap = (L_n_mode_gpu_total_pcap < L_active_psr_gpu_total_pcap) ?
- L_n_mode_gpu_total_pcap : L_active_psr_gpu_total_pcap;
+ // Total GPU power cap is the lower of system (N+1 or oversubscription depending on if in oversub)
+ // and the active power limit. We do not need to always account for oversubscription since
+ // the automatic hw power brake will assert to the GPUs if there is a problem when oversub is
+ // entered from the time OCC can set and GPUs react to a new power limit
+ if(i_oversubscription)
+ {
+ // system in oversubscription use N mode cap
+ l_system_gpu_total_pcap = L_n_mode_gpu_total_pcap;
+ }
+ else
+ {
+ // system is not in oversubscription use N+1 mode cap
+ l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap;
+ }
+
+ L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ?
+ l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap;
// Divide the total equally across all GPUs in the system
if(G_first_num_gpus_sys)
@@ -282,8 +341,8 @@ void amec_pcap_calc(void)
l_oversub_state = AMEC_INTF_GET_OVERSUBSCRIPTION();
// Determine the active power cap. norm_node_pcap is set as lowest
- // between sys and user in amec_data_write_pcap()
- // when in oversub only use oversub pcap if lower than norm_node_pcap
+ // between sys (N+1 mode) and user in amec_data_write_pcap()
+ // when in oversub (N mode) only use oversub pcap if lower than norm_node_pcap
// to handle user set power cap lower than the oversub power cap
if( (TRUE == l_oversub_state) &&
(g_amec->pcap.ovs_node_pcap < g_amec->pcap.norm_node_pcap) )
@@ -312,7 +371,7 @@ void amec_pcap_calc(void)
// Determine GPU power cap if there are GPUs present
if(G_first_proc_gpu_config)
{
- amec_gpu_pcap(l_active_pcap_changed, l_avail_power);
+ amec_gpu_pcap(l_oversub_state, l_active_pcap_changed, l_avail_power);
}
if(l_node_pwr != 0)
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index 803ca28..3f1d333 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -436,27 +436,35 @@ typedef struct
//-------------------------------------------------------------
typedef struct {
- bool disabled; // GPU has been marked failed and no longer monitored
- bool readOnce; // Comm has been established with GPU
- bool overtempError; // Core OT error has been logged against GPU
- bool memOvertempError; // Memory OT error has been logged against GPU
+ bool disabled; // GPU has been marked failed and no longer monitored
+ bool readOnce; // Comm has been established with GPU
+ bool commErrorLogged; // GPU has been called out due to comm error
+ bool overtempError; // Core OT error has been logged against GPU
+ bool memOvertempError; // Memory OT error has been logged against GPU
bool checkDriverLoaded; // Indicates if need to check if driver is loaded
- bool driverLoaded; // Indicates if GPU driver is loaded
+ bool driverLoaded; // Indicates if GPU driver is loaded
bool checkMemTempSupport; // Indicates if need to check if mem monitoring is supported
- bool memTempSupported; // Indicates if memory temperature monitoring is supported
- uint8_t memErrorCount; // count of consecutive GPU mem temp read failures
- uint8_t errorCount; // count of consecutive GPU core temp read failures
+ bool memTempSupported; // Indicates if memory temperature monitoring is supported
+ bool notReset; // '1' = GPU NOT in reset. Read from OCC FLAGS register
+ bool coreTempNotAvailable; // for fan control: '1' = core temp not available. (send 0 for fan control)
+ bool memTempNotAvailable; // for fan control: '1' = Mem temp not available. (send 0 for fan control)
+ bool coreTempFailure; // for fan control: '1' = timeout failure reading core temp (send 0xFF for fan control)
+ bool memTempFailure; // for fan control: '1' = timeout failure reading Mem temp (send 0xFF for fan control)
+ uint8_t memErrorCount; // count of consecutive GPU mem temp read failures when GPU not in reset
+ uint8_t errorCount; // count of consecutive GPU core temp read failures when GPU not in reset
+ uint8_t retryCount; // count of consecutive GPU core temp read failures before I2C reset
} gpuStatus_t;
typedef struct {
- bool check_pwr_limit; // Indicates if need to read power limits from GPU
- bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max
- bool gpu_min_cap_required; // Indicates if power limits were read i.e. have min/max
- uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU
- uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU
- uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set
+ bool check_pwr_limit; // Indicates if need to read power limits from GPU
+ bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max
+ bool set_failed; // Indicates if failed to set power limit
+ bool gpu_min_cap_required; // Indicates if GPU requires min cap
+ uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU
+ uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU
+ uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set
uint32_t gpu_requested_pcap_mw; // Requested power cap in mW sent to GPU
- uint32_t gpu_actual_pcap_mw; // Actual power cap in mW read back from the GPU
+ uint32_t gpu_default_pcap_mw; // Default power cap in mW read from the GPU
} gpuPcap_t;
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 4606f6d..422dc38 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -348,7 +348,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
const sensor_t *vrfan = getSensorByGsid(VRMPROCOT);
if (vrfan != NULL)
{
- l_tempSensorList[l_sensorHeader.count].id = G_sysConfigData.proc_huid;
+ l_tempSensorList[l_sensorHeader.count].id = 0;
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM;
l_tempSensorList[l_sensorHeader.count].value = vrfan->sample & 0xFF;
l_sensorHeader.count++;
@@ -358,24 +358,46 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
// Add GPU temperatures
for (k=0; k<MAX_NUM_GPU_PER_DOMAIN; k++)
{
- if(GPU_PRESENT(k)) // temp until GPU sensor IDs are sent make sensor ids "GPU"<gpu#>
+ if(GPU_PRESENT(k))
{
// GPU core temperature
- if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp
- l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid;
- else
- l_tempSensorList[l_sensorHeader.count].id = 0xC6 + (9 * G_pbax_id.chip_id) + (k*3); // temp
+ l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid;
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU;
- l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF;
+ if(g_amec->gpu[k].status.coreTempFailure)
+ {
+ // failed to read core temperature return 0xFF
+ l_tempSensorList[l_sensorHeader.count].value = 0xFF;
+ }
+ else if(g_amec->gpu[k].status.coreTempNotAvailable)
+ {
+ // core temperature not available return 0
+ l_tempSensorList[l_sensorHeader.count].value = 0;
+ }
+ else
+ {
+ // have a good core temperature return the reading
+ l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF;
+ }
l_sensorHeader.count++;
// GPU memory temperature
- if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp
- l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid;
- else
- l_tempSensorList[l_sensorHeader.count].id = 0xC7 + (9 * G_pbax_id.chip_id) + (k*3); // temp
+ l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid;
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU_MEM;
- l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF;
+ if(g_amec->gpu[k].status.memTempFailure)
+ {
+ // failed to read memory temperature return 0xFF
+ l_tempSensorList[l_sensorHeader.count].value = 0xFF;
+ }
+ else if(g_amec->gpu[k].status.memTempNotAvailable)
+ {
+ // memory temperature not available return 0
+ l_tempSensorList[l_sensorHeader.count].value = 0;
+ }
+ else
+ {
+ // have a good memory temperature return the reading
+ l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF;
+ }
l_sensorHeader.count++;
}
}
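For reference, a minimal sketch (not part of the commit; the helper name and parameters are illustrative) of the reporting convention this hunk applies to both the core and memory temperatures in the poll response: 0xFF when the read has failed, 0 when the temperature is not available, otherwise the low byte of the sensor sample:

    #include <stdbool.h>
    #include <stdint.h>

    /* Encode one GPU temperature for the poll response, mirroring cmdh_poll_v20() above. */
    static uint8_t encode_gpu_temp_sketch(bool temp_failure,
                                          bool temp_not_available,
                                          uint16_t sensor_sample)
    {
        if (temp_failure)
        {
            return 0xFF;   /* timeout failure reading the temperature */
        }
        if (temp_not_available)
        {
            return 0;      /* not available, e.g. GPU in reset or driver not loaded */
        }
        return (uint8_t)(sensor_sample & 0xFF);   /* good reading: low byte of the sample */
    }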
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 8666e12..1a27565 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -51,18 +51,25 @@
#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) // Number calls with assumption called every other tick
-// Time in seconds to ignore errors from the start of GPU SM
-// Right now this time must include PRST and GPU init time
-// this may be reduced after adding in OS interlock for PRST
-#define GPU_COMM_ESTAB_TIMEOUT_SECONDS 600
+// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time
+#define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds
-#define MAX_CONSECUTIVE_GPU_RESETS 3
+#define MAX_CONSECUTIVE_GPU_RESETS 5
#define MAX_GPU_RESET_STATE_RETRY 3
#define MAX_RESET_STATE_NOT_DONE_COUNT 100
#define MAX_GPU_READ_ATTEMPT 3
+#define GPU_ERRORS_BEFORE_I2C_RESET 5
+
+// consecutive error counts for GPU command failures before error is logged if GPU is not in reset
+#define GPU_CHECK_DRIVER_ERROR_COUNT 5
+#define GPU_READ_MEM_CAP_ERROR_COUNT 5
+#define GPU_READ_PWR_LIMIT_ERROR_COUNT 5
+#define GPU_SET_PWR_LIMIT_ERROR_COUNT 5
+
#define GPU_I2C_ENGINE PIB_I2C_ENGINE_C
extern data_cnfg_t * G_data_cnfg;
+extern PWR_READING_TYPE G_pwr_reading_type;
// this is the global GPU task sm state each task within the GPU SM may have its own "state"
// to allow several calls to complete the task
@@ -71,7 +78,6 @@ gpuState_e G_gpu_state = GPU_STATE_IDLE;
bool G_gpu_monitoring_allowed = FALSE; // Set to true if GPU is present
bool G_gpu_i2c_reset_required = FALSE;
uint32_t G_gpu_reset_cause = 0;
-uint64_t G_gpu_sm_start_time = 0;
// GPE Requests
GpeRequest G_gpu_op_request;
@@ -82,11 +88,6 @@ GPE_BUFFER(gpu_sm_args_t G_gpu_op_req_args);
gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};
uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed
-bool G_gpu_read_issued = false;
-
-// Read OCC_MISC register to see if an I2C interrupt was generated for
-// the specified engine.
-bool check_for_i2c_interrupt(const uint8_t i_engine);
// Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
uint8_t get_first_gpu(void)
@@ -141,14 +142,47 @@ uint8_t gpu_id_need_driver_check(void)
uint8_t gpu_id = 0xFF; // default none needs checking
uint8_t i = 0;
- for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ // checking for driver loaded can be repeated until driver is loaded which may never happen
+ // to avoid infinite loop checking the same GPU over and over we will use a static and each call
+ // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs
+ static uint8_t L_current_gpu_id = 0;
+
+ if(L_current_gpu_id == 0xFF)
{
- if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkDriverLoaded))
- {
- gpu_id = i;
- break;
- }
+ // checked all GPUs once, do not check any this time and start over with GPU 0 on next call
+ L_current_gpu_id = 0;
}
+ else
+ {
+ for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ // only check for driver after i2c comm (readOnce) has been established
+ if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) &&
+ (g_amec->gpu[i].status.readOnce) && (g_amec->gpu[i].status.checkDriverLoaded))
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+
+ // setup L_current_gpu_id for next call based on what is happening this time
+ if(gpu_id == 0xFF)
+ {
+ // no GPU needs checking start back at 0 next time
+ L_current_gpu_id = 0;
+ }
+ else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) )
+ {
+ // last GPU is having driver checked do not check any next time
+ L_current_gpu_id = 0xFF;
+ }
+ else
+ {
+ // next time look at the next GPU ID first
+ L_current_gpu_id = gpu_id + 1;
+ }
+ }
+
return gpu_id;
}
@@ -157,13 +191,45 @@ uint8_t gpu_id_need_memory_temp_capability_check(void)
uint8_t gpu_id = 0xFF; // default none needs checking
uint8_t i = 0;
- for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ // checking for memory temp capability will be repeated until memory temp is capable which may never happen
+ // to avoid infinite loop checking the same GPU over and over we will use a static and each call
+ // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs
+ static uint8_t L_current_gpu_id = 0;
+
+ if(L_current_gpu_id == 0xFF)
{
- if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkMemTempSupport))
- {
- gpu_id = i;
- break;
- }
+ // checked all GPUs once, do not check any this time and start over with GPU 0 on next call
+ L_current_gpu_id = 0;
+ }
+ else
+ {
+ for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ // driver must be loaded for memory temp capability
+ if( (!g_amec->gpu[i].status.disabled) && (g_amec->gpu[i].status.driverLoaded) &&
+ (g_amec->gpu[i].status.checkMemTempSupport) )
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+
+ // setup L_current_gpu_id for next call based on what is happening this time
+ if(gpu_id == 0xFF)
+ {
+ // no GPU needs checking start back at 0 next time
+ L_current_gpu_id = 0;
+ }
+ else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) )
+ {
+ // last GPU is having memory capability checked do not check any next time
+ L_current_gpu_id = 0xFF;
+ }
+ else
+ {
+ // next time look at the next GPU ID first
+ L_current_gpu_id = gpu_id + 1;
+ }
}
return gpu_id;
}
@@ -178,7 +244,7 @@ uint8_t get_first_mem_temp_capable_gpu(void)
for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
{
if( (!g_amec->gpu[i].status.disabled) &&
- (g_amec->gpu[i].status.memTempSupported) )
+ (g_amec->gpu[i].status.memTempSupported) ) // memTempSupported implies that driver is loaded
{
first_gpu = i;
break;
@@ -202,7 +268,7 @@ uint8_t get_next_mem_temp_capable_gpu(void)
next_gpu = 0;
}
if( (!g_amec->gpu[next_gpu].status.disabled) &&
- (g_amec->gpu[next_gpu].status.memTempSupported) )
+ (g_amec->gpu[next_gpu].status.memTempSupported) ) // memTempSupported implies that driver is loaded
{
break;
}
@@ -213,6 +279,10 @@ uint8_t get_next_mem_temp_capable_gpu(void)
{
next_gpu = 0xFF;
}
+ else if( (next_gpu != 0xFF) && (!g_amec->gpu[next_gpu].status.memTempSupported) )
+ {
+ next_gpu = 0xFF;
+ }
return next_gpu;
}
@@ -231,8 +301,16 @@ uint8_t gpu_id_need_power_limits(void)
if( (g_amec->gpu[i].status.driverLoaded) &&
(g_amec->gpu[i].pcap.check_pwr_limit))
{
- gpu_id = i;
- break;
+ // If there is no power capping support skip reading power limits
+ if(G_pwr_reading_type == PWR_READING_TYPE_NONE)
+ {
+ g_amec->gpu[i].pcap.check_pwr_limit = false;
+ }
+ else
+ {
+ gpu_id = i;
+ break;
+ }
}
}
return gpu_id;
@@ -247,8 +325,9 @@ uint8_t gpu_id_need_set_power_limit(void)
for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
{
- // to set power limit requires that the driver is loaded
- if( (g_amec->gpu[i].status.driverLoaded) &&
+ // to set power limit requires that the driver is loaded and power limits were read
+ if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) &&
+ (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) &&
(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) )
{
gpu_id = i;
@@ -258,6 +337,97 @@ uint8_t gpu_id_need_set_power_limit(void)
return gpu_id;
}
+// For the given GPU clear status/data that requires GPU driver to be loaded
+void clear_gpu_driver_status(uint8_t i_gpu_num)
+{
+ g_amec->gpu[i_gpu_num].status.checkDriverLoaded = false;
+ g_amec->gpu[i_gpu_num].status.driverLoaded = false;
+
+ // Reading memory temperature requires driver to be loaded.
+ g_amec->gpu[i_gpu_num].status.checkMemTempSupport = false;
+ g_amec->gpu[i_gpu_num].status.memTempSupported = false;
+ g_amec->gpu[i_gpu_num].status.memErrorCount = 0;
+
+ // Power capping requires driver to be loaded. Clear GPU power limits
+ g_amec->gpu[i_gpu_num].pcap.check_pwr_limit = false;
+ g_amec->gpu[i_gpu_num].pcap.pwr_limits_read = false;
+ g_amec->gpu[i_gpu_num].pcap.set_failed = false;
+ g_amec->gpu[i_gpu_num].pcap.gpu_min_pcap_mw = 0;
+ g_amec->gpu[i_gpu_num].pcap.gpu_max_pcap_mw = 0;
+ g_amec->gpu[i_gpu_num].pcap.gpu_requested_pcap_mw = 0;
+ g_amec->gpu[i_gpu_num].pcap.gpu_default_pcap_mw = 0;
+ //amec will need to recalculate after power limits are read to handle any clipping with new GPU min/max
+ g_amec->gpu[i_gpu_num].pcap.gpu_desired_pcap_mw = 0;
+}
+
+// Handles GPU not able to process request due to driver load or un-load
+void handle_driver_change(void)
+{
+ // Clear out driver status while driver change completes and is determined if loaded/un-loaded
+ clear_gpu_driver_status(G_current_gpu_id);
+
+ // memory temp only available when driver is loaded. clear error and set not available
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // when driver change is complete we must re-query to see if driver is loaded or not
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true;
+}
+
+// For all GPUs read GPU reset status and take action if reset status has changed
+void update_gpu_reset_status(void)
+{
+ uint8_t gpu_num = 0;
+
+ // GPU reset status is in the OCC FLAGS register and is updated by OPAL
+ // Read the current reset status for all GPUs. A reset status of '1' indicates NOT in reset
+ ocb_occflg_t occ_flags = {0};
+ occ_flags.value = in32(OCB_OCCFLG);
+ bool not_in_reset[3] = {occ_flags.fields.gpu0_reset_status,
+ occ_flags.fields.gpu1_reset_status,
+ occ_flags.fields.gpu2_reset_status};
+
+ // reset status of '0' (IN reset) is the default
+ // the OCC will still try to read GPU when IN reset but will not log errors
+ // this is so we still communicate with the GPUs without OPAL support to indicate
+ // a GPU is not in reset.
+ // Full OCC support below for when OPAL starts updating the reset status
+ for (gpu_num=0; gpu_num<MAX_NUM_GPU_PER_DOMAIN; gpu_num++)
+ {
+ if(not_in_reset[gpu_num] != g_amec->gpu[gpu_num].status.notReset)
+ {
+ INTR_TRAC_IMP("update_gpu_reset_status: GPU%d NOT in reset is now = %d",
+ gpu_num,
+ not_in_reset[gpu_num]);
+
+ // There has been a change to the reset status clear everything out except for errors logged so we don't log again
+ clear_gpu_driver_status(gpu_num);
+ g_amec->gpu[gpu_num].status.errorCount = 0;
+ g_amec->gpu[gpu_num].status.retryCount = 0;
+
+ // readOnce of false will force comm to be established and once established then checkDriverLoaded will get set
+ g_amec->gpu[gpu_num].status.readOnce = false;
+
+ if(not_in_reset[gpu_num])
+ {
+ // GPU was taken out of reset clear disabled to allow communication again
+ g_amec->gpu[gpu_num].status.disabled = false;
+ }
+ else
+ {
+ // GPU was put in reset. Clear temperature sensor errors and set to not available
+ g_amec->gpu[gpu_num].status.coreTempFailure = false;
+ g_amec->gpu[gpu_num].status.coreTempNotAvailable = true;
+ g_amec->gpu[gpu_num].status.memTempFailure = false;
+ g_amec->gpu[gpu_num].status.memTempNotAvailable = true;
+ }
+
+ g_amec->gpu[gpu_num].status.notReset = not_in_reset[gpu_num];
+
+ } // if GPU reset status changed
+ } // for each GPU
+} // end update_gpu_reset_status()
+
// Disable GPU monitoring for all GPUs
void disable_all_gpus(void)
{
@@ -327,95 +497,103 @@ void gpu_ipc_init()
}
}
-// Called after a failure for a specified GPU. The error will
+// Called after a failure reading core temp for a specified GPU. The error will
// be counted and if threshold is reached, an error will be created with
-// the GPU as a callout and then set flag to force reset
+// the GPU as a callout if the GPU is not in reset
void mark_gpu_failed(const gpu_sm_args_t *i_arg)
{
uint32_t gpu_id = i_arg->gpu_id;
do
{
- // ignore all errors if haven't reached timeout for comm established
- if( (false == g_amec->gpu[gpu_id].status.readOnce) &&
- (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) )
- {
- // do nothing but reset at this time
- break;
- }
if((false == g_amec->gpu[gpu_id].status.disabled) &&
(true == g_amec->gpu[gpu_id].status.readOnce))
{
- INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
- "(ffdc 0x%08X%08X)",
- gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
- WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
+ GPU_DBG("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
+ "(ffdc 0x%08X%08X)",
+ gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
+ WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
}
- if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) &&
- (false == g_amec->gpu[gpu_id].status.disabled) &&
- (true == g_amec->gpu[gpu_id].status.readOnce))
+ // Always inc retry count for I2C reset regardless of if GPU is in reset or not
+ g_amec->gpu[gpu_id].status.retryCount++;
+
+ // Only inc error count if it is known that GPU is NOT in reset
+ // NOTE: Default is IN reset so this will only be true when OPAL/OS supports telling the OCC reset status
+ // if OS never tells the OCC reset status the OCC will never disable or log a comm error
+ if(g_amec->gpu[gpu_id].status.notReset)
{
- G_gpu_state = GPU_STATE_IDLE;
-
- // Something has gone wrong and it may be that OPAL has put
- // the GPU into reset. For now, if this happens we will just
- // continue polling the GPU until it comes back.
- g_amec->gpu[gpu_id].status.readOnce = false;
- g_amec->gpu[gpu_id].status.checkDriverLoaded = true;
- g_amec->gpu[gpu_id].status.driverLoaded = false;
- g_amec->gpu[gpu_id].status.checkMemTempSupport = true;
- g_amec->gpu[gpu_id].status.memTempSupported = false;
- g_amec->gpu[gpu_id].status.memErrorCount = 0;
- g_amec->gpu[gpu_id].status.errorCount = 0;
-
-// This code can be used if an interlock with OPAL is ever introduced
-#if 0
- // Disable this GPU, collect FFDC and log error
- g_amec->gpu[gpu_id].status.disabled = true;
-
- INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
- gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
- errlHndl_t l_err = NULL;
- /*
- * @errortype
- * @moduleid GPU_MID_MARK_GPU_FAILED
- * @reasoncode GPU_FAILURE
- * @userdata1 GPE returned rc code
- * @userdata4 ERC_GPU_COMPLETE_FAILURE
- * @devdesc GPU failure
- */
- l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
- GPU_FAILURE,
- ERC_GPU_COMPLETE_FAILURE,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- i_arg->error.rc,
- 0);
- addUsrDtlsToErrl(l_err,
- (uint8_t*)&i_arg->error.ffdc,
- sizeof(i_arg->error.ffdc),
- ERRL_STRUCT_VERSION_1,
- ERRL_USR_DTL_BINARY_DATA);
-
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ // INC count and check if reached error threshold
+ if( ++g_amec->gpu[gpu_id].status.errorCount > GPU_INIT_ERROR_COUNT)
{
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_GPU_ID,
- G_sysConfigData.gpu_sensor_ids[gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ // set that GPU temperature readings failed
+ g_amec->gpu[gpu_id].status.memTempFailure = true;
+ g_amec->gpu[gpu_id].status.memTempNotAvailable = true;
+ g_amec->gpu[gpu_id].status.coreTempFailure = true;
+ g_amec->gpu[gpu_id].status.coreTempNotAvailable = true;
+
+ // Disable this GPU. GPU will get re-enabled if detected that GPU is put in reset and then taken out
+ g_amec->gpu[gpu_id].status.disabled = true;
+
+ INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
+ gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
+
+ if(g_amec->gpu[gpu_id].status.commErrorLogged == false)
+ {
+
+ errlHndl_t l_err = NULL;
+ /*
+ * @errortype
+ * @moduleid GPU_MID_MARK_GPU_FAILED
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPE returned rc code
+ * @userdata4 ERC_GPU_COMPLETE_FAILURE
+ * @devdesc GPU failure
+ */
+ l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
+ GPU_FAILURE,
+ ERC_GPU_COMPLETE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ i_arg->error.rc,
+ 0);
+ addUsrDtlsToErrl(l_err,
+ (uint8_t*)&i_arg->error.ffdc,
+ sizeof(i_arg->error.ffdc),
+ ERRL_STRUCT_VERSION_1,
+ ERRL_USR_DTL_BINARY_DATA);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ commitErrl(&l_err);
+ g_amec->gpu[gpu_id].status.commErrorLogged = true;
+
+ } // if !commErrorLogged
+
+ } // if errorCount > threshold
+
+ } // if notReset
- commitErrl(&l_err);
-#endif
- }
} while(0);
- // Reset GPU
- G_gpu_i2c_reset_required = true;
- G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF);
+ // Do an I2C reset if reached retry count
+ // don't want to do I2C reset every time since could be that this GPU really is in reset and
+ // while resetting I2C we are unable to read other GPUs that may not be in reset
+ if( g_amec->gpu[gpu_id].status.retryCount > GPU_ERRORS_BEFORE_I2C_RESET)
+ {
+ g_amec->gpu[gpu_id].status.retryCount = 0;
+ G_gpu_i2c_reset_required = true;
+ G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF);
+ }
+
} // end mark_gpu_failed()
// Schedule a GPE request for GPU operation
@@ -461,6 +639,27 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
case GPU_REQ_READ_MEM_TEMP_FINISH:
break;
+ // Check if driver is loaded
+ case GPU_REQ_CHECK_DRIVER_START:
+ case GPU_REQ_CHECK_DRIVER_2:
+ case GPU_REQ_CHECK_DRIVER_3:
+ case GPU_REQ_CHECK_DRIVER_FINISH:
+ break;
+
+ // Read GPU Power Limit
+ case GPU_REQ_READ_PWR_LIMIT_START:
+ case GPU_REQ_READ_PWR_LIMIT_2:
+ case GPU_REQ_READ_PWR_LIMIT_3:
+ case GPU_REQ_READ_PWR_LIMIT_FINISH:
+ break;
+
+ // Set GPU Power Limit
+ case GPU_REQ_SET_PWR_LIMIT_START:
+ case GPU_REQ_SET_PWR_LIMIT_2:
+ case GPU_REQ_SET_PWR_LIMIT_3:
+ case GPU_REQ_SET_PWR_LIMIT_FINISH:
+ break;
+
// I2C reset
case GPU_REQ_RESET:
break;
@@ -498,6 +697,7 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
{
// Clear errors and init common arguments for GPE
G_gpu_op_req_args.error.error = 0;
+ G_gpu_op_req_args.gpu_rc = 0;
G_gpu_op_req_args.operation = i_operation;
G_gpu_op_req_args.gpu_id = G_current_gpu_id;
@@ -574,14 +774,14 @@ bool gpu_reset_sm()
/*
* @errortype
* @moduleid GPU_MID_GPU_RESET_SM
- * @reasoncode GPU_FAILURE
+ * @reasoncode GPU_NO_GPE_SUPPORT
* @userdata1 0
* @userdata2 0
* @userdata4 ERC_GPU_NO_GPE_SUPPORT
* @devdesc GPE1 image doesn't support GPU communication
*/
errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
- GPU_FAILURE,
+ GPU_NO_GPE_SUPPORT,
ERC_GPU_NO_GPE_SUPPORT,
ERRL_SEV_UNRECOVERABLE,
NULL,
@@ -606,9 +806,7 @@ bool gpu_reset_sm()
else // this reset attempt failed
{
// Stop trying if reached max resets
- if( (L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS) &&
- (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) >=
- GPU_COMM_ESTAB_TIMEOUT_SECONDS))
+ if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS)
{
INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X",
L_reset_state);
@@ -662,12 +860,12 @@ bool gpu_reset_sm()
break;
case GPU_RESET_STATE_RESET_MASTER:
- G_new_gpu_req_args.data = GPU_RESET_REQ_MASTER;
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER;
L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
break;
case GPU_RESET_STATE_RESET_SLAVE:
- G_new_gpu_req_args.data = GPU_RESET_REQ_SLV;
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV;
L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
break;
@@ -677,7 +875,7 @@ bool gpu_reset_sm()
break;
case GPU_RESET_STATE_RESET_SLAVE_COMPLETE:
- G_new_gpu_req_args.data = GPU_RESET_REQ_SLV_COMPLETE;
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE;
L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
break;
@@ -718,6 +916,555 @@ bool gpu_reset_sm()
// Function Specification
//
+// Name: gpu_check_driver_loaded_sm
+//
+// Description: Called from gpu_task_sm to check if driver is loaded for G_current_gpu_id
+// This function should only return that complete is TRUE when the check
+// is complete (or determined failed) and ready for a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to check
+//
+// End Function Specification
+bool gpu_check_driver_loaded_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ bool l_new_driver_loaded = FALSE;
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_check_driver_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint8_t L_state_failure_count = 0;
+ static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_check_driver_state != GPU_STATE_CHECK_DRIVER_LOADED_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
+ {
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_check_driver_failure_count[G_current_gpu_id] = 0; // clear driver failure count since there's a driver change
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached state retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
+ {
+ if(++L_check_driver_failure_count[G_current_gpu_id] > GPU_CHECK_DRIVER_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_check_driver_loaded: Failed to check driver loaded for GPU%d RC: 0x%02X",
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
+
+ // give up checking driver loaded for this GPU
+ // It will be retried if detected that GPU is put in reset and then taken out
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false;
+ L_check_driver_failure_count[G_current_gpu_id] = 0;
+
+ // without driver loaded cannot read memory temp, mark memory temp as failed
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // log one time error that driver loaded couldn't be determined
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_CHECK_DRIVER_LOADED
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_CHECK_DRIVER_LOADED_FAILURE
+ * @devdesc Failure to check GPU driver loaded
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_CHECK_DRIVER_LOADED,
+ GPU_FAILURE,
+ ERC_GPU_CHECK_DRIVER_LOADED_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
+
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ L_state_failure_count = 0;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ } // if reached state retry count
+ else
+ {
+ // INC failure count and retry current state
+ L_state_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_state_failure_count = 0;
+ L_check_driver_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_check_driver_state)
+ {
+ case GPU_STATE_CHECK_DRIVER_LOADED_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_3, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_FINISH, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE:
+ // Update driver loaded
+ l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01;
+ if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded)
+ {
+ // Driver loaded status changed
+ INTR_TRAC_IMP("gpu_check_driver_loaded: GPU%d driver loaded changed to %d",
+ G_current_gpu_id,
+ l_new_driver_loaded);
+
+ if(l_new_driver_loaded)
+ {
+ // Driver is now loaded do checking that required driver to be loaded
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = true;
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
+ // done checking for driver to be loaded
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false;
+ }
+ else
+ {
+ // Driver is no longer loaded
+ clear_gpu_driver_status(G_current_gpu_id);
+
+ // memory temp only available when driver is loaded
+ // clear error and set not available
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // Need to keep query for driver loaded to detect when driver is loaded
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true;
+ }
+
+ g_amec->gpu[G_current_gpu_id].status.driverLoaded = l_new_driver_loaded;
+ }
+
+ // Done with this GPU ready to move to new one
+ L_check_driver_failure_count[G_current_gpu_id] = 0;
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_check_driver_loaded: INVALID STATE: 0x%02X", L_check_driver_state);
+ L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_check_driver_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_check_driver_loaded: Scheduled check driver loaded state 0x%02X at tick %d",
+ L_check_driver_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_check_driver_loaded: failed to schedule state 0x%02X", L_check_driver_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_check_driver_loaded: NOT idle for state 0x%02X", L_check_driver_state);
+ }
+
+ return l_complete;
+} // end gpu_check_driver_loaded_sm()
+
+// Function Specification
+//
+// Name: gpu_read_pwr_limit_sm
+//
+// Description: Called from gpu_task_sm to read GPU power limits for G_current_gpu_id
+// This function should only return that complete is TRUE when the read
+// is complete (or determined failed) and ready for a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read
+//
+// End Function Specification
+bool gpu_read_pwr_limit_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_read_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint8_t L_state_failure_count = 0;
+ static gpuReadPwrLimitState_e L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
+ {
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
+ {
+ if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X",
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
+
+ // give up trying to read power limits for this GPU
+ // It will be retried if detected that GPU is put in reset and then taken out
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false;
+ L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;
+
+ // log one time error that power limits could not be read
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_PWR_LIMIT
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_READ_PWR_LIMIT_FAILURE
+ * @devdesc Failure to read GPU power limits
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT,
+ GPU_FAILURE,
+ ERC_GPU_READ_PWR_LIMIT_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
+
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ L_state_failure_count = 0;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ } // if reached retry count
+ else
+ {
+ // INC failure count and retry current state
+ L_state_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_state_failure_count = 0;
+ L_read_pwr_limit_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_read_pwr_limit_state)
+ {
+ case GPU_STATE_READ_PWR_LIMIT_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_3, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_FINISH, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT_COMPLETE:
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
+ // Update power limits
+ g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE;
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0];
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw = (uint32_t) G_gpu_op_req_args.data[1];
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw = (uint32_t) G_gpu_op_req_args.data[2];
+
+ // Done with this GPU ready to move to new one
+ L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state);
+ L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_read_pwr_limit_state
+
+ if(L_scheduled)
+ {
+        GPU_DBG("gpu_read_pwr_limit: Scheduled read power limit state 0x%02X at tick %d",
+ L_read_pwr_limit_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_read_pwr_limit: NOT idle for state 0x%02X", L_read_pwr_limit_state);
+ }
+
+ return l_complete;
+} // end gpu_read_pwr_limit_sm()
+
+// Function Specification
+//
+// Name: gpu_set_pwr_limit_sm
+//
+// Description: Called from gpu_task_sm to set GPU power limit for G_current_gpu_id
+// This function should only return that complete is TRUE when the set
+// is complete (or determined failed) and ready for a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read
+//
+// End Function Specification
+bool gpu_set_pwr_limit_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when complete or failed
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_state_failure_count = 0;
+ static uint8_t L_set_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static gpuSetPwrLimitState_e L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new set limit then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
+ {
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
+ {
+ if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X",
+ G_gpu_op_req_args.data[0],
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
+
+ // give up trying to set power limit for this GPU
+ // It will be retried if detected that GPU is put in reset and then taken out or driver change
+ g_amec->gpu[G_current_gpu_id].pcap.set_failed = true;
+ L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
+
+ // log error that power limit could not be set
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_SET_PWR_LIMIT
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_SET_PWR_LIMIT_FAILURE
+ * @devdesc Failure to set GPU power limit
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT,
+ GPU_FAILURE,
+ ERC_GPU_SET_PWR_LIMIT_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
+
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ L_state_failure_count = 0;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ } // if reached retry count
+ else
+ {
+ // INC failure count and retry current state
+ L_state_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_state_failure_count = 0;
+ L_set_pwr_limit_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_set_pwr_limit_state)
+ {
+ case GPU_STATE_SET_PWR_LIMIT_START:
+ // send the desired GPU power cap to the GPE to send to GPU
+ G_new_gpu_req_args.data[0] = g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw;
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_2:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_3:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_FINISH, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT_COMPLETE:
+ // Update the requested power limit since it was successfully sent
+ // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC
+        //       has calculated a new desired pcap while this one was already in process of being set
+ g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0];
+
+ // Done with this GPU ready to move to new one
+ L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state);
+ L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_set_pwr_limit_state
+
+ if(L_scheduled)
+ {
+        GPU_DBG("gpu_set_pwr_limit: Scheduled set power limit state 0x%02X at tick %d",
+ L_set_pwr_limit_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state);
+ }
+
+ return l_complete;
+} // end gpu_set_pwr_limit_sm()
+
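All of the gpu_*_sm() helpers in this patch follow the same schedule/poll/retry rhythm. The sketch below condenses that pattern; sm_step(), schedule_step(), STEP_* and MAX_ATTEMPTS are hypothetical stand-ins for the real state enums, schedule_gpu_req() and MAX_GPU_READ_ATTEMPT, and the driver-change and error-log handling is elided.

/* Minimal, hypothetical sketch of the retry pattern shared by the gpu_*_sm() helpers. */
#include <stdbool.h>
#include <stdint.h>

#define MAX_ATTEMPTS 3

typedef enum { STEP_NEW, STEP_START, STEP_2, STEP_3, STEP_FINISH, STEP_COMPLETE } step_t;

static bool schedule_step(step_t s) { (void)s; return true; } /* stand-in for schedule_gpu_req() */

bool sm_step(bool prev_step_ok)   /* returns true when done with the current GPU */
{
    static step_t  L_state = STEP_NEW;
    static uint8_t L_state_failure_count = 0;

    if (!prev_step_ok && (L_state != STEP_NEW))
    {
        if (L_state_failure_count > MAX_ATTEMPTS)
        {
            L_state = STEP_NEW;          /* give up; a fresh attempt starts later */
            L_state_failure_count = 0;
            return true;                 /* done with this GPU, SM moves on */
        }
        L_state_failure_count++;         /* retry the same state on this pass */
    }
    else
    {
        L_state_failure_count = 0;       /* previous step succeeded */
        L_state++;                       /* advance to the next state */
    }

    if (L_state == STEP_COMPLETE)
    {
        L_state = STEP_NEW;              /* consume the result, reset for the next GPU */
        return true;
    }

    schedule_step(L_state);              /* kick off the I2C work for this state */
    return false;                        /* not done; poll again on the next tick */
}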
+// Function Specification
+//
// Name: gpu_read_temp_sm
//
// Description: Called from gpu_task_sm to read GPU core temperature of G_current_gpu_id
@@ -750,6 +1497,7 @@ bool gpu_read_temp_sm()
mark_gpu_failed(&G_gpu_op_req_args);
L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ L_read_failure_count = 0;
return TRUE; // Done with this GPU, let GPU SM move to next
}
else
@@ -778,7 +1526,7 @@ bool gpu_read_temp_sm()
case GPU_STATE_READ_TEMP_COMPLETE:
if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
- (0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp?
+ (0 != G_gpu_op_req_args.data[0]) )
{
g_amec->gpu[G_current_gpu_id].status.readOnce = true;
@@ -791,15 +1539,17 @@ bool gpu_read_temp_sm()
}
// comm is now established update for capability checking to take place
- g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
}
// Update sensor
- l_temp = G_gpu_op_req_args.data;
+ l_temp = G_gpu_op_req_args.data[0];
sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
// Clear all past errors
+ g_amec->gpu[G_current_gpu_id].status.coreTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.coreTempNotAvailable = false;
g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
+ g_amec->gpu[G_current_gpu_id].status.retryCount = 0;
// check if there is an overtemp that hasn't been reported
if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
@@ -897,8 +1647,10 @@ bool gpu_read_mem_temp_capability_sm()
{
bool l_complete = FALSE; // only return TRUE when the read is complete or failed
static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
- static uint8_t L_read_failure_count = 0;
+ static uint8_t L_read_mem_cap_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+ static uint8_t L_state_failure_count = 0;
static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
if (async_request_is_idle(&G_gpu_op_request.request))
{
@@ -907,57 +1659,91 @@ bool gpu_read_mem_temp_capability_sm()
if( (L_read_cap_state != GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW) &&
(!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
{
- // If reached retry count give up on this read
- if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
- // log error that memory temp capability couldn't be determined
- // memory temp support will be left as not supported
- INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d", G_current_gpu_id);
-
- // Log error
- /* @
- * @errortype
- * @moduleid GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
- * @reasoncode GPU_FAILURE
- * @userdata1 GPU ID
- * @userdata2 0
- * @userdata4 ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
- * @devdesc Failure to read GPU memory temp capability
- *
- */
- errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
- GPU_FAILURE,
- ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- G_current_gpu_id,
- 0);
-
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_state_failure_count = 0;
+ L_read_mem_cap_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+ L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+ // If reached state retry count give up on this read
+ else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // if GPU is not in reset then INC error count and check if reached threshold
+ if(g_amec->gpu[G_current_gpu_id].status.notReset)
{
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_GPU_ID,
- G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ if(++L_read_mem_cap_failure_count[G_current_gpu_id] > GPU_READ_MEM_CAP_ERROR_COUNT)
+ {
+ INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d RC: 0x%02X",
+ G_current_gpu_id,
+ G_gpu_op_req_args.gpu_rc);
- // Commit Error
- commitErrl(&l_err);
+ // give up trying to read mem temp capability for this GPU
+ // It will be retried if the GPU driver is detected to be re-loaded
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+ L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
+
+ // cannot determine memory temp capability, mark memory temp as failed
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+ // log a one-time error that memory temp capability couldn't be determined
+ if(!L_error_logged[G_current_gpu_id])
+ {
+ L_error_logged[G_current_gpu_id] = TRUE;
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
+ * @devdesc Failure to read memory temp capability
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
+ GPU_FAILURE,
+ ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ } // if error not logged
+ } // if reached error count
+ } // if notReset
L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ L_state_failure_count = 0;
return TRUE; // Done with this GPU, let GPU SM move to next
- }
+ } // if reached state retry count
else
{
// INC failure count and retry current state
- L_read_failure_count++;
+ L_state_failure_count++;
}
}
else // success on last state go to next state and process it
{
- L_read_failure_count = 0;
+ L_state_failure_count = 0;
L_read_cap_state++;
}
@@ -983,8 +1769,21 @@ bool gpu_read_mem_temp_capability_sm()
case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
// Update capability
- g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data & 0x01;
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
+
+ if(g_amec->gpu[G_current_gpu_id].status.memTempSupported)
+ {
+ // mem temp is supported, no need to re-check capability
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+ }
+ else
+ {
+ // Need to keep query for mem temp capability to detect if ever changes to capable
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
+ }
+
// Done with this GPU ready to move to new one
+ L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
l_complete = TRUE;
break;
@@ -1041,60 +1840,95 @@ bool gpu_read_memory_temp_sm()
if( (L_read_temp_state != GPU_STATE_READ_MEM_TEMP_NEW) &&
(!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
{
- // If reached retry count give up on this read
- if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ // Check if failure was due to driver change
+ if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
- // INC memory error count and check if reached timeout threshold for new mem temp
- uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
- g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
- if((max_read_timeout) && (max_read_timeout != 0xFF) &&
- (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+ handle_driver_change();
+ // Request can't be processed by GPU at this time so we are done with this GPU
+ // setup to start new request
+ L_read_failure_count = 0;
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
+ L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+
+ // If reached retry count or GPU indicated cmd not supported then give up on this read
+ else if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
+ (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
+ {
+ // if GPU is not in reset or the GPU responded with command not supported then
+ // INC memory error count and check if reached timeout for new mem temp
+ if( (g_amec->gpu[G_current_gpu_id].status.notReset) ||
+ (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
{
- // Disable memory temp reading for this GPU and log error
- g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
- // so BMC knows there is an error for fan control set sensor to 0xFF
- sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), 0xFFFF);
-
- INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
- G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
-
- // Log error
- /* @
- * @errortype
- * @moduleid GPU_MID_GPU_READ_MEM_TEMP
- * @reasoncode GPU_FAILURE
- * @userdata1 GPU ID
- * @userdata2 number consecutive read mem temp failures
- * @userdata4 ERC_GPU_READ_MEM_TEMP_TIMEOUT
- * @devdesc Timeout reading new GPU memory temperature
- *
- */
- errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
- GPU_FAILURE,
- ERC_GPU_READ_MEM_TEMP_TIMEOUT,
- ERRL_SEV_PREDICTIVE,
- NULL,
- DEFAULT_TRACE_SIZE,
- G_current_gpu_id,
- g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
- // Callout the GPU if have sensor ID for it
- if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
- {
- addCalloutToErrl(l_err,
- ERRL_CALLOUT_TYPE_GPU_ID,
- G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
- ERRL_CALLOUT_PRIORITY_MED);
- }
+ uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
+ if((max_read_timeout) && (max_read_timeout != 0xFF) &&
+ (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+ {
+ // Disable memory temp reading for this GPU and log error
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
+ // set failure status so the BMC knows there is an error for fan control
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
- // Commit Error
- commitErrl(&l_err);
+ INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
+ G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
- } // if timeout error
+ if(g_amec->gpu[G_current_gpu_id].status.commErrorLogged == false)
+ {
+ INTR_TRAC_ERR("notReset: %d rc: 0x%02X", g_amec->gpu[G_current_gpu_id].status.notReset,
+ G_gpu_op_req_args.error.rc);
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU RC
+ * @userdata2 number of consecutive read mem temp failures
+ * @userdata4 ERC_GPU_READ_MEM_TEMP_TIMEOUT
+ * @devdesc Timeout reading new GPU memory temperature
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
+ GPU_FAILURE,
+ ERC_GPU_READ_MEM_TEMP_TIMEOUT,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_gpu_op_req_args.gpu_rc,
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_GPU_ID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+ g_amec->gpu[G_current_gpu_id].status.commErrorLogged = true;
+ } // if !commErrorLogged
+ } // if timeout error
+ else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED)
+ {
+ // GPU indicated command not supported, re-check mem temp capability
+ // if we try to read mem temp again, that means mem temp was reported as capable,
+ // and if this continues to fail an error will eventually be logged above at timeout
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = false;
+ }
+ } // if notReset or command not supported
+ // setup to start new request
+ L_read_failure_count = 0;
L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
return TRUE; // Done with this GPU, let GPU SM move to next
- }
+ } // else if failure count exceeded or command not supported
else
{
// INC failure count and retry current state
@@ -1115,20 +1949,26 @@ bool gpu_read_memory_temp_sm()
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
break;
- case GPU_STATE_READ_MEM_TEMP_STOP:
+ case GPU_STATE_READ_MEM_TEMP_2:
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_2, G_new_gpu_req_args);
break;
- case GPU_STATE_READ_MEM_TEMP_READ:
+ case GPU_STATE_READ_MEM_TEMP_3:
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_3, G_new_gpu_req_args);
break;
+ case GPU_STATE_READ_MEM_TEMP_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_FINISH, G_new_gpu_req_args);
+ break;
+
case GPU_STATE_READ_MEM_TEMP_COMPLETE:
// Update sensor
- l_temp = G_gpu_op_req_args.data;
+ l_temp = G_gpu_op_req_args.data[0];
sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
// Clear past errors
+ g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+ g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = false;
g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
// check if there is an overtemp that hasn't been reported
@@ -1253,9 +2093,30 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
break;
}
-//TODO: Enable when functional
-#if 0
- // 3. Need to check if driver is loaded?
+ // 3. Time to start new temperature reads?
+ if(i_read_temp_start_needed)
+ {
+ // Start reading core temp from first present and functional GPU
+ l_gpu_id = get_first_gpu();
+ if(l_gpu_id != 0xFF)
+ {
+ // Read core temp for this GPU
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_READ_TEMP;
+ l_new_state = TRUE;
+ break;
+ }
+ else // no functional GPUs
+ {
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ G_gpu_monitoring_allowed = FALSE;
+ l_new_state = FALSE; // No new state for GPU communication
+ break;
+ }
+ }
+ // 4. Need to check if driver is loaded?
l_gpu_id = gpu_id_need_driver_check();
if(l_gpu_id != 0xFF)
{
@@ -1266,7 +2127,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
break;
}
- // 4. Need to read power limits?
+//TODO: Enable when functional
+#if 0
+ // 5. Need to read power limits?
l_gpu_id = gpu_id_need_power_limits();
if(l_gpu_id != 0xFF)
{
@@ -1276,8 +2139,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
l_new_state = TRUE;
break;
}
+#endif
- // 5. Need to read memory temps?
+ // 6. Need to read memory temps?
if(i_mem_temp_needed)
{
// first check if there is a GPU that needs memory temp capability checked
@@ -1304,31 +2168,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
}
}
}
-#endif
-
- // 6. Time to start new temperature reads?
- if(i_read_temp_start_needed)
- {
- // Start reading core temp from first present and functional GPU
- l_gpu_id = get_first_gpu();
- if(l_gpu_id != 0xFF)
- {
- // Read core temp for this GPU
- G_current_gpu_id = l_gpu_id;
- G_gpu_state = GPU_STATE_READ_TEMP;
- l_new_state = TRUE;
- break;
- }
- else // no functional GPUs
- {
- // release I2C lock to the host for this engine and stop monitoring
- occ_i2c_lock_release(GPU_I2C_ENGINE);
- G_gpu_state = GPU_STATE_NO_LOCK;
- G_gpu_monitoring_allowed = FALSE;
- l_new_state = FALSE; // No new state for GPU communication
- break;
- }
- }
// Else nothing stay idle
}while(0);
@@ -1361,12 +2200,13 @@ void task_gpu_sm(struct task *i_self)
// are functional or GPU I2C interface is broken
if(G_gpu_monitoring_allowed)
{
+ // Read and update reset status for all GPUs
+ update_gpu_reset_status();
// Initialize the IPC commands if this is our first run
if(L_gpu_first_run)
{
gpu_ipc_init();
- G_gpu_sm_start_time = ssx_timebase_get(); // used for timeout establishing comm
L_gpu_first_run = FALSE;
}
@@ -1379,6 +2219,7 @@ void task_gpu_sm(struct task *i_self)
if(L_numCallsForTempRead >= GPU_TEMP_READ_1S)
{
L_read_temp_start_needed = TRUE;
+ L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
}
}
@@ -1430,6 +2271,7 @@ void task_gpu_sm(struct task *i_self)
// Start first with reading core temp of first functional GPU
L_numCallsForTempRead = 0; // to track start of next temp reading in 1s
L_read_temp_start_needed = FALSE; // start is no longer needed
+ L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
l_gpu_id = get_first_gpu();
if(l_gpu_id != 0xFF)
{
@@ -1506,7 +2348,6 @@ void task_gpu_sm(struct task *i_self)
{
// Capability check complete for this GPU, go to IDLE state
// to let IDLE SM decide what to do next
- g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
G_gpu_state = GPU_STATE_IDLE;
l_start_next_state = TRUE;
}
@@ -1514,17 +2355,14 @@ void task_gpu_sm(struct task *i_self)
case GPU_STATE_CHECK_DRIVER_LOADED:
// Check if driver is loaded for current GPU
- if(1) // TODO
+ if(gpu_check_driver_loaded_sm())
{
- // Driver check complete for this GPU, go to IDLE state
- // to let IDLE SM decide what to do next
- g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE;
- g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE;
- if(g_amec->gpu[G_current_gpu_id].status.driverLoaded)
- {
- // Driver is loaded, read the power limits so we can start GPU power capping
- g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = TRUE;
- }
+ // Driver check complete for this GPU,
+ // NOTE: Do not set status.checkDriverLoaded to false here; if the driver is
+ // not loaded we need to keep checking for it to be loaded. That decision is
+ // made inside gpu_check_driver_loaded_sm()
+
+ // go to IDLE state to let IDLE SM decide what to do next
G_gpu_state = GPU_STATE_IDLE;
l_start_next_state = TRUE;
}
@@ -1532,11 +2370,10 @@ void task_gpu_sm(struct task *i_self)
case GPU_STATE_READ_PWR_LIMIT:
// Read power limits for current GPU
- if(1) // TODO read and set min/max GPU limit and set pwr_limits_read to TRUE if capping supported
+ if(gpu_read_pwr_limit_sm())
{
// Read power limits complete for this GPU, go to IDLE state
// to let IDLE SM decide what to do next
- g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
G_gpu_state = GPU_STATE_IDLE;
l_start_next_state = TRUE;
}
@@ -1544,7 +2381,7 @@ void task_gpu_sm(struct task *i_self)
case GPU_STATE_SET_PWR_LIMIT:
// Set power limit on current GPU
- if(1) // TODO
+ if(gpu_set_pwr_limit_sm())
{
// Set power limit complete for this GPU, go to IDLE state
// to let IDLE SM decide what to do next
@@ -1596,6 +2433,7 @@ void task_gpu_sm(struct task *i_self)
// new state to read core temp reset temperature reading timer
L_numCallsForTempRead = 0;
L_read_temp_start_needed = FALSE; // start no longer needed
+ L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
}
else if(G_gpu_state == GPU_STATE_READ_MEMORY_TEMP)
{
@@ -1606,6 +2444,4 @@ void task_gpu_sm(struct task *i_self)
}
}while((l_start_next_state) && (!l_next_state));
} // GPU monitoring enabled
-
-
} // end task_gpu_sm()
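The reordering in gpu_sm_handle_idle_state() is easier to see in isolation. The sketch below condenses the priority order after this patch; pick_next_state() and its parameters are hypothetical, standing in for get_first_gpu(), gpu_id_need_driver_check() and the memory-temp checks, and the power-limit branch remains compiled out by the patch.

/* Hypothetical condensation of the idle-state priority order after this patch.
 * 0xFF means "no GPU needs this work". */
#include <stdbool.h>
#include <stdint.h>

typedef enum { IDLE, READ_TEMP, CHECK_DRIVER, READ_MEM_TEMP } next_work_t;

next_work_t pick_next_state(bool read_temp_start_needed, bool mem_temp_needed,
                            uint8_t first_gpu, uint8_t driver_check_gpu,
                            uint8_t mem_temp_gpu)
{
    if (read_temp_start_needed && (first_gpu != 0xFF))
        return READ_TEMP;        /* 3. core temperature reads come first           */
    if (driver_check_gpu != 0xFF)
        return CHECK_DRIVER;     /* 4. then check whether a GPU's driver is loaded */
    /* 5. read/set power limit is still under "#if 0" in this patch                */
    if (mem_temp_needed && (mem_temp_gpu != 0xFF))
        return READ_MEM_TEMP;    /* 6. finally memory temps / capability checks    */
    return IDLE;                 /* nothing to do: stay idle                       */
}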
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index c8f13ff..a26c9c0 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -74,9 +74,10 @@ typedef enum
{
GPU_STATE_READ_MEM_TEMP_NEW = 0x21,
GPU_STATE_READ_MEM_TEMP_START = 0x22,
- GPU_STATE_READ_MEM_TEMP_STOP = 0x23,
- GPU_STATE_READ_MEM_TEMP_READ = 0x24,
- GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x25,
+ GPU_STATE_READ_MEM_TEMP_2 = 0x23,
+ GPU_STATE_READ_MEM_TEMP_3 = 0x24,
+ GPU_STATE_READ_MEM_TEMP_READ = 0x25,
+ GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x26,
} gpuReadMemTempState_e;
// States for checking GPU memory temperature capability (gpu_read_mem_temp_capability_sm)
@@ -90,6 +91,39 @@ typedef enum
GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE = 0x36,
} gpuReadMemTempCapableState_e;
+// States for checking if GPU driver is loaded (gpu_check_driver_loaded_sm)
+typedef enum
+{
+ GPU_STATE_CHECK_DRIVER_LOADED_NEW = 0x41,
+ GPU_STATE_CHECK_DRIVER_LOADED_START = 0x42,
+ GPU_STATE_CHECK_DRIVER_LOADED_2 = 0x43,
+ GPU_STATE_CHECK_DRIVER_LOADED_3 = 0x44,
+ GPU_STATE_CHECK_DRIVER_LOADED_READ = 0x45,
+ GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE = 0x46,
+} gpuCheckDriverLoadedState_e;
+
+// States for reading GPU power limits (gpu_read_pwr_limit_sm)
+typedef enum
+{
+ GPU_STATE_READ_PWR_LIMIT_NEW = 0x51,
+ GPU_STATE_READ_PWR_LIMIT_START = 0x52,
+ GPU_STATE_READ_PWR_LIMIT_2 = 0x53,
+ GPU_STATE_READ_PWR_LIMIT_3 = 0x54,
+ GPU_STATE_READ_PWR_LIMIT_READ = 0x55,
+ GPU_STATE_READ_PWR_LIMIT_COMPLETE = 0x56,
+} gpuReadPwrLimitState_e;
+
+// States for setting GPU power limit (gpu_set_pwr_limit_sm)
+typedef enum
+{
+ GPU_STATE_SET_PWR_LIMIT_NEW = 0x61,
+ GPU_STATE_SET_PWR_LIMIT_START = 0x62,
+ GPU_STATE_SET_PWR_LIMIT_2 = 0x63,
+ GPU_STATE_SET_PWR_LIMIT_3 = 0x64,
+ GPU_STATE_SET_PWR_LIMIT_READ = 0x65,
+ GPU_STATE_SET_PWR_LIMIT_COMPLETE = 0x66,
+} gpuSetPwrLimitState_e;
+
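Each of these new enums walks the same NEW, START, 2, 3, READ, COMPLETE progression, with every non-terminal state scheduling a matching GPU_REQ_* opcode from common/gpu_structs.h. A hedged sketch of that mapping for the set-power-limit path follows; the helper itself is illustrative only and not part of the patch (assumes stdint.h and gpu_structs.h are included).

/* Illustrative only: how a gpuSetPwrLimitState_e step lines up with the
 * GPU_REQ_SET_PWR_LIMIT_* opcode scheduled by gpu_set_pwr_limit_sm(). */
static inline uint8_t set_pwr_limit_req_for_state(gpuSetPwrLimitState_e i_state)
{
    switch (i_state)
    {
        case GPU_STATE_SET_PWR_LIMIT_START: return GPU_REQ_SET_PWR_LIMIT_START;
        case GPU_STATE_SET_PWR_LIMIT_2:     return GPU_REQ_SET_PWR_LIMIT_2;
        case GPU_STATE_SET_PWR_LIMIT_3:     return GPU_REQ_SET_PWR_LIMIT_3;
        case GPU_STATE_SET_PWR_LIMIT_READ:  return GPU_REQ_SET_PWR_LIMIT_FINISH;
        default:                            return 0xFF; /* NEW/COMPLETE schedule nothing */
    }
}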
// GPU IPC initialization
void gpu_ipc_init();
diff --git a/src/occ_405/gpu/gpu_service_codes.h b/src/occ_405/gpu/gpu_service_codes.h
index 41cb3f9..4ea7c6f 100755
--- a/src/occ_405/gpu/gpu_service_codes.h
+++ b/src/occ_405/gpu/gpu_service_codes.h
@@ -39,6 +39,9 @@ enum gpuModuleId
GPU_MID_GPU_READ_TEMP = GPU_COMP_ID | 0x06,
GPU_MID_GPU_READ_MEM_TEMP = GPU_COMP_ID | 0x07,
GPU_MID_GPU_READ_MEM_TEMP_CAPABLE = GPU_COMP_ID | 0x08,
+ GPU_MID_GPU_CHECK_DRIVER_LOADED = GPU_COMP_ID | 0x09,
+ GPU_MID_GPU_READ_PWR_LIMIT = GPU_COMP_ID | 0x0A,
+ GPU_MID_GPU_SET_PWR_LIMIT = GPU_COMP_ID | 0x0B,
};
#endif /* #ifndef _GPU_SERVICE_CODES_H_ */
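The new module IDs above feed the same error-log sequence repeated throughout gpu.c. A condensed, hypothetical wrapper is sketched below; the call sequence mirrors the createErrl/addCalloutToErrl/commitErrl usage in the patch, but log_gpu_error() itself is not part of it (assumes the error-log and sysConfigData headers are included).

/* Hypothetical wrapper showing the error-log pattern the new GPU module IDs use. */
static void log_gpu_error(uint8_t i_module_id, uint16_t i_ext_rc,
                          uint8_t i_gpu_id, uint32_t i_userdata2)
{
    errlHndl_t l_err = createErrl(i_module_id,           /* e.g. GPU_MID_GPU_SET_PWR_LIMIT   */
                                  GPU_FAILURE,           /* reason code                      */
                                  i_ext_rc,              /* e.g. ERC_GPU_SET_PWR_LIMIT_FAILURE */
                                  ERRL_SEV_PREDICTIVE,
                                  NULL,
                                  DEFAULT_TRACE_SIZE,
                                  i_gpu_id,              /* userdata1: GPU ID                */
                                  i_userdata2);          /* userdata2                        */

    /* Callout the GPU only if a sensor ID is configured for it */
    if (G_sysConfigData.gpu_sensor_ids[i_gpu_id])
    {
        addCalloutToErrl(l_err,
                         ERRL_CALLOUT_TYPE_GPU_ID,
                         G_sysConfigData.gpu_sensor_ids[i_gpu_id],
                         ERRL_CALLOUT_PRIORITY_MED);
    }

    commitErrl(&l_err);
}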
diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h
index a379f87..c740c05 100644
--- a/src/occ_405/occ_service_codes.h
+++ b/src/occ_405/occ_service_codes.h
@@ -87,12 +87,14 @@ enum occReasonCode
/// Firmware Failure: equivalent to assertion failures
INTERNAL_FW_FAILURE = 0xA0,
+ /// Build problem, gpe1 image doesn't support GPU interface
+ GPU_NO_GPE_SUPPORT = 0xA1,
/// Error with GPU tasks
- GPU_FAILURE = 0xA1,
+ GPU_FAILURE = 0xA2,
/// GPU core reached error threshold
- GPU_ERROR_TEMP = 0xA2,
+ GPU_ERROR_TEMP = 0xA3,
/// GPU memory reached error threshold
- GPU_MEMORY_ERROR_TEMP = 0xA3,
+ GPU_MEMORY_ERROR_TEMP = 0xA4,
/// Failure within the OCC Complex of the processor
INTERNAL_HW_FAILURE = 0xB0,
@@ -289,7 +291,11 @@ enum occExtReasonCode
ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE = 0x00F6,
ERC_GPU_INVALID_GPU_OPERATION = 0x00F7,
ERC_GPU_N_MODE_PCAP_CALC_FAILURE = 0x00F8,
+ ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE = 0x00F9,
ERC_GPU_NO_GPE_SUPPORT = 0x00FF,
+ ERC_GPU_CHECK_DRIVER_LOADED_FAILURE = 0x0100,
+ ERC_GPU_READ_PWR_LIMIT_FAILURE = 0x0101,
+ ERC_GPU_SET_PWR_LIMIT_FAILURE = 0x0102,
ERC_STATE_FROM_ALL_TO_STB_FAILURE = 0x0123,
ERC_STATE_FROM_ACT_TO_CHR_FAILURE = 0x0124,