author     William Bryan <wilbryan@us.ibm.com>       2017-09-28 13:32:29 -0500
committer  William A. Bryan <wilbryan@us.ibm.com>    2017-10-03 16:03:05 -0400
commit     74f721c90235a18821b97782d98349cf51e0f12d (patch)
tree       1f2fd59b41db514c0273632dd2dd7926e25a2030
parent     76b91d0038d59b30de14108e908bc78c6d988796 (diff)
GPU 405 Enable Memory Temperatures
Change-Id: Id50d12a50a05b8b3a6a6f1ce3ce4512d3299caa7
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/46882
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r--   src/common/gpe_err.h                               6
-rw-r--r--   src/common/gpu_structs.h                          31
-rw-r--r--   src/include/registers/ocb_firmware_registers.h    20
-rwxr-xr-x   src/occ_405/amec/amec_data.c                       6
-rwxr-xr-x   src/occ_405/amec/amec_pcap.c                      81
-rwxr-xr-x   src/occ_405/amec/amec_sys.h                       38
-rwxr-xr-x   src/occ_405/cmdh/cmdh_fsp_cmds.c                  46
-rwxr-xr-x   src/occ_405/gpu/gpu.c                           1332
-rw-r--r--   src/occ_405/gpu/gpu.h                             40
-rwxr-xr-x   src/occ_405/gpu/gpu_service_codes.h                3
-rw-r--r--   src/occ_405/occ_service_codes.h                   12
11 files changed, 1303 insertions, 312 deletions
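
The headline logic change sits in amec_pcap.c: amec_gpu_pcap() now keeps two precomputed total GPU power-cap budgets (N mode for oversubscription, N+1 mode otherwise), selects one based on the current oversubscription state, clips it to the active pcap/PSR budget, and divides the result across the GPUs. The following is a minimal standalone C sketch of that selection only, with simplified names and types that are not the literal OCC source; see amec_gpu_pcap() in the patch below for the real code.

    // Hedged sketch of the new total-GPU-pcap selection (simplified, hypothetical
    // helper; the real code lives in amec_gpu_pcap() in src/occ_405/amec/amec_pcap.c)
    #include <stdbool.h>
    #include <stdint.h>

    uint16_t select_per_gpu_pcap(bool oversub,
                                 uint16_t n_mode_total,        // budget when a power supply is lost (N)
                                 uint16_t n_plus_1_mode_total, // budget with all supplies present (N+1)
                                 uint16_t active_psr_total,    // budget from the active pcap and PSR
                                 uint8_t  num_gpus)
    {
        // pick the system budget for the current oversubscription state
        uint16_t system_total = oversub ? n_mode_total : n_plus_1_mode_total;

        // total GPU pcap is the lower of the system budget and the active limit
        uint16_t total = (system_total < active_psr_total) ? system_total
                                                           : active_psr_total;

        // divide the total equally across all GPUs (guard against divide by zero)
        return num_gpus ? (uint16_t)(total / num_gpus) : 0;
    }
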
diff --git a/src/common/gpe_err.h b/src/common/gpe_err.h index 8580012..3bb1fa0 100644 --- a/src/common/gpe_err.h +++ b/src/common/gpe_err.h @@ -51,6 +51,8 @@ #define GPE_RC_GET_NEST_DTS_FAILED 0x61 // Failed to collect nest DTS temperatures // GPU Errors -#define GPE_RC_NO_GPU_SUPPORT 0x8F // GPE1 image doesn't support GPUs - +#define GPE_RC_NO_GPU_SUPPORT 0x80 // GPE1 image doesn't support GPUs +#define GPE_RC_GPU_DRIVER_CHANGE 0x81 // GPU in transition or just completed phase change +#define GPE_RC_GPU_CMD_NOT_SUPPORTED 0x82 // GPU rejected command with no support +#define GPE_RC_GPU_CMD_FAILED 0x83 // An error occurred in the last GPU operation #endif //_GPE_ERR_H diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h index 7933adb..ba522e5 100644 --- a/src/common/gpu_structs.h +++ b/src/common/gpu_structs.h @@ -41,6 +41,12 @@ typedef enum { + GPU_CAP_MEM = 0x00, + GPU_CAP_CORE = 0x01 +} GPU_CAPABILITIES; + +typedef enum +{ ID_GPU0 = 0x00, ID_GPU1 = 0x01, ID_GPU2 = 0x02, @@ -64,12 +70,21 @@ typedef enum GPU_REQ_READ_MEM_TEMP_3 = 0x06, // mem temp step 3 GPU_REQ_READ_MEM_TEMP_FINISH = 0x07, // Get memory temp reading GPU_REQ_READ_CAPS_START = 0x08, // Start reading capabilities - GPU_REQ_READ_CAPS_2 = 0x09, // Start reading capabilities - GPU_REQ_READ_CAPS_3 = 0x0A, // Start reading capabilities - GPU_REQ_READ_CAPS_FINISH = 0x0B, - GPU_REQ_READ_PWR_LIMIT_START = 0x0C, // Start reading GPU information - GPU_REQ_READ_PWR_LIMIT_STOP = 0x0D, // Read GPU temp register - GPU_REQ_READ_PWR_LIMIT = 0x0E, // Start reading pwr limit + GPU_REQ_READ_CAPS_2 = 0x09, // Capabilities read step 2 + GPU_REQ_READ_CAPS_3 = 0x0A, // Capabilities read step 3 + GPU_REQ_READ_CAPS_FINISH = 0x0B, // get capabilities + GPU_REQ_READ_PWR_LIMIT_START = 0x10, // Start reading GPU power limit + GPU_REQ_READ_PWR_LIMIT_2 = 0x11, + GPU_REQ_READ_PWR_LIMIT_3 = 0x12, + GPU_REQ_READ_PWR_LIMIT_FINISH = 0x13, + GPU_REQ_SET_PWR_LIMIT_START = 0x20, // Start setting GPU power limit + GPU_REQ_SET_PWR_LIMIT_2 = 0x21, + GPU_REQ_SET_PWR_LIMIT_3 = 0x22, + GPU_REQ_SET_PWR_LIMIT_FINISH = 0x23, + GPU_REQ_CHECK_DRIVER_START = 0x31, // Start check driver loaded + GPU_REQ_CHECK_DRIVER_2 = 0x32, + GPU_REQ_CHECK_DRIVER_3 = 0x33, + GPU_REQ_CHECK_DRIVER_FINISH = 0x34, GPU_REQ_RESET = 0x60, // Reset } gpu_op_req_e; @@ -78,10 +93,10 @@ typedef struct { GpeErrorStruct error; uint8_t gpu_id; + uint8_t gpu_rc; uint8_t operation; - uint64_t data; + uint64_t data[3]; } gpu_sm_args_t; - #endif // _GPU_STRUCTS_H diff --git a/src/include/registers/ocb_firmware_registers.h b/src/include/registers/ocb_firmware_registers.h index 5b6705a..010ad02 100644 --- a/src/include/registers/ocb_firmware_registers.h +++ b/src/include/registers/ocb_firmware_registers.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2015,2016 */ +/* Contributors Listed Below - COPYRIGHT 2015,2017 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -1411,9 +1411,21 @@ typedef union ocb_occflg { uint32_t i2c_engine2_lock_occ : 1; uint32_t i2c_engine3_lock_host : 1; uint32_t i2c_engine3_lock_occ : 1; - uint32_t reserved_occ : 10; -#else - uint32_t reserved_occ : 10; + uint32_t gpu0_reset_status : 1; + uint32_t gpu1_reset_status : 1; + uint32_t gpu2_reset_status : 1; + uint32_t reserved_occ : 3; + uint32_t wof_hcode_mode : 2; + uint32_t active_quad_update : 1; + uint32_t request_occ_safe : 1; +#else + uint32_t request_occ_safe : 1; + uint32_t active_quad_update : 1; + uint32_t wof_hcode_mode : 2; + uint32_t reserved_occ : 3; + uint32_t gpu2_reset_status : 1; + uint32_t gpu1_reset_status : 1; + uint32_t gpu0_reset_status : 1; uint32_t i2c_engine3_lock_occ : 1; uint32_t i2c_engine3_lock_host : 1; uint32_t i2c_engine2_lock_occ : 1; diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c index 3a82bb3..4c553d6 100755 --- a/src/occ_405/amec/amec_data.c +++ b/src/occ_405/amec/amec_data.c @@ -458,12 +458,6 @@ void amec_data_write_pcap(void) g_amec->pcap.ovs_node_pcap = G_sysConfigData.pcap.hard_min_pcap; } - //Oversubscription pcap can NOT be higher than a customer set pcap. - if(g_amec->pcap.ovs_node_pcap > l_customer) - { - g_amec->pcap.ovs_node_pcap = l_customer; - } - //for all new pcap data setting: If KVM, update the OPAL dynamic data if(G_sysConfigData.system_type.kvm) { diff --git a/src/occ_405/amec/amec_pcap.c b/src/occ_405/amec/amec_pcap.c index 7584ddf..995324d 100755 --- a/src/occ_405/amec/amec_pcap.c +++ b/src/occ_405/amec/amec_pcap.c @@ -95,14 +95,16 @@ extern uint32_t G_first_num_gpus_sys; // Thread: Real Time Loop // // End Function Specification -void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) +void amec_gpu_pcap(bool i_oversubscription, bool i_active_pcap_changed, int32_t i_avail_power) { /*------------------------------------------------------------------------*/ /* Local Variables */ /*------------------------------------------------------------------------*/ uint8_t i = 0; uint32_t l_gpu_cap_mw = 0; + uint16_t l_system_gpu_total_pcap = 0; // total GPU pcap required by system based on if currently in oversub or not static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect + static uint16_t L_n_plus_1_mode_gpu_total_pcap = 0; // Total GPU pcap required for N+1 (not in oversubscription) static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU @@ -112,10 +114,12 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) /*------------------------------------------------------------------------*/ /* Code */ /*------------------------------------------------------------------------*/ - // If this is the first time running calculate the total GPU power cap for oversubscription + // If this is the first time running calculate the total GPU power cap for system power caps (N and N+1) if(L_first_run) { + // calculate total GPU power cap for oversubscription if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) + { // Take all non-GPU power away from the oversubscription power cap L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; @@ -157,6 +161,50 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) ERRL_CALLOUT_PRIORITY_HIGH); 
commitErrl(&l_err); } + + // calculate total GPU power cap for N+1 (not in oversubscription) + if(G_sysConfigData.pcap.system_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) + { + // Take all non-GPU power away from the N+1 power cap + L_n_plus_1_mode_gpu_total_pcap = G_sysConfigData.pcap.system_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; + // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs + L_n_plus_1_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; + } + else + { + // This should not happen, the total non GPU power should never be higher than the N+1 mode cap + // Log error and set GPUs to minimum power cap + L_n_plus_1_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap + + TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N+1 mode pwr limit %dW", + G_sysConfigData.total_non_gpu_max_pwr_watts, G_sysConfigData.pcap.system_pcap); + + /* @ + * @errortype + * @moduleid AMEC_GPU_PCAP_MID + * @reasoncode GPU_FAILURE + * @userdata1 N+1 mode Power Cap watts + * @userdata2 Total non-GPU power watts + * @userdata4 ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE + * @devdesc Total non-GPU power more than N+1 mode power cap + * + */ + errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, + GPU_FAILURE, + ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_sysConfigData.pcap.system_pcap, + G_sysConfigData.total_non_gpu_max_pwr_watts); + + //Callout firmware + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + commitErrl(&l_err); + } } // if first run // Calculate the total GPU power cap for the current active limit and PSR @@ -180,12 +228,23 @@ void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap); } - // Total GPU power cap is the lower of oversubscription and active power limit - // must always account for oversubscription to ensure when a power supply is lost the OCC - // can react fast enough, GPU power capping is too slow and must have GPU power cap already - // set to account for oversubscription case - L_total_gpu_pcap = (L_n_mode_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? - L_n_mode_gpu_total_pcap : L_active_psr_gpu_total_pcap; + // Total GPU power cap is the lower of system (N+1 or oversubscription depending on if in oversub) + // and the active power limit. We do not need to always account for oversubscription since + // the automatic hw power brake will assert to the GPUs if there is a problem when oversub is + // entered from the time OCC can set and GPUs react to a new power limit + if(i_oversubscription) + { + // system in oversubscription use N mode cap + l_system_gpu_total_pcap = L_n_mode_gpu_total_pcap; + } + else + { + // system is not in oversubscription use N+1 mode cap + l_system_gpu_total_pcap = L_n_plus_1_mode_gpu_total_pcap; + } + + L_total_gpu_pcap = (l_system_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? + l_system_gpu_total_pcap : L_active_psr_gpu_total_pcap; // Divide the total equally across all GPUs in the system if(G_first_num_gpus_sys) @@ -282,8 +341,8 @@ void amec_pcap_calc(void) l_oversub_state = AMEC_INTF_GET_OVERSUBSCRIPTION(); // Determine the active power cap. 
norm_node_pcap is set as lowest - // between sys and user in amec_data_write_pcap() - // when in oversub only use oversub pcap if lower than norm_node_pcap + // between sys (N+1 mode) and user in amec_data_write_pcap() + // when in oversub (N mode) only use oversub pcap if lower than norm_node_pcap // to handle user set power cap lower than the oversub power cap if( (TRUE == l_oversub_state) && (g_amec->pcap.ovs_node_pcap < g_amec->pcap.norm_node_pcap) ) @@ -312,7 +371,7 @@ void amec_pcap_calc(void) // Determine GPU power cap if there are GPUs present if(G_first_proc_gpu_config) { - amec_gpu_pcap(l_active_pcap_changed, l_avail_power); + amec_gpu_pcap(l_oversub_state, l_active_pcap_changed, l_avail_power); } if(l_node_pwr != 0) diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index 803ca28..3f1d333 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -436,27 +436,35 @@ typedef struct //------------------------------------------------------------- typedef struct { - bool disabled; // GPU has been marked failed and no longer monitored - bool readOnce; // Comm has been established with GPU - bool overtempError; // Core OT error has been logged against GPU - bool memOvertempError; // Memory OT error has been logged against GPU + bool disabled; // GPU has been marked failed and no longer monitored + bool readOnce; // Comm has been established with GPU + bool commErrorLogged; // GPU has been called out due to comm error + bool overtempError; // Core OT error has been logged against GPU + bool memOvertempError; // Memory OT error has been logged against GPU bool checkDriverLoaded; // Indicates if need to check if driver is loaded - bool driverLoaded; // Indicates if GPU driver is loaded + bool driverLoaded; // Indicates if GPU driver is loaded bool checkMemTempSupport; // Indicates if need to check if mem monitoring is supported - bool memTempSupported; // Indicates if memory temperature monitoring is supported - uint8_t memErrorCount; // count of consecutive GPU mem temp read failures - uint8_t errorCount; // count of consecutive GPU core temp read failures + bool memTempSupported; // Indicates if memory temperature monitoring is supported + bool notReset; // '1' = GPU NOT in reset. Read from OCC FLAGS register + bool coreTempNotAvailable; // for fan control: '1' = core temp not available. (send 0 for fan control) + bool memTempNotAvailable; // for fan control: '1' = Mem temp not available. (send 0 for fan control) + bool coreTempFailure; // for fan control: '1' = timeout failure reading core temp (send 0xFF for fan control) + bool memTempFailure; // for fan control: '1' = timeout failure reading Mem temp (send 0xFF for fan control) + uint8_t memErrorCount; // count of consecutive GPU mem temp read failures when GPU not in reset + uint8_t errorCount; // count of consecutive GPU core temp read failures when GPU not in reset + uint8_t retryCount; // count of consecutive GPU core temp read failures before I2C reset } gpuStatus_t; typedef struct { - bool check_pwr_limit; // Indicates if need to read power limits from GPU - bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max - bool gpu_min_cap_required; // Indicates if power limits were read i.e. 
have min/max - uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU - uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU - uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set + bool check_pwr_limit; // Indicates if need to read power limits from GPU + bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max + bool set_failed; // Indicates if failed to set power limit + bool gpu_min_cap_required; // Indicates if GPU requires min cap + uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU + uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU + uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set uint32_t gpu_requested_pcap_mw; // Requested power cap in mW sent to GPU - uint32_t gpu_actual_pcap_mw; // Actual power cap in mW read back from the GPU + uint32_t gpu_default_pcap_mw; // Default power cap in mW read from the GPU } gpuPcap_t; diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 4606f6d..422dc38 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -348,7 +348,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) const sensor_t *vrfan = getSensorByGsid(VRMPROCOT); if (vrfan != NULL) { - l_tempSensorList[l_sensorHeader.count].id = G_sysConfigData.proc_huid; + l_tempSensorList[l_sensorHeader.count].id = 0; l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM; l_tempSensorList[l_sensorHeader.count].value = vrfan->sample & 0xFF; l_sensorHeader.count++; @@ -358,24 +358,46 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) // Add GPU temperatures for (k=0; k<MAX_NUM_GPU_PER_DOMAIN; k++) { - if(GPU_PRESENT(k)) // temp until GPU sensor IDs are sent make sensor ids "GPU"<gpu#> + if(GPU_PRESENT(k)) { // GPU core temperature - if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp - l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid; - else - l_tempSensorList[l_sensorHeader.count].id = 0xC6 + (9 * G_pbax_id.chip_id) + (k*3); // temp + l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid; l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU; - l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF; + if(g_amec->gpu[k].status.coreTempFailure) + { + // failed to read core temperature return 0xFF + l_tempSensorList[l_sensorHeader.count].value = 0xFF; + } + else if(g_amec->gpu[k].status.coreTempNotAvailable) + { + // core temperature not available return 0 + l_tempSensorList[l_sensorHeader.count].value = 0; + } + else + { + // have a good core temperature return the reading + l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF; + } l_sensorHeader.count++; // GPU memory temperature - if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp - l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid; - else - l_tempSensorList[l_sensorHeader.count].id = 0xC7 + (9 * G_pbax_id.chip_id) + (k*3); // temp + l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid; l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU_MEM; - l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF; + if(g_amec->gpu[k].status.memTempFailure) + { + // failed to read memory temperature return 0xFF + 
l_tempSensorList[l_sensorHeader.count].value = 0xFF; + } + else if(g_amec->gpu[k].status.memTempNotAvailable) + { + // memory temperature not available return 0 + l_tempSensorList[l_sensorHeader.count].value = 0; + } + else + { + // have a good memory temperature return the reading + l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF; + } l_sensorHeader.count++; } } diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 8666e12..1a27565 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -51,18 +51,25 @@ #define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) // Number calls with assumption called every other tick -// Time in seconds to ignore errors from the start of GPU SM -// Right now this time must include PRST and GPU init time -// this may be reduced after adding in OS interlock for PRST -#define GPU_COMM_ESTAB_TIMEOUT_SECONDS 600 +// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time +#define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds -#define MAX_CONSECUTIVE_GPU_RESETS 3 +#define MAX_CONSECUTIVE_GPU_RESETS 5 #define MAX_GPU_RESET_STATE_RETRY 3 #define MAX_RESET_STATE_NOT_DONE_COUNT 100 #define MAX_GPU_READ_ATTEMPT 3 +#define GPU_ERRORS_BEFORE_I2C_RESET 5 + +// consecutive error counts for GPU command failures before error is logged if GPU is not in reset +#define GPU_CHECK_DRIVER_ERROR_COUNT 5 +#define GPU_READ_MEM_CAP_ERROR_COUNT 5 +#define GPU_READ_PWR_LIMIT_ERROR_COUNT 5 +#define GPU_SET_PWR_LIMIT_ERROR_COUNT 5 + #define GPU_I2C_ENGINE PIB_I2C_ENGINE_C extern data_cnfg_t * G_data_cnfg; +extern PWR_READING_TYPE G_pwr_reading_type; // this is the global GPU task sm state each task within the GPU SM may have its own "state" // to allow several calls to complete the task @@ -71,7 +78,6 @@ gpuState_e G_gpu_state = GPU_STATE_IDLE; bool G_gpu_monitoring_allowed = FALSE; // Set to true if GPU is present bool G_gpu_i2c_reset_required = FALSE; uint32_t G_gpu_reset_cause = 0; -uint64_t G_gpu_sm_start_time = 0; // GPE Requests GpeRequest G_gpu_op_request; @@ -82,11 +88,6 @@ GPE_BUFFER(gpu_sm_args_t G_gpu_op_req_args); gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}}; uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed -bool G_gpu_read_issued = false; - -// Read OCC_MISC register to see if an I2C interrupt was generated for -// the specified engine. -bool check_for_i2c_interrupt(const uint8_t i_engine); // Find first present non-failed GPU. 
returns 0xFF if no GPUs present/functional uint8_t get_first_gpu(void) @@ -141,14 +142,47 @@ uint8_t gpu_id_need_driver_check(void) uint8_t gpu_id = 0xFF; // default none needs checking uint8_t i = 0; - for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) + // checking for driver loaded can be repeated until driver is loaded which may never happen + // to avoid infinite loop checking the same GPU over and over we will use a static and each call + // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs + static uint8_t L_current_gpu_id = 0; + + if(L_current_gpu_id == 0xFF) { - if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkDriverLoaded)) - { - gpu_id = i; - break; - } + // checked all GPUs once, do not check any this time and start over with GPU 0 on next call + L_current_gpu_id = 0; } + else + { + for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++) + { + // only check for driver after i2c comm (readOnce) has been established + if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) && + (g_amec->gpu[i].status.readOnce) && (g_amec->gpu[i].status.checkDriverLoaded)) + { + gpu_id = i; + break; + } + } + + // setup L_current_gpu_id for next call based on what is happening this time + if(gpu_id == 0xFF) + { + // no GPU needs checking start back at 0 next time + L_current_gpu_id = 0; + } + else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) ) + { + // last GPU is having driver checked do not check any next time + L_current_gpu_id = 0xFF; + } + else + { + // next time look at the next GPU ID first + L_current_gpu_id = gpu_id + 1; + } + } + return gpu_id; } @@ -157,13 +191,45 @@ uint8_t gpu_id_need_memory_temp_capability_check(void) uint8_t gpu_id = 0xFF; // default none needs checking uint8_t i = 0; - for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) + // checking for memory temp capability will be repeated until memory temp is capable which may never happen + // to avoid infinite loop checking the same GPU over and over we will use a static and each call + // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs + static uint8_t L_current_gpu_id = 0; + + if(L_current_gpu_id == 0xFF) { - if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkMemTempSupport)) - { - gpu_id = i; - break; - } + // checked all GPUs once, do not check any this time and start over with GPU 0 on next call + L_current_gpu_id = 0; + } + else + { + for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++) + { + // driver must be loaded for memory temp capability + if( (!g_amec->gpu[i].status.disabled) && (g_amec->gpu[i].status.driverLoaded) && + (g_amec->gpu[i].status.checkMemTempSupport) ) + { + gpu_id = i; + break; + } + } + + // setup L_current_gpu_id for next call based on what is happening this time + if(gpu_id == 0xFF) + { + // no GPU needs checking start back at 0 next time + L_current_gpu_id = 0; + } + else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) ) + { + // last GPU is having memory capability checked do not check any next time + L_current_gpu_id = 0xFF; + } + else + { + // next time look at the next GPU ID first + L_current_gpu_id = gpu_id + 1; + } } return gpu_id; } @@ -178,7 +244,7 @@ uint8_t get_first_mem_temp_capable_gpu(void) for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) { if( (!g_amec->gpu[i].status.disabled) && - (g_amec->gpu[i].status.memTempSupported) ) + (g_amec->gpu[i].status.memTempSupported) ) // memTempSupported implies that driver is loaded { first_gpu = i; break; @@ -202,7 +268,7 @@ uint8_t get_next_mem_temp_capable_gpu(void) 
next_gpu = 0; } if( (!g_amec->gpu[next_gpu].status.disabled) && - (g_amec->gpu[next_gpu].status.memTempSupported) ) + (g_amec->gpu[next_gpu].status.memTempSupported) ) // memTempSupported implies that driver is loaded { break; } @@ -213,6 +279,10 @@ uint8_t get_next_mem_temp_capable_gpu(void) { next_gpu = 0xFF; } + else if( (next_gpu != 0xFF) && (!g_amec->gpu[next_gpu].status.memTempSupported) ) + { + next_gpu = 0xFF; + } return next_gpu; } @@ -231,8 +301,16 @@ uint8_t gpu_id_need_power_limits(void) if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.check_pwr_limit)) { - gpu_id = i; - break; + // If there is no power capping support skip reading power limits + if(G_pwr_reading_type == PWR_READING_TYPE_NONE) + { + g_amec->gpu[i].pcap.check_pwr_limit = false; + } + else + { + gpu_id = i; + break; + } } } return gpu_id; @@ -247,8 +325,9 @@ uint8_t gpu_id_need_set_power_limit(void) for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) { - // to set power limit requires that the driver is loaded - if( (g_amec->gpu[i].status.driverLoaded) && + // to set power limit requires that the driver is loaded and power limits were read + if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) && + (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) ) { gpu_id = i; @@ -258,6 +337,97 @@ uint8_t gpu_id_need_set_power_limit(void) return gpu_id; } +// For the given GPU clear status/data that requires GPU driver to be loaded +void clear_gpu_driver_status(uint8_t i_gpu_num) +{ + g_amec->gpu[i_gpu_num].status.checkDriverLoaded = false; + g_amec->gpu[i_gpu_num].status.driverLoaded = false; + + // Reading memory temperature requires driver to be loaded. + g_amec->gpu[i_gpu_num].status.checkMemTempSupport = false; + g_amec->gpu[i_gpu_num].status.memTempSupported = false; + g_amec->gpu[i_gpu_num].status.memErrorCount = 0; + + // Power capping requires driver to be loaded. Clear GPU power limits + g_amec->gpu[i_gpu_num].pcap.check_pwr_limit = false; + g_amec->gpu[i_gpu_num].pcap.pwr_limits_read = false; + g_amec->gpu[i_gpu_num].pcap.set_failed = false; + g_amec->gpu[i_gpu_num].pcap.gpu_min_pcap_mw = 0; + g_amec->gpu[i_gpu_num].pcap.gpu_max_pcap_mw = 0; + g_amec->gpu[i_gpu_num].pcap.gpu_requested_pcap_mw = 0; + g_amec->gpu[i_gpu_num].pcap.gpu_default_pcap_mw = 0; + //amec will need to recalculate after power limits are read to handle any clipping with new GPU min/max + g_amec->gpu[i_gpu_num].pcap.gpu_desired_pcap_mw = 0; +} + +// Handles GPU not able to process request due to driver load or un-load +void handle_driver_change(void) +{ + // Clear out driver status while driver change completes and is determined if loaded/un-loaded + clear_gpu_driver_status(G_current_gpu_id); + + // memory temp only available when driver is loaded. clear error and set not available + g_amec->gpu[G_current_gpu_id].status.memTempFailure = false; + g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true; + + // when driver change is complete we must re-query to see if driver is loaded or not + g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true; +} + +// For all GPUs read GPU reset status and take action if reset status has changed +void update_gpu_reset_status(void) +{ + uint8_t gpu_num = 0; + + // GPU reset status is in the OCC FLAGS register and is updated by OPAL + // Read the current reset status for all GPUs. 
A reset status of '1' indicates NOT in reset + ocb_occflg_t occ_flags = {0}; + occ_flags.value = in32(OCB_OCCFLG); + bool not_in_reset[3] = {occ_flags.fields.gpu0_reset_status, + occ_flags.fields.gpu1_reset_status, + occ_flags.fields.gpu2_reset_status}; + + // reset status of '0' (IN reset) is the default + // the OCC will still try to read GPU when IN reset but will not log errors + // this is so we still communicate with the GPUs without OPAL support to indicate + // a GPU is not in reset. + // Full OCC support below for when OPAL starts updating the reset status + for (gpu_num=0; gpu_num<MAX_NUM_GPU_PER_DOMAIN; gpu_num++) + { + if(not_in_reset[gpu_num] != g_amec->gpu[gpu_num].status.notReset) + { + INTR_TRAC_IMP("update_gpu_reset_status: GPU%d NOT in reset is now = %d", + gpu_num, + not_in_reset[gpu_num]); + + // There has been a change to the reset status clear everything out except for errors logged so we don't log again + clear_gpu_driver_status(gpu_num); + g_amec->gpu[gpu_num].status.errorCount = 0; + g_amec->gpu[gpu_num].status.retryCount = 0; + + // readOnce of false will force comm to be established and once established then checkDriverLoaded will get set + g_amec->gpu[gpu_num].status.readOnce = false; + + if(not_in_reset[gpu_num]) + { + // GPU was taken out of reset clear disabled to allow communication again + g_amec->gpu[gpu_num].status.disabled = false; + } + else + { + // GPU was put in reset. Clear temperature sensor errors and set to not available + g_amec->gpu[gpu_num].status.coreTempFailure = false; + g_amec->gpu[gpu_num].status.coreTempNotAvailable = true; + g_amec->gpu[gpu_num].status.memTempFailure = false; + g_amec->gpu[gpu_num].status.memTempNotAvailable = true; + } + + g_amec->gpu[gpu_num].status.notReset = not_in_reset[gpu_num]; + + } // if GPU reset status changed + } // for each GPU +} // end update_gpu_reset_status() + // Disable GPU monitoring for all GPUs void disable_all_gpus(void) { @@ -327,95 +497,103 @@ void gpu_ipc_init() } } -// Called after a failure for a specified GPU. The error will +// Called after a failure reading core temp for a specified GPU. 
The error will // be counted and if threshold is reached, an error will be created with -// the GPU as a callout and then set flag to force reset +// the GPU as a callout if the GPU is not in reset void mark_gpu_failed(const gpu_sm_args_t *i_arg) { uint32_t gpu_id = i_arg->gpu_id; do { - // ignore all errors if haven't reached timeout for comm established - if( (false == g_amec->gpu[gpu_id].status.readOnce) && - (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) ) - { - // do nothing but reset at this time - break; - } if((false == g_amec->gpu[gpu_id].status.disabled) && (true == g_amec->gpu[gpu_id].status.readOnce)) { - INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X " - "(ffdc 0x%08X%08X)", - gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount, - WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc)); + GPU_DBG("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X " + "(ffdc 0x%08X%08X)", + gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount, + WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc)); } - if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) && - (false == g_amec->gpu[gpu_id].status.disabled) && - (true == g_amec->gpu[gpu_id].status.readOnce)) + // Always inc retry count for I2C reset regardless of if GPU is in reset or not + g_amec->gpu[gpu_id].status.retryCount++; + + // Only inc error count if it is known that GPU is NOT in reset + // NOTE: Default is IN reset so this will only be true when OPAL/OS supports telling the OCC reset status + // if OS never tells the OCC reset status the OCC will never disable or log a comm error + if(g_amec->gpu[gpu_id].status.notReset) { - G_gpu_state = GPU_STATE_IDLE; - - // Something has gone wrong and it may be that OPAL has put - // the GPU into reset. For now, if this happens we will just - // continue polling the GPU until it comes back. 
- g_amec->gpu[gpu_id].status.readOnce = false; - g_amec->gpu[gpu_id].status.checkDriverLoaded = true; - g_amec->gpu[gpu_id].status.driverLoaded = false; - g_amec->gpu[gpu_id].status.checkMemTempSupport = true; - g_amec->gpu[gpu_id].status.memTempSupported = false; - g_amec->gpu[gpu_id].status.memErrorCount = 0; - g_amec->gpu[gpu_id].status.errorCount = 0; - -// This code can be used if an interlock with OPAL is ever introduced -#if 0 - // Disable this GPU, collect FFDC and log error - g_amec->gpu[gpu_id].status.disabled = true; - - INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)", - gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation); - errlHndl_t l_err = NULL; - /* - * @errortype - * @moduleid GPU_MID_MARK_GPU_FAILED - * @reasoncode GPU_FAILURE - * @userdata1 GPE returned rc code - * @userdata4 ERC_GPU_COMPLETE_FAILURE - * @devdesc GPU failure - */ - l_err = createErrl(GPU_MID_MARK_GPU_FAILED, - GPU_FAILURE, - ERC_GPU_COMPLETE_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - i_arg->error.rc, - 0); - addUsrDtlsToErrl(l_err, - (uint8_t*)&i_arg->error.ffdc, - sizeof(i_arg->error.ffdc), - ERRL_STRUCT_VERSION_1, - ERRL_USR_DTL_BINARY_DATA); - - // Callout the GPU if have sensor ID for it - if(G_sysConfigData.gpu_sensor_ids[gpu_id]) + // INC count and check if reached error threshold + if( ++g_amec->gpu[gpu_id].status.errorCount > GPU_INIT_ERROR_COUNT) { - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_GPU_ID, - G_sysConfigData.gpu_sensor_ids[gpu_id], - ERRL_CALLOUT_PRIORITY_MED); - } + // set that GPU temperature readings failed + g_amec->gpu[gpu_id].status.memTempFailure = true; + g_amec->gpu[gpu_id].status.memTempNotAvailable = true; + g_amec->gpu[gpu_id].status.coreTempFailure = true; + g_amec->gpu[gpu_id].status.coreTempNotAvailable = true; + + // Disable this GPU. 
GPU will get re-enabled if detected that GPU is put in reset and then taken out + g_amec->gpu[gpu_id].status.disabled = true; + + INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)", + gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation); + + if(g_amec->gpu[gpu_id].status.commErrorLogged == false) + { + + errlHndl_t l_err = NULL; + /* + * @errortype + * @moduleid GPU_MID_MARK_GPU_FAILED + * @reasoncode GPU_FAILURE + * @userdata1 GPE returned rc code + * @userdata4 ERC_GPU_COMPLETE_FAILURE + * @devdesc GPU failure + */ + l_err = createErrl(GPU_MID_MARK_GPU_FAILED, + GPU_FAILURE, + ERC_GPU_COMPLETE_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + i_arg->error.rc, + 0); + addUsrDtlsToErrl(l_err, + (uint8_t*)&i_arg->error.ffdc, + sizeof(i_arg->error.ffdc), + ERRL_STRUCT_VERSION_1, + ERRL_USR_DTL_BINARY_DATA); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + commitErrl(&l_err); + g_amec->gpu[gpu_id].status.commErrorLogged = true; + + } // if !commErrorLogged + + } // if errorCount > threshold + + } // if notReset - commitErrl(&l_err); -#endif - } } while(0); - // Reset GPU - G_gpu_i2c_reset_required = true; - G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF); + // Do an I2C reset if reached retry count + // don't want to do I2C reset every time since could be that this GPU really is in reset and + // while resetting I2C we are unable to read other GPUs that may not be in reset + if( g_amec->gpu[gpu_id].status.retryCount > GPU_ERRORS_BEFORE_I2C_RESET) + { + g_amec->gpu[gpu_id].status.retryCount = 0; + G_gpu_i2c_reset_required = true; + G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF); + } + } // end mark_gpu_failed() // Schedule a GPE request for GPU operation @@ -461,6 +639,27 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args) case GPU_REQ_READ_MEM_TEMP_FINISH: break; + // Check if driver is loaded + case GPU_REQ_CHECK_DRIVER_START: + case GPU_REQ_CHECK_DRIVER_2: + case GPU_REQ_CHECK_DRIVER_3: + case GPU_REQ_CHECK_DRIVER_FINISH: + break; + + // Read GPU Power Limit + case GPU_REQ_READ_PWR_LIMIT_START: + case GPU_REQ_READ_PWR_LIMIT_2: + case GPU_REQ_READ_PWR_LIMIT_3: + case GPU_REQ_READ_PWR_LIMIT_FINISH: + break; + + // Set GPU Power Limit + case GPU_REQ_SET_PWR_LIMIT_START: + case GPU_REQ_SET_PWR_LIMIT_2: + case GPU_REQ_SET_PWR_LIMIT_3: + case GPU_REQ_SET_PWR_LIMIT_FINISH: + break; + // I2C reset case GPU_REQ_RESET: break; @@ -498,6 +697,7 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args) { // Clear errors and init common arguments for GPE G_gpu_op_req_args.error.error = 0; + G_gpu_op_req_args.gpu_rc = 0; G_gpu_op_req_args.operation = i_operation; G_gpu_op_req_args.gpu_id = G_current_gpu_id; @@ -574,14 +774,14 @@ bool gpu_reset_sm() /* * @errortype * @moduleid GPU_MID_GPU_RESET_SM - * @reasoncode GPU_FAILURE + * @reasoncode GPU_NO_GPE_SUPPORT * @userdata1 0 * @userdata2 0 * @userdata4 ERC_GPU_NO_GPE_SUPPORT * @devdesc GPE1 image doesn't support GPU communication */ errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM, - GPU_FAILURE, + GPU_NO_GPE_SUPPORT, ERC_GPU_NO_GPE_SUPPORT, ERRL_SEV_UNRECOVERABLE, NULL, @@ -606,9 +806,7 @@ bool gpu_reset_sm() else // this reset attempt failed { // Stop trying if reached max resets - if( (L_consec_reset_failure_count > 
MAX_CONSECUTIVE_GPU_RESETS) && - (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) >= - GPU_COMM_ESTAB_TIMEOUT_SECONDS)) + if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS) { INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X", L_reset_state); @@ -662,12 +860,12 @@ bool gpu_reset_sm() break; case GPU_RESET_STATE_RESET_MASTER: - G_new_gpu_req_args.data = GPU_RESET_REQ_MASTER; + G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER; L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args); break; case GPU_RESET_STATE_RESET_SLAVE: - G_new_gpu_req_args.data = GPU_RESET_REQ_SLV; + G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV; L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args); break; @@ -677,7 +875,7 @@ bool gpu_reset_sm() break; case GPU_RESET_STATE_RESET_SLAVE_COMPLETE: - G_new_gpu_req_args.data = GPU_RESET_REQ_SLV_COMPLETE; + G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE; L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args); break; @@ -718,6 +916,555 @@ bool gpu_reset_sm() // Function Specification // +// Name: gpu_check_driver_loaded_sm +// +// Description: Called from gpu_task_sm to check if driver is loaded for G_current_gpu_id +// This function should only return that complete is TRUE when the check +// is complete (or determined failed) and ready for a different GPU +// +// Pre-Req: Caller must have G_current_gpu_id set for GPU to check +// +// End Function Specification +bool gpu_check_driver_loaded_sm() +{ + bool l_complete = FALSE; // only return TRUE when the read is complete or failed + bool l_new_driver_loaded = FALSE; + static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled + static uint8_t L_check_driver_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint8_t L_state_failure_count = 0; + static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + + if (async_request_is_idle(&G_gpu_op_request.request)) + { + // If not starting a new read then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_check_driver_state != GPU_STATE_CHECK_DRIVER_LOADED_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Check if failure was due to driver change + if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_check_driver_failure_count[G_current_gpu_id] = 0; // clear driver failure count since there's a driver change + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + return TRUE; // Done with this GPU, let GPU SM move to next + } + + // If reached state retry count give up on this read + else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) + { + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + if(++L_check_driver_failure_count[G_current_gpu_id] > GPU_CHECK_DRIVER_ERROR_COUNT) + { + INTR_TRAC_ERR("gpu_check_driver_loaded: Failed to check driver loaded for GPU%d RC: 0x%02X", + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up checking driver loaded for this GPU + // It will be retried if detected that GPU is put in reset and then taken out + 
g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false; + L_check_driver_failure_count[G_current_gpu_id] = 0; + + // without driver loaded cannot read memory temp, mark memory temp as failed + g_amec->gpu[G_current_gpu_id].status.memTempFailure = true; + g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true; + + // log one time error that driver loaded couldn't be determined + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_CHECK_DRIVER_LOADED + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 0 + * @userdata4 ERC_GPU_CHECK_DRIVER_LOADED_FAILURE + * @devdesc Failure to check GPU driver loaded + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_CHECK_DRIVER_LOADED, + GPU_FAILURE, + ERC_GPU_CHECK_DRIVER_LOADED_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + 0); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + L_state_failure_count = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached state retry count + else + { + // INC failure count and retry current state + L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_check_driver_state++; + } + + L_scheduled = FALSE; // default nothing scheduled + + switch (L_check_driver_state) + { + case GPU_STATE_CHECK_DRIVER_LOADED_START: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_2: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_2, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_3: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_3, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_READ: + L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE: + // Update driver loaded + l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01; + if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded) + { + // Driver loaded status changed + INTR_TRAC_IMP("gpu_check_driver_loaded: GPU%d driver loaded changed to %d", + G_current_gpu_id, + l_new_driver_loaded); + + if(l_new_driver_loaded) + { + // Driver is now loaded do checking that required driver to be loaded + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = true; + g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true; + // done checking for driver to be loaded + g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false; + } + else + { + // Driver is no longer loaded + clear_gpu_driver_status(G_current_gpu_id); + + // memory temp only available when driver is loaded + // clear error and set not available + g_amec->gpu[G_current_gpu_id].status.memTempFailure = false; + g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true; + + // Need to keep query for driver loaded to detect when driver is loaded + g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true; + } + + 
g_amec->gpu[G_current_gpu_id].status.driverLoaded = l_new_driver_loaded; + } + + // Done with this GPU ready to move to new one + L_check_driver_failure_count[G_current_gpu_id] = 0; + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_check_driver_loaded: INVALID STATE: 0x%02X", L_check_driver_state); + L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW; + l_complete = TRUE; + break; + } // switch L_check_driver_state + + if(L_scheduled) + { + GPU_DBG("gpu_check_driver_loaded: Scheduled check driver loaded state 0x%02X at tick %d", + L_check_driver_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_check_driver_loaded: failed to schedule state 0x%02X", L_check_driver_state); + } + + } // if async_request_is_idle + else + { + INTR_TRAC_ERR("gpu_check_driver_loaded: NOT idle for state 0x%02X", L_check_driver_state); + } + + return l_complete; +} // end gpu_check_driver_loaded_sm() + +// Function Specification +// +// Name: gpu_read_pwr_limit_sm +// +// Description: Called from gpu_task_sm to read GPU power limits for G_current_gpu_id +// This function should only return that complete is TRUE when the read +// is complete (or determined failed) and ready for a different GPU +// +// Pre-Req: Caller must have G_current_gpu_id set for GPU to read +// +// End Function Specification +bool gpu_read_pwr_limit_sm() +{ + bool l_complete = FALSE; // only return TRUE when the read is complete or failed + static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled + static uint8_t L_read_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static uint8_t L_state_failure_count = 0; + static gpuReadPwrLimitState_e L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + + if (async_request_is_idle(&G_gpu_op_request.request)) + { + // If not starting a new read then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Check if failure was due to driver change + if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + return TRUE; // Done with this GPU, let GPU SM move to next + } + + // If reached retry count give up on this read + else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) + { + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT) + { + INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X", + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up trying to read power limits for this GPU + // It will be retried if detected that GPU is put in reset and then taken out + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false; + L_read_pwr_limit_failure_count[G_current_gpu_id] = 
0; + + // log one time error that power limits could not be read + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_READ_PWR_LIMIT + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 0 + * @userdata4 ERC_GPU_READ_PWR_LIMIT_FAILURE + * @devdesc Failure to read GPU power limits + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT, + GPU_FAILURE, + ERC_GPU_READ_PWR_LIMIT_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + 0); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + L_state_failure_count = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached retry count + else + { + // INC failure count and retry current state + L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_read_pwr_limit_state++; + } + + L_scheduled = FALSE; // default nothing scheduled + + switch (L_read_pwr_limit_state) + { + case GPU_STATE_READ_PWR_LIMIT_START: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_START, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_2: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_2, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_3: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_3, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_READ: + L_scheduled = schedule_gpu_req(GPU_REQ_READ_PWR_LIMIT_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_READ_PWR_LIMIT_COMPLETE: + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE; + // Update power limits + g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE; + g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; + g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw = (uint32_t) G_gpu_op_req_args.data[1]; + g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw = (uint32_t) G_gpu_op_req_args.data[2]; + + // Done with this GPU ready to move to new one + L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state); + L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + } // switch L_read_pwr_limit_state + + if(L_scheduled) + { + GPU_DBG("gpu_read_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", + L_read_pwr_limit_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state); + } + + } // if async_request_is_idle + else + { + INTR_TRAC_ERR("gpu_read_pwr_limit: NOT idle for state 0x%02X", L_read_pwr_limit_state); + } + + return l_complete; +} // end gpu_read_pwr_limit_sm() + +// Function Specification +// +// Name: gpu_set_pwr_limit_sm +// +// Description: Called from gpu_task_sm to set GPU power limit for G_current_gpu_id 
+// This function should only return that complete is TRUE when the set +// is complete (or determined failed) and ready for a different GPU +// +// Pre-Req: Caller must have G_current_gpu_id set for GPU to read +// +// End Function Specification +bool gpu_set_pwr_limit_sm() +{ + bool l_complete = FALSE; // only return TRUE when complete or failed + static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled + static uint8_t L_state_failure_count = 0; + static uint8_t L_set_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0}; + static gpuSetPwrLimitState_e L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE}; + + if (async_request_is_idle(&G_gpu_op_request.request)) + { + // If not starting a new set limit then need to check status of current state before moving on + // stay in current state if the schedule failed or the state isn't finished/failed + if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) && + (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) ) + { + // Check if failure was due to driver change + if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE) + { + handle_driver_change(); + // Request can't be processed by GPU at this time so we are done with this GPU + // setup to start new request + L_state_failure_count = 0; + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + return TRUE; // Done with this GPU, let GPU SM move to next + } + + // If reached retry count give up on this read + else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT) + { + // if GPU is not in reset then INC error count and check if reached threshold + if(g_amec->gpu[G_current_gpu_id].status.notReset) + { + if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT) + { + INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X", + G_gpu_op_req_args.data[0], + G_current_gpu_id, + G_gpu_op_req_args.gpu_rc); + + // give up trying to set power limit for this GPU + // It will be retried if detected that GPU is put in reset and then taken out or driver change + g_amec->gpu[G_current_gpu_id].pcap.set_failed = true; + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + + // log error that power limit could not be set + if(!L_error_logged[G_current_gpu_id]) + { + L_error_logged[G_current_gpu_id] = TRUE; + + // Log error + /* @ + * @errortype + * @moduleid GPU_MID_GPU_SET_PWR_LIMIT + * @reasoncode GPU_FAILURE + * @userdata1 GPU ID + * @userdata2 0 + * @userdata4 ERC_GPU_SET_PWR_LIMIT_FAILURE + * @devdesc Failure to set GPU power limit + * + */ + errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT, + GPU_FAILURE, + ERC_GPU_SET_PWR_LIMIT_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + G_current_gpu_id, + 0); + + // Callout the GPU if have sensor ID for it + if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) + { + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_GPU_ID, + G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], + ERRL_CALLOUT_PRIORITY_MED); + } + + // Commit Error + commitErrl(&l_err); + } // if error not logged + } // if reached error count + } // if notReset + + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + L_state_failure_count = 0; + return TRUE; // Done with this GPU, let GPU SM move to next + } // if reached retry count + else + { + // INC failure count and retry current state + 
L_state_failure_count++; + } + } + else // success on last state go to next state and process it + { + L_state_failure_count = 0; + L_set_pwr_limit_state++; + } + + L_scheduled = FALSE; // default nothing scheduled + + switch (L_set_pwr_limit_state) + { + case GPU_STATE_SET_PWR_LIMIT_START: + // send the desired GPU power cap to the GPE to send to GPU + G_new_gpu_req_args.data[0] = g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw; + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_START, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_2: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_3: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_READ: + L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_FINISH, G_new_gpu_req_args); + break; + + case GPU_STATE_SET_PWR_LIMIT_COMPLETE: + // Update the requested power limit since it was successfully sent + // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC + // has caluclated a new desired pcap while this one was already in process of being set + g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0]; + + // Done with this GPU ready to move to new one + L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + + default: + INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state); + L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW; + l_complete = TRUE; + break; + } // switch L_set_pwr_limit_state + + if(L_scheduled) + { + GPU_DBG("gpu_set_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d", + L_set_pwr_limit_state, GPU_TICK); + } + else if(!l_complete) // if not complete there must have been a failure on the schedule + { + INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state); + } + + } // if async_request_is_idle + else + { + INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state); + } + + return l_complete; +} // end gpu_set_pwr_limit_sm() + +// Function Specification +// // Name: gpu_read_temp_sm // // Description: Called from gpu_task_sm to read GPU core temperature of G_current_gpu_id @@ -750,6 +1497,7 @@ bool gpu_read_temp_sm() mark_gpu_failed(&G_gpu_op_req_args); L_read_temp_state = GPU_STATE_READ_TEMP_NEW; + L_read_failure_count = 0; return TRUE; // Done with this GPU, let GPU SM move to next } else @@ -778,7 +1526,7 @@ bool gpu_read_temp_sm() case GPU_STATE_READ_TEMP_COMPLETE: if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) && - (0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp? 
@@ -750,6 +1497,7 @@ bool gpu_read_temp_sm()
             mark_gpu_failed(&G_gpu_op_req_args);
 
             L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+            L_read_failure_count = 0;
             return TRUE; // Done with this GPU, let GPU SM move to next
         }
         else
@@ -778,7 +1526,7 @@ bool gpu_read_temp_sm()
 
         case GPU_STATE_READ_TEMP_COMPLETE:
             if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
-                (0 != G_gpu_op_req_args.data) )   // TODO: check for valid temp?
+                (0 != G_gpu_op_req_args.data[0]) )
             {
                 g_amec->gpu[G_current_gpu_id].status.readOnce = true;
@@ -791,15 +1539,17 @@ bool gpu_read_temp_sm()
                 }
 
                 // comm is now established update for capability checking to take place
-                g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
                 g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
             }
 
             // Update sensor
-            l_temp = G_gpu_op_req_args.data;
+            l_temp = G_gpu_op_req_args.data[0];
             sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
 
             // Clear all past errors
+            g_amec->gpu[G_current_gpu_id].status.coreTempFailure = false;
+            g_amec->gpu[G_current_gpu_id].status.coreTempNotAvailable = false;
             g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
+            g_amec->gpu[G_current_gpu_id].status.retryCount = 0;
 
             // check if there is an overtemp that hasn't been reported
             if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
@@ -897,8 +1647,10 @@ bool gpu_read_mem_temp_capability_sm()
 {
     bool l_complete = FALSE; // only return TRUE when the read is complete or failed
     static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
-    static uint8_t L_read_failure_count = 0;
+    static uint8_t L_read_mem_cap_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
+    static uint8_t L_state_failure_count = 0;
     static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+    static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
 
     if (async_request_is_idle(&G_gpu_op_request.request))
     {
@@ -907,57 +1659,91 @@ bool gpu_read_mem_temp_capability_sm()
         if( (L_read_cap_state != GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW) &&
             (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
         {
-            // If reached retry count give up on this read
-            if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+            // Check if failure was due to driver change
+            if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
             {
-               // log error that memory temp capability couldn't be determined
-               // memory temp support will be left as not supported
-               INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d", G_current_gpu_id);
-
-               // Log error
-               /* @
-                * @errortype
-                * @moduleid    GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
-                * @reasoncode  GPU_FAILURE
-                * @userdata1   GPU ID
-                * @userdata2   0
-                * @userdata4   ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
-                * @devdesc     Failure to read GPU memory temp capability
-                *
-                */
-               errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
-                                             GPU_FAILURE,
-                                             ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
-                                             ERRL_SEV_PREDICTIVE,
-                                             NULL,
-                                             DEFAULT_TRACE_SIZE,
-                                             G_current_gpu_id,
-                                             0);
-
-               // Callout the GPU if have sensor ID for it
-               if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+                handle_driver_change();
+                // Request can't be processed by GPU at this time so we are done with this GPU
+                // setup to start new request
+                L_state_failure_count = 0;
+                L_read_mem_cap_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
+                L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+                return TRUE; // Done with this GPU, let GPU SM move to next
+            }
+            // If reached state retry count give up on this read
+            else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
+            {
+                // if GPU is not in reset then INC error count and check if reached threshold
+                if(g_amec->gpu[G_current_gpu_id].status.notReset)
                 {
-                   addCalloutToErrl(l_err,
-                                    ERRL_CALLOUT_TYPE_GPU_ID,
-                                    G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
-                                    ERRL_CALLOUT_PRIORITY_MED);
-               }
+                    if(++L_read_mem_cap_failure_count[G_current_gpu_id] >
+                                                     GPU_READ_MEM_CAP_ERROR_COUNT)
+                    {
+                        INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d RC: 0x%02X",
+                                      G_current_gpu_id,
+                                      G_gpu_op_req_args.gpu_rc);
 
-               // Commit Error
-               commitErrl(&l_err);
+                        // give up trying to read mem temp capability for this GPU
+                        // It will be retried if detected that GPU driver is re-loaded
+                        g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+                        L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
+
+                        // cannot determine memory temp capability, mark memory temp as failed
+                        g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+                        g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
+
+                        // log one time error that memory temp capability couldn't be determined
+                        if(!L_error_logged[G_current_gpu_id])
+                        {
+                            L_error_logged[G_current_gpu_id] = TRUE;
+
+                            // Log error
+                            /* @
+                             * @errortype
+                             * @moduleid    GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
+                             * @reasoncode  GPU_FAILURE
+                             * @userdata1   GPU ID
+                             * @userdata2   0
+                             * @userdata4   ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
+                             * @devdesc     Failure to read memory temp capability
+                             *
+                             */
+                            errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
+                                                          GPU_FAILURE,
+                                                          ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
+                                                          ERRL_SEV_PREDICTIVE,
+                                                          NULL,
+                                                          DEFAULT_TRACE_SIZE,
+                                                          G_current_gpu_id,
+                                                          0);
+
+                            // Callout the GPU if have sensor ID for it
+                            if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+                            {
+                                addCalloutToErrl(l_err,
+                                                 ERRL_CALLOUT_TYPE_GPU_ID,
+                                                 G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+                                                 ERRL_CALLOUT_PRIORITY_MED);
+                            }
+
+                            // Commit Error
+                            commitErrl(&l_err);
+                        } // if error not logged
+                    } // if reached error count
+                } // if notReset
 
                 L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+                L_state_failure_count = 0;
                 return TRUE; // Done with this GPU, let GPU SM move to next
-            }
+            } // if reached state retry count
             else
             {
                 // INC failure count and retry current state
-                L_read_failure_count++;
+                L_state_failure_count++;
             }
         }
         else // success on last state go to next state and process it
         {
-            L_read_failure_count = 0;
+            L_state_failure_count = 0;
             L_read_cap_state++;
         }
 
@@ -983,8 +1769,21 @@ bool gpu_read_mem_temp_capability_sm()
 
         case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
             // Update capability
-            g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data & 0x01;
+            g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
+
+            if(g_amec->gpu[G_current_gpu_id].status.memTempSupported)
+            {
+                // mem temp is supported no need to re-check capability
+                g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+            }
+            else
+            {
+                // Need to keep querying mem temp capability to detect if it ever changes to capable
+                g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
+            }
+
+            // Done with this GPU, ready to move to a new one
+            L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
             L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
             l_complete = TRUE;
             break;
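// The GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE case above latches bit 0 of
// the returned capability word and then decides whether the query must be
// repeated.  The same decision, isolated into a stand-alone helper sketch
// (gpu_status_t and its field names are illustrative, not the AMEC types):

#include <stdbool.h>
#include <stdint.h>

typedef struct
{
    bool memTempSupported;    // GPU reports a readable memory temperature
    bool checkMemTempSupport; // capability query still needs to run
} gpu_status_t;

static void update_mem_temp_capability(gpu_status_t *status, uint64_t cap_word)
{
    status->memTempSupported = (cap_word & 0x01) ? true : false;

    // A "supported" answer is trusted and never re-asked; an "unsupported"
    // answer keeps the query alive in case the capability appears later
    // (for example after a driver change).
    status->checkMemTempSupport = !status->memTempSupported;
}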
@@ -1041,60 +1840,95 @@ bool gpu_read_memory_temp_sm()
         if( (L_read_temp_state != GPU_STATE_READ_MEM_TEMP_NEW) &&
             (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
         {
-            // If reached retry count give up on this read
-            if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+            // Check if failure was due to driver change
+            if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
             {
-                // INC memory error count and check if reached timeout threshold for new mem temp
-                uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
-                g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
-                if((max_read_timeout) && (max_read_timeout != 0xFF) &&
-                   (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+                handle_driver_change();
+                // Request can't be processed by GPU at this time so we are done with this GPU
+                // setup to start new request
+                L_read_failure_count = 0;
+                g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
+                L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+                return TRUE; // Done with this GPU, let GPU SM move to next
+            }
+
+            // If reached retry count or GPU indicated cmd not supported then give up on this read
+            else if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
+                     (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
+            {
+                // if GPU is not in reset or the GPU responded with command not supported then
+                // INC memory error count and check if reached timeout for new mem temp
+                if( (g_amec->gpu[G_current_gpu_id].status.notReset) ||
+                    (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
                 {
-                    // Disable memory temp reading for this GPU and log error
-                    g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
-                    // so BMC knows there is an error for fan control set sensor to 0xFF
-                    sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), 0xFFFF);
-
-                    INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
-                                  G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
-
-                    // Log error
-                    /* @
-                     * @errortype
-                     * @moduleid    GPU_MID_GPU_READ_MEM_TEMP
-                     * @reasoncode  GPU_FAILURE
-                     * @userdata1   GPU ID
-                     * @userdata2   number consecutive read mem temp failures
-                     * @userdata4   ERC_GPU_READ_MEM_TEMP_TIMEOUT
-                     * @devdesc     Timeout reading new GPU memory temperature
-                     *
-                     */
-                    errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
-                                                  GPU_FAILURE,
-                                                  ERC_GPU_READ_MEM_TEMP_TIMEOUT,
-                                                  ERRL_SEV_PREDICTIVE,
-                                                  NULL,
-                                                  DEFAULT_TRACE_SIZE,
-                                                  G_current_gpu_id,
-                                                  g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+                    g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
 
-                    // Callout the GPU if have sensor ID for it
-                    if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
-                    {
-                        addCalloutToErrl(l_err,
-                                         ERRL_CALLOUT_TYPE_GPU_ID,
-                                         G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
-                                         ERRL_CALLOUT_PRIORITY_MED);
-                    }
+                    uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
+                    if((max_read_timeout) && (max_read_timeout != 0xFF) &&
+                       (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+                    {
+                        // Disable memory temp reading for this GPU and log error
+                        g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
+                        // so BMC knows there is an error for fan control set failure
+                        g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
+                        g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;
 
-                    // Commit Error
-                    commitErrl(&l_err);
+                        INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
-                } // if timeout error
+                                      G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+                        if(g_amec->gpu[G_current_gpu_id].status.commErrorLogged == false)
+                        {
+                            INTR_TRAC_ERR("notReset: %d rc: 0x%0X", g_amec->gpu[G_current_gpu_id].status.notReset,
+                                          G_gpu_op_req_args.error.rc);
+                            // Log error
+                            /* @
+                             * @errortype
+                             * @moduleid    GPU_MID_GPU_READ_MEM_TEMP
+                             * @reasoncode  GPU_FAILURE
+                             * @userdata1   GPU ID
+                             * @userdata2   GPU RC
+                             * @userdata4   ERC_GPU_READ_MEM_TEMP_TIMEOUT
+                             * @devdesc     Timeout reading new GPU memory temperature
+                             *
+                             */
+                            errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
+                                                          GPU_FAILURE,
+                                                          ERC_GPU_READ_MEM_TEMP_TIMEOUT,
+                                                          ERRL_SEV_PREDICTIVE,
+                                                          NULL,
+                                                          DEFAULT_TRACE_SIZE,
+                                                          G_gpu_op_req_args.gpu_rc,
+                                                          g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+                            // Callout the GPU if have sensor ID for it
+                            if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+                            {
+                                addCalloutToErrl(l_err,
+                                                 ERRL_CALLOUT_TYPE_GPU_ID,
+                                                 G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+                                                 ERRL_CALLOUT_PRIORITY_MED);
+                            }
+
+                            // Commit Error
+                            commitErrl(&l_err);
+                            g_amec->gpu[G_current_gpu_id].status.commErrorLogged = true;
+                        } // if !commErrorLogged
+                    } // if timeout error
+                    else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED)
+                    {
+                        // GPU indicated command not supported, re-check mem temp capability
+                        // if we try to read mem temp again that means mem temp was reported capable
+                        // and if this continues to fail eventually an error will be logged above at timeout
+                        g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
+                        g_amec->gpu[G_current_gpu_id].status.memTempSupported = false;
+                    }
+                } // if notReset or command not supported
+
+                // setup to start new request
+                L_read_failure_count = 0;
                 L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
                 return TRUE; // Done with this GPU, let GPU SM move to next
-            }
+            } // else if failure count exceeded or command not supported
             else
             {
                 // INC failure count and retry current state
@@ -1115,20 +1949,26 @@ bool gpu_read_memory_temp_sm()
             L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
             break;
 
-        case GPU_STATE_READ_MEM_TEMP_STOP:
+        case GPU_STATE_READ_MEM_TEMP_2:
             L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_2, G_new_gpu_req_args);
             break;
 
-        case GPU_STATE_READ_MEM_TEMP_READ:
+        case GPU_STATE_READ_MEM_TEMP_3:
             L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_3, G_new_gpu_req_args);
             break;
 
+        case GPU_STATE_READ_MEM_TEMP_READ:
+            L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_FINISH, G_new_gpu_req_args);
+            break;
+
         case GPU_STATE_READ_MEM_TEMP_COMPLETE:
             // Update sensor
-            l_temp = G_gpu_op_req_args.data;
+            l_temp = G_gpu_op_req_args.data[0];
             sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
 
             // Clear past errors
+            g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
+            g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = false;
             g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
 
             // check if there is an overtemp that hasn't been reported
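// The escalation above only fires for a bounded, enabled timeout: the
// memory-temperature error counter may grow without consequence when
// max_read_timeout is 0 or 0xFF.  That guard, isolated as a sketch
// (the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>

static bool mem_temp_timeout_reached(uint8_t error_count, uint8_t max_read_timeout)
{
    // 0 means no timeout is configured and 0xFF is treated as "never";
    // any other value is the number of consecutive failed reads tolerated
    // before memory temperature reporting is disabled for the GPU.
    return (max_read_timeout != 0) &&
           (max_read_timeout != 0xFF) &&
           (error_count >= max_read_timeout);
}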
@@ -1253,9 +2093,30 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
             break;
         }
 
-//TODO: Enable when functional
-#if 0
-        // 3. Need to check if driver is loaded?
+        // 3. Time to start new temperature reads?
+        if(i_read_temp_start_needed)
+        {
+            // Start reading core temp from first present and functional GPU
+            l_gpu_id = get_first_gpu();
+            if(l_gpu_id != 0xFF)
+            {
+                // Read core temp for this GPU
+                G_current_gpu_id = l_gpu_id;
+                G_gpu_state = GPU_STATE_READ_TEMP;
+                l_new_state = TRUE;
+                break;
+            }
+            else // no functional GPUs
+            {
+                // release I2C lock to the host for this engine and stop monitoring
+                occ_i2c_lock_release(GPU_I2C_ENGINE);
+                G_gpu_state = GPU_STATE_NO_LOCK;
+                G_gpu_monitoring_allowed = FALSE;
+                l_new_state = FALSE; // No new state for GPU communication
+                break;
+            }
+        }
+
+        // 4. Need to check if driver is loaded?
         l_gpu_id = gpu_id_need_driver_check();
         if(l_gpu_id != 0xFF)
         {
@@ -1266,7 +2127,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
             break;
         }
 
-        // 4. Need to read power limits?
+//TODO: Enable when functional
+#if 0
+        // 5. Need to read power limits?
         l_gpu_id = gpu_id_need_power_limits();
         if(l_gpu_id != 0xFF)
         {
@@ -1276,8 +2139,9 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
             l_new_state = TRUE;
             break;
         }
+#endif
 
-        // 5. Need to read memory temps?
+        // 6. Need to read memory temps?
         if(i_mem_temp_needed)
         {
             // first check if there is a GPU that needs memory temp capability checked
@@ -1304,31 +2168,6 @@ bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_nee
                 }
             }
         }
-#endif
-
-        // 6. Time to start new temperature reads?
-        if(i_read_temp_start_needed)
-        {
-            // Start reading core temp from first present and functional GPU
-            l_gpu_id = get_first_gpu();
-            if(l_gpu_id != 0xFF)
-            {
-                // Read core temp for this GPU
-                G_current_gpu_id = l_gpu_id;
-                G_gpu_state = GPU_STATE_READ_TEMP;
-                l_new_state = TRUE;
-                break;
-            }
-            else // no functional GPUs
-            {
-                // release I2C lock to the host for this engine and stop monitoring
-                occ_i2c_lock_release(GPU_I2C_ENGINE);
-                G_gpu_state = GPU_STATE_NO_LOCK;
-                G_gpu_monitoring_allowed = FALSE;
-                l_new_state = FALSE; // No new state for GPU communication
-                break;
-            }
-        }
 
         // Else nothing stay idle
     }while(0);
@@ -1361,12 +2200,13 @@ void task_gpu_sm(struct task *i_self)
     // are functional or GPU I2C interface is broken
     if(G_gpu_monitoring_allowed)
     {
+        // Read and update reset status for all GPUs
+        update_gpu_reset_status();
         // Initialize the IPC commands if this is our first run
         if(L_gpu_first_run)
         {
             gpu_ipc_init();
-            G_gpu_sm_start_time = ssx_timebase_get(); // used for timeout establishing comm
 
             L_gpu_first_run = FALSE;
         }
@@ -1379,6 +2219,7 @@ void task_gpu_sm(struct task *i_self)
         if(L_numCallsForTempRead >= GPU_TEMP_READ_1S)
         {
             L_read_temp_start_needed = TRUE;
+            L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
         }
     }
@@ -1430,6 +2271,7 @@ void task_gpu_sm(struct task *i_self)
             // Start first with reading core temp of first functional GPU
             L_numCallsForTempRead = 0; // to track start of next temp reading in 1s
             L_read_temp_start_needed = FALSE; // start is no longer needed
+            L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
             l_gpu_id = get_first_gpu();
             if(l_gpu_id != 0xFF)
             {
@@ -1506,7 +2348,6 @@ void task_gpu_sm(struct task *i_self)
                 {
                     // Capability check complete for this GPU, go to IDLE state
                     // to let IDLE SM decide what to do next
-                    g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
                     G_gpu_state = GPU_STATE_IDLE;
                     l_start_next_state = TRUE;
                 }
@@ -1514,17 +2355,14 @@ void task_gpu_sm(struct task *i_self)
 
             case GPU_STATE_CHECK_DRIVER_LOADED:
                 // Check if driver is loaded for current GPU
-                if(1) // TODO
+                if(gpu_check_driver_loaded_sm())
                 {
-                    // Driver check complete for this GPU, go to IDLE state
-                    // to let IDLE SM decide what to do next
-                    g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE;
-                    g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE;
-                    if(g_amec->gpu[G_current_gpu_id].status.driverLoaded)
-                    {
-                        // Driver is loaded, read the power limits so we can start GPU power capping
-                        g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = TRUE;
-                    }
+                    // Driver check complete for this GPU.
+                    // NOTE: Do not set status.checkDriverLoaded to FALSE here; if the driver is
+                    // not loaded we must keep checking for it, and that decision is made
+                    // inside gpu_check_driver_loaded_sm()
+
+                    // go to IDLE state to let IDLE SM decide what to do next
                    G_gpu_state = GPU_STATE_IDLE;
                    l_start_next_state = TRUE;
                 }
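// In the dispatcher above, the boolean returned by gpu_check_driver_loaded_sm()
// (and by the other gpu_*_sm() helpers) means "done with this GPU for now",
// not "the operation succeeded"; per-GPU flags such as checkDriverLoaded
// record whether the same work must be repeated on a later pass.  A sketch of
// that calling convention (gpu_sm_fn and next_gpu() are hypothetical):

#include <stdbool.h>
#include <stdint.h>

typedef bool (*gpu_sm_fn)(void);          // returns true when done with this GPU

extern uint8_t next_gpu(uint8_t current); // pick the next GPU to service

static uint8_t service_one_gpu(gpu_sm_fn sm, uint8_t current_gpu)
{
    if (sm())
    {
        // The helper finished (or gave up) on this GPU; whether the same
        // operation runs again later is driven by the status flags it left set.
        return next_gpu(current_gpu);
    }
    return current_gpu;                   // still mid-operation, stay on this GPU
}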
@@ -1532,11 +2370,10 @@ void task_gpu_sm(struct task *i_self)
 
             case GPU_STATE_READ_PWR_LIMIT:
                 // Read power limits for current GPU
-                if(1) // TODO read and set min/max GPU limit and set pwr_limits_read to TRUE if capping supported
+                if(gpu_read_pwr_limit_sm())
                 {
                     // Read power limits complete for this GPU, go to IDLE state
                     // to let IDLE SM decide what to do next
-                    g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
                     G_gpu_state = GPU_STATE_IDLE;
                     l_start_next_state = TRUE;
                 }
@@ -1544,7 +2381,7 @@ void task_gpu_sm(struct task *i_self)
 
             case GPU_STATE_SET_PWR_LIMIT:
                 // Set power limit on current GPU
-                if(1) // TODO
+                if(gpu_set_pwr_limit_sm())
                 {
                     // Set power limit complete for this GPU, go to IDLE state
                     // to let IDLE SM decide what to do next
@@ -1596,6 +2433,7 @@ void task_gpu_sm(struct task *i_self)
                 // new state to read core temp reset temperature reading timer
                 L_numCallsForTempRead = 0;
                 L_read_temp_start_needed = FALSE; // start no longer needed
+                L_mem_temp_needed = FALSE; // will get set to TRUE when core temp reads finish
             }
             else if(G_gpu_state == GPU_STATE_READ_MEMORY_TEMP)
             {
@@ -1606,6 +2444,4 @@ void task_gpu_sm(struct task *i_self)
             }
         }while((l_start_next_state) && (!l_next_state));
     } // GPU monitoring enabled
-
-
 } // end task_gpu_sm()
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
index c8f13ff..a26c9c0 100644
--- a/src/occ_405/gpu/gpu.h
+++ b/src/occ_405/gpu/gpu.h
@@ -74,9 +74,10 @@ typedef enum
 {
     GPU_STATE_READ_MEM_TEMP_NEW      = 0x21,
     GPU_STATE_READ_MEM_TEMP_START    = 0x22,
-    GPU_STATE_READ_MEM_TEMP_STOP     = 0x23,
-    GPU_STATE_READ_MEM_TEMP_READ     = 0x24,
-    GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x25,
+    GPU_STATE_READ_MEM_TEMP_2        = 0x23,
+    GPU_STATE_READ_MEM_TEMP_3        = 0x24,
+    GPU_STATE_READ_MEM_TEMP_READ     = 0x25,
+    GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x26,
 } gpuReadMemTempState_e;
 
 // States for checking GPU memory temperature capability (gpu_read_mem_temp_capability_sm)
@@ -90,6 +91,39 @@ typedef enum
     GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE = 0x36,
 } gpuReadMemTempCapableState_e;
 
+// States for checking if GPU driver is loaded (gpu_check_driver_loaded_sm)
+typedef enum
+{
+    GPU_STATE_CHECK_DRIVER_LOADED_NEW      = 0x41,
+    GPU_STATE_CHECK_DRIVER_LOADED_START    = 0x42,
+    GPU_STATE_CHECK_DRIVER_LOADED_2        = 0x43,
+    GPU_STATE_CHECK_DRIVER_LOADED_3        = 0x44,
+    GPU_STATE_CHECK_DRIVER_LOADED_READ     = 0x45,
+    GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE = 0x46,
+} gpuCheckDriverLoadedState_e;
+
+// States for reading GPU power limits (gpu_read_pwr_limit_sm)
+typedef enum
+{
+    GPU_STATE_READ_PWR_LIMIT_NEW      = 0x51,
+    GPU_STATE_READ_PWR_LIMIT_START    = 0x52,
+    GPU_STATE_READ_PWR_LIMIT_2        = 0x53,
+    GPU_STATE_READ_PWR_LIMIT_3        = 0x54,
+    GPU_STATE_READ_PWR_LIMIT_READ     = 0x55,
+    GPU_STATE_READ_PWR_LIMIT_COMPLETE = 0x56,
+} gpuReadPwrLimitState_e;
+
+// States for setting GPU power limit (gpu_set_pwr_limit_sm)
+typedef enum
+{
+    GPU_STATE_SET_PWR_LIMIT_NEW      = 0x61,
+    GPU_STATE_SET_PWR_LIMIT_START    = 0x62,
+    GPU_STATE_SET_PWR_LIMIT_2        = 0x63,
+    GPU_STATE_SET_PWR_LIMIT_3        = 0x64,
+    GPU_STATE_SET_PWR_LIMIT_READ     = 0x65,
+    GPU_STATE_SET_PWR_LIMIT_COMPLETE = 0x66,
+} gpuSetPwrLimitState_e;
+
 // GPU IPC initialization
 void gpu_ipc_init();
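// The new state enums above keep each state machine in its own high nibble
// (0x2x memory temp, 0x3x capability, 0x4x driver check, 0x5x read limit,
// 0x6x set limit), so a single state byte in a trace identifies both the
// machine and the step.  A hypothetical decoder built on that convention:

#include <stdint.h>

static const char *gpu_sm_name(uint8_t state)
{
    switch (state >> 4)
    {
        case 0x2: return "READ_MEM_TEMP";
        case 0x3: return "READ_MEM_TEMP_CAPABLE";
        case 0x4: return "CHECK_DRIVER_LOADED";
        case 0x5: return "READ_PWR_LIMIT";
        case 0x6: return "SET_PWR_LIMIT";
        default:  return "UNKNOWN";
    }
}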
diff --git a/src/occ_405/gpu/gpu_service_codes.h b/src/occ_405/gpu/gpu_service_codes.h
index 41cb3f9..4ea7c6f 100755
--- a/src/occ_405/gpu/gpu_service_codes.h
+++ b/src/occ_405/gpu/gpu_service_codes.h
@@ -39,6 +39,9 @@ enum gpuModuleId
     GPU_MID_GPU_READ_TEMP             = GPU_COMP_ID | 0x06,
     GPU_MID_GPU_READ_MEM_TEMP         = GPU_COMP_ID | 0x07,
     GPU_MID_GPU_READ_MEM_TEMP_CAPABLE = GPU_COMP_ID | 0x08,
+    GPU_MID_GPU_CHECK_DRIVER_LOADED   = GPU_COMP_ID | 0x09,
+    GPU_MID_GPU_READ_PWR_LIMIT        = GPU_COMP_ID | 0x0A,
+    GPU_MID_GPU_SET_PWR_LIMIT         = GPU_COMP_ID | 0x0B,
 };
 
 #endif /* #ifndef _GPU_SERVICE_CODES_H_ */
diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h
index a379f87..c740c05 100644
--- a/src/occ_405/occ_service_codes.h
+++ b/src/occ_405/occ_service_codes.h
@@ -87,12 +87,14 @@ enum occReasonCode
     /// Firmware Failure: equivalent to assertion failures
     INTERNAL_FW_FAILURE = 0xA0,
 
+    /// Build problem, gpe1 image doesn't support GPU interface
+    GPU_NO_GPE_SUPPORT = 0xA1,
     /// Error with GPU tasks
-    GPU_FAILURE = 0xA1,
+    GPU_FAILURE = 0xA2,
     /// GPU core reached error threshold
-    GPU_ERROR_TEMP = 0xA2,
+    GPU_ERROR_TEMP = 0xA3,
     /// GPU memory reached error threshold
-    GPU_MEMORY_ERROR_TEMP = 0xA3,
+    GPU_MEMORY_ERROR_TEMP = 0xA4,
 
     /// Failure within the OCC Complex of the processor
     INTERNAL_HW_FAILURE = 0xB0,
@@ -289,7 +291,11 @@ enum occExtReasonCode
     ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE   = 0x00F6,
     ERC_GPU_INVALID_GPU_OPERATION           = 0x00F7,
     ERC_GPU_N_MODE_PCAP_CALC_FAILURE        = 0x00F8,
+    ERC_GPU_N_PLUS_1_MODE_PCAP_CALC_FAILURE = 0x00F9,
     ERC_GPU_NO_GPE_SUPPORT                  = 0x00FF,
+    ERC_GPU_CHECK_DRIVER_LOADED_FAILURE     = 0x0100,
+    ERC_GPU_READ_PWR_LIMIT_FAILURE          = 0x0101,
+    ERC_GPU_SET_PWR_LIMIT_FAILURE           = 0x0102,
 
     ERC_STATE_FROM_ALL_TO_STB_FAILURE       = 0x0123,
     ERC_STATE_FROM_ACT_TO_CHR_FAILURE       = 0x0124,
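// Because GPU_NO_GPE_SUPPORT is inserted at 0xA1, the three pre-existing GPU
// reason codes each shift up by one value; anything decoding committed error
// logs must use the table from the matching firmware level.  A hypothetical
// decoder for the new numbering:

#include <stdint.h>

static const char *gpu_reason_str(uint8_t rc)
{
    switch (rc)
    {
        case 0xA1: return "GPU_NO_GPE_SUPPORT";    // new at this level
        case 0xA2: return "GPU_FAILURE";           // was 0xA1
        case 0xA3: return "GPU_ERROR_TEMP";        // was 0xA2
        case 0xA4: return "GPU_MEMORY_ERROR_TEMP"; // was 0xA3
        default:   return "other";
    }
}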