author     mbroyles <mbroyles@us.ibm.com>          2017-08-06 19:08:00 -0500
committer  William A. Bryan <wilbryan@us.ibm.com>  2017-08-14 15:18:26 -0400
commit     8a335d83ed938f05f95ca1cfdbbb5292053ed51f (patch)
tree       bd2b38c6df596f3d3bf9f70f8a54a8a205e4e2e1
parent     71b5f68da8b725f9c5251261b41fd824e652e491 (diff)
download   talos-occ-8a335d83ed938f05f95ca1cfdbbb5292053ed51f.tar.gz
           talos-occ-8a335d83ed938f05f95ca1cfdbbb5292053ed51f.zip
Initial 405 GPU support
Change-Id: I6e957ca1aa643d257274e99957df5b15ac8c889b
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44254
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rw-r--r--   src/common/gpe_err.h                           6
-rw-r--r--   src/common/gpu_structs.h                      86
-rw-r--r--   src/common/ipc_func_ids.h                      1
-rwxr-xr-x   src/occ_405/Makefile                           3
-rwxr-xr-x   src/occ_405/amec/amec_sensors_power.c          6
-rwxr-xr-x   src/occ_405/amec/amec_sys.h                   55
-rwxr-xr-x   src/occ_405/cmdh/cmdh_fsp_cmds.c              19
-rwxr-xr-x   src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c     12
-rwxr-xr-x   src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h     16
-rwxr-xr-x   src/occ_405/gpu/gpu.c                       1559
-rw-r--r--   src/occ_405/gpu/gpu.h                        100
-rwxr-xr-x   src/occ_405/gpu/gpu_service_codes.h           44
-rw-r--r--   src/occ_405/img_defs.mk                        1
-rwxr-xr-x   src/occ_405/incl/comp_ids.h                    4
-rwxr-xr-x   src/occ_405/incl/occ_common.h                  6
-rw-r--r--   src/occ_405/occLinkInputFile                   1
-rw-r--r--   src/occ_405/occ_service_codes.h               19
-rwxr-xr-x   src/occ_405/sensor/sensor_enum.h               3
-rwxr-xr-x   src/occ_405/sensor/sensor_info.c              12
-rwxr-xr-x   src/occ_405/sensor/sensor_table.c             12
-rw-r--r--   src/occ_405/topfiles.mk                        1
-rw-r--r--   src/occ_gpe1/ipc_func_tables.c                26
22 files changed, 1963 insertions, 29 deletions
diff --git a/src/common/gpe_err.h b/src/common/gpe_err.h
index c4e9371..8580012 100644
--- a/src/common/gpe_err.h
+++ b/src/common/gpe_err.h
@@ -1,11 +1,11 @@
/* IBM_PROLOG_BEGIN_TAG */
/* This is an automatically generated prolog. */
/* */
-/* $Source: src/gpe_err.h $ */
+/* $Source: src/common/gpe_err.h $ */
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2016 */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -50,5 +50,7 @@
#define GPE_RC_GET_CORE_DATA_FAILED 0x60 // Failed to collect core data
#define GPE_RC_GET_NEST_DTS_FAILED 0x61 // Failed to collect nest DTS temperatures
+// GPU Errors
+#define GPE_RC_NO_GPU_SUPPORT 0x8F // GPE1 image doesn't support GPUs
#endif //_GPE_ERR_H
diff --git a/src/common/gpu_structs.h b/src/common/gpu_structs.h
new file mode 100644
index 0000000..03c8e06
--- /dev/null
+++ b/src/common/gpu_structs.h
@@ -0,0 +1,86 @@
+/* IBM_PROLOG_BEGIN_TAG */
+/* This is an automatically generated prolog. */
+/* */
+/* $Source: src/common/gpu_structs.h $ */
+/* */
+/* OpenPOWER OnChipController Project */
+/* */
+/* Contributors Listed Below - COPYRIGHT 2016,2017 */
+/* [+] International Business Machines Corp. */
+/* */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
+/* implied. See the License for the specific language governing */
+/* permissions and limitations under the License. */
+/* */
+/* IBM_PROLOG_END_TAG */
+
+/* This header file is used by both occ_405 and occ_gpe1. */
+/* Contains common structures and globals. */
+
+#ifndef _GPU_STRUCTS_H
+#define _GPU_STRUCTS_H
+
+#include "occ_util.h"
+#include <gpe_export.h>
+#include "gpe_err.h"
+
+#define MAX_GPUS 3
+
+#define GPU_RESET_REQ_MASTER 1
+#define GPU_RESET_REQ_SLV 2
+#define GPU_RESET_REQ_SLV_COMPLETE 3
+
+typedef enum
+{
+ ID_GPU0 = 0x00,
+ ID_GPU1 = 0x01,
+ ID_GPU2 = 0x02,
+ ID_ALL_GPUS = 0xFF
+} GPU_ID;
+
+typedef enum
+{
+ GPU_STATE_PRESENT = 0x00000001,
+ GPU_STATE_FAILED = 0x80000000,
+} GPU_STATE;
+
+// GPU Request Operations
+typedef enum
+{
+ GPU_REQ_INIT = 0x01, // Init interrupt registers
+ GPU_REQ_READ_CAPS_START = 0x02, // Start reading memory temp capability
+ GPU_REQ_READ_CAPS_STOP = 0x03, // Stop the capability read
+ GPU_REQ_READ_CAPS = 0x04, // Read the capability data
+ GPU_REQ_READ_TEMP_SIMPLE_START = 0x05, // Start reading GPU core temperature
+ GPU_REQ_READ_TEMP_SIMPLE_STOP = 0x06, // Stop the core temperature read
+ GPU_REQ_READ_TEMP_SIMPLE = 0x07, // Read the GPU core temperature
+ GPU_REQ_READ_TEMP_START = 0x08, // Start reading GPU memory temperature
+ GPU_REQ_READ_TEMP_STOP = 0x09, // Stop the memory temperature read
+ GPU_REQ_READ_TEMP = 0x0A, // Read the GPU memory temperature
+ GPU_REQ_READ_PWR_LIMIT_START = 0x0B, // Start reading GPU power limits
+ GPU_REQ_READ_PWR_LIMIT_STOP = 0x0C, // Stop the power limit read
+ GPU_REQ_READ_PWR_LIMIT = 0x0D, // Read the GPU power limits
+ GPU_REQ_RESET = 0x60, // Reset
+} gpu_op_req_e;
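+
+// Note: the capability and temperature reads above are driven by the occ_405 state machines
+// as three-step START/STOP/READ sequences (see src/occ_405/gpu/gpu.c); the power-limit
+// operations are defined here but not yet exercised in this commit.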
+
+// GPU arguments
+typedef struct
+{
+ GpeErrorStruct error;
+ uint8_t gpu_id;
+ uint8_t operation;
+ uint32_t data[MAX_GPUS];
+} gpu_sm_args_t;
+
+
+#endif // _GPU_STRUCTS_H
+
diff --git a/src/common/ipc_func_ids.h b/src/common/ipc_func_ids.h
index 3f759a5..9d3dd33 100644
--- a/src/common/ipc_func_ids.h
+++ b/src/common/ipc_func_ids.h
@@ -72,6 +72,7 @@ IPC_FUNCIDS_TABLE_START
IPC_FUNC_ID(IPC_ST_RESET_MEM_DEADMAN)
IPC_FUNC_ID(IPC_ST_24_X_7_FUNCID)
IPC_FUNC_ID(IPC_ST_MEM_POWER_CONTROL_FUNCID)
+ IPC_FUNC_ID(IPC_ST_GPU_SM_FUNCID)
IPC_FUNCIDS_ST_END(OCCHW_INST_ID_GPE1)
//Functions that are only supported by GPE2 should be defined here
diff --git a/src/occ_405/Makefile b/src/occ_405/Makefile
index f64a319..8a3e9bf 100755
--- a/src/occ_405/Makefile
+++ b/src/occ_405/Makefile
@@ -5,7 +5,7 @@
#
# OpenPOWER OnChipController Project
#
-# Contributors Listed Below - COPYRIGHT 2015,2016
+# Contributors Listed Below - COPYRIGHT 2015,2017
# [+] International Business Machines Corp.
#
#
@@ -56,6 +56,7 @@ LIB_DIRS = -L$(OBJDIR) \
-L$(OBJDIR)/dcom \
-L$(OBJDIR)/dimm \
-L$(OBJDIR)/errl \
+ -L$(OBJDIR)/gpu \
-L$(OBJDIR)/lock \
-L$(OBJDIR)/pss \
-L$(OBJDIR)/rtls \
diff --git a/src/occ_405/amec/amec_sensors_power.c b/src/occ_405/amec/amec_sensors_power.c
index 2233738..3820330 100755
--- a/src/occ_405/amec/amec_sensors_power.c
+++ b/src/occ_405/amec/amec_sensors_power.c
@@ -69,6 +69,7 @@ uint32_t G_curr_num_gpus_sys = 0;
#define ADC_CONVERTED_VALUE(i_chan) \
((i_chan < MAX_APSS_ADC_CHANNELS) ? G_lastValidAdcValue[i_chan] : 0)
+extern bool G_gpu_monitoring_allowed;
extern uint8_t G_occ_interrupt_type;
extern bool G_vrm_thermal_monitoring;
extern PWR_READING_TYPE G_pwr_reading_type;
@@ -821,6 +822,11 @@ void amec_update_gpu_configuration(void)
{
G_gpu_config_done = TRUE;
G_first_proc_gpu_config = l_valid_bitmask_proc;
+ if(G_first_proc_gpu_config)
+ {
+ // GPUs are present, enable monitoring
+ G_gpu_monitoring_allowed = TRUE;
+ }
G_first_sys_gpu_config = l_valid_bitmask_sys;
G_first_num_gpus_sys = l_num_gpus_sys;
TRAC_IMP("GPU presence detection completed. GPU configuration for this OCC: 0x%08X, total[%d]",
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index 74b2812..a45fb42 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -430,6 +430,53 @@ typedef struct
} amec_quad_t;
//-------------------------------------------------------------
+// GPU Structures
+//-------------------------------------------------------------
+
+typedef struct {
+ bool disabled; // GPU has been marked failed and no longer monitored
+ bool readOnce; // Comm has been established with GPU
+ bool overtempError; // Core OT error has been logged against GPU
+ bool memOvertempError; // Memory OT error has been logged against GPU
+ bool checkDriverLoaded; // Indicates if need to check if driver is loaded
+ bool driverLoaded; // Indicates if GPU driver is loaded
+ bool checkMemTempSupport; // Indicates if need to check if mem monitoring is supported
+ bool memTempSupported; // Indicates if memory temperature monitoring is supported
+ uint8_t memErrorCount; // count of consecutive GPU mem temp read failures
+ uint8_t errorCount; // count of consecutive GPU core temp read failures
+} gpuStatus_t;
+
+typedef struct {
+ bool check_pwr_limit; // Indicates if need to read power limits from GPU
+ bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max
+ uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU
+ uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU
+ uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set
+ uint32_t gpu_requested_pcap_mw; // Requested power cap in mW sent to GPU
+ uint32_t gpu_actual_pcap_mw; // Actual power cap in mW read back from the GPU
+} gpuPcap_t;
+
+
+typedef struct
+{
+ //-----------------------------------
+ // Sensors
+ //-----------------------------------
+ sensor_t tempgpu; // GPU core temperature
+ sensor_t tempgpumem; // GPU HBM temperature
+
+ //-----------------------------------
+ // Data
+ //-----------------------------------
+ // General Status of GPU
+ gpuStatus_t status;
+
+ // GPU Power Cap Information
+ gpuPcap_t pcap;
+
+} amec_gpu_t;
+
+//-------------------------------------------------------------
// Proc Structure
//-------------------------------------------------------------
typedef struct
@@ -468,11 +515,6 @@ typedef struct
// Nimbus DIMM Sensors
sensor_t tempdimm[NUM_DIMM_PORTS*NUM_DIMMS_PER_I2CPORT];
- // GPU Sensors
- sensor_t tempgpu0;
- sensor_t tempgpu1;
- sensor_t tempgpu2;
-
sensor_t curvdn;
sensor_t pwrvdd;
sensor_t pwrvdn;
@@ -607,6 +649,9 @@ typedef struct
// in the hopes of perhaps reusing some code from previous projects.
amec_proc_t proc[NUM_PROC_CHIPS_PER_OCC];
+ // GPU Data
+ amec_gpu_t gpu[MAX_NUM_GPU_PER_DOMAIN];
+
// OCC Firmware Data
amec_fw_t fw;
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 84f14bc..cb3835c 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -248,7 +248,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
l_sensorHeader.count = 0;
//Initialize to max number of possible temperature sensors.
- l_max_sensors = MAX_NUM_CORES + MAX_NUM_MEM_CONTROLLERS + (MAX_NUM_MEM_CONTROLLERS * NUM_DIMMS_PER_CENTAUR) + MAX_NUM_GPU_PER_DOMAIN;
+ l_max_sensors = MAX_NUM_CORES + MAX_NUM_MEM_CONTROLLERS + (MAX_NUM_MEM_CONTROLLERS * NUM_DIMMS_PER_CENTAUR) + (MAX_NUM_GPU_PER_DOMAIN * 2);
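+ // (each GPU now contributes two temperature sensors, core and memory, hence the * 2)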
l_max_sensors++; // +1 for VRM
cmdh_poll_temp_sensor_t l_tempSensorList[l_max_sensors];
memset(l_tempSensorList, 0x00, sizeof(l_tempSensorList));
@@ -351,12 +351,25 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
// Add GPU temperatures
for (k=0; k<MAX_NUM_GPU_PER_DOMAIN; k++)
{
- if(GPU_PRESENT(k))
+ if(GPU_PRESENT(k)) // temporary: until GPU sensor IDs are sent, make sensor ids "GPU"<gpu#>
{
- l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid;
+ // GPU core temperature
+ if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp
+ l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid;
+ else
+ l_tempSensorList[l_sensorHeader.count].id = 0x47505500 | k; // temp
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU;
l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF;
l_sensorHeader.count++;
+
+ // GPU memory temperature
+ if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp
+ l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid;
+ else
+ l_tempSensorList[l_sensorHeader.count].id = 0x47505500 | k; // temp
+ l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU_MEM;
+ l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF;
+ l_sensorHeader.count++;
}
}
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c
index f285143..34f2e0d 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c
@@ -1209,12 +1209,18 @@ errlHndl_t data_store_gpu(const cmdh_fsp_cmd_t * i_cmd_ptr,
{
G_sysConfigData.total_non_gpu_max_pwr_watts = l_cmd_ptr->total_non_gpu_max_pwr_watts;
G_sysConfigData.total_proc_mem_pwr_drop_watts = l_cmd_ptr->total_proc_mem_pwr_drop_watts;
- G_sysConfigData.gpu_sensor_ids[0] = l_cmd_ptr->gpu0_sid;
+
AMECSENSOR_PTR(TEMPGPU0)->ipmi_sid = l_cmd_ptr->gpu0_temp_sid;
- G_sysConfigData.gpu_sensor_ids[1] = l_cmd_ptr->gpu1_sid;
+ AMECSENSOR_PTR(TEMPGPU0MEM)->ipmi_sid = l_cmd_ptr->gpu0_mem_temp_sid;
+ G_sysConfigData.gpu_sensor_ids[0] = l_cmd_ptr->gpu0_sid;
+
AMECSENSOR_PTR(TEMPGPU1)->ipmi_sid = l_cmd_ptr->gpu1_temp_sid;
- G_sysConfigData.gpu_sensor_ids[2] = l_cmd_ptr->gpu2_sid;
+ AMECSENSOR_PTR(TEMPGPU1MEM)->ipmi_sid = l_cmd_ptr->gpu1_mem_temp_sid;
+ G_sysConfigData.gpu_sensor_ids[1] = l_cmd_ptr->gpu1_sid;
+
AMECSENSOR_PTR(TEMPGPU2)->ipmi_sid = l_cmd_ptr->gpu2_temp_sid;
+ AMECSENSOR_PTR(TEMPGPU2MEM)->ipmi_sid = l_cmd_ptr->gpu2_mem_temp_sid;
+ G_sysConfigData.gpu_sensor_ids[2] = l_cmd_ptr->gpu2_sid;
G_data_cnfg->data_mask |= DATA_MASK_GPU;
CMDH_TRAC_IMP("data_store_gpu: Got valid GPU data packet");
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h
index 4a2679c..fcb4893 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h
@@ -78,6 +78,7 @@ typedef enum
DATA_FRU_DIMM = 0x02,
DATA_FRU_VRM = 0x03,
DATA_FRU_GPU = 0x04,
+ DATA_FRU_GPU_MEM = 0x05,
DATA_FRU_MAX,
} eConfigDataFruType;
@@ -171,12 +172,15 @@ typedef struct __attribute__ ((packed))
uint16_t total_non_gpu_max_pwr_watts;
uint16_t total_proc_mem_pwr_drop_watts;
uint16_t reserved;
- uint32_t gpu0_sid; // GPU0 Sensor ID
- uint32_t gpu0_temp_sid; // GPU0 Temperature Sensor ID
- uint32_t gpu1_sid; // GPU1 Sensor ID
- uint32_t gpu1_temp_sid; // GPU1 Temperature Sensor ID
- uint32_t gpu2_sid; // GPU2 Sensor ID
- uint32_t gpu2_temp_sid; // GPU2 Temperature Sensor ID
+ uint32_t gpu0_temp_sid; // GPU0 Temperature Sensor ID
+ uint32_t gpu0_mem_temp_sid; // GPU0 Memory Temperature Sensor ID
+ uint32_t gpu0_sid; // GPU0 Sensor ID for callout
+ uint32_t gpu1_temp_sid; // GPU1 Temperature Sensor ID
+ uint32_t gpu1_mem_temp_sid; // GPU1 Memory Temperature Sensor ID
+ uint32_t gpu1_sid; // GPU1 Sensor ID for callout
+ uint32_t gpu2_temp_sid; // GPU2 Temperature Sensor ID
+ uint32_t gpu2_mem_temp_sid; // GPU2 Memory Temperature Sensor ID
+ uint32_t gpu2_sid; // GPU2 Sensor ID for callout
}cmdh_gpu_config_t;
// Used by TMGT to send OCC the PCAP config data.
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
new file mode 100755
index 0000000..5334e31
--- /dev/null
+++ b/src/occ_405/gpu/gpu.c
@@ -0,0 +1,1559 @@
+/* IBM_PROLOG_BEGIN_TAG */
+/* This is an automatically generated prolog. */
+/* */
+/* $Source: src/occ_405/gpu/gpu.c $ */
+/* */
+/* OpenPOWER OnChipController Project */
+/* */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
+/* [+] International Business Machines Corp. */
+/* */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
+/* implied. See the License for the specific language governing */
+/* permissions and limitations under the License. */
+/* */
+/* IBM_PROLOG_END_TAG */
+
+//#define GPU_DEBUG
+#ifdef GPU_DEBUG
+ #define GPU_DBG(frmt,args...) DBG_PRINT(frmt,##args)
+#else
+ #define GPU_DBG(frmt,args...)
+#endif
+
+#include <ssx.h>
+#include <occhw_async.h>
+
+#include <trac_interface.h>
+#include <trac.h>
+#include <occ_common.h>
+#include <comp_ids.h>
+#include <occ_service_codes.h>
+#include <state.h>
+#include <occ_sys_config.h>
+#include "sensor.h"
+#include "amec_sys.h"
+#include "lock.h"
+#include "common.h"
+#include "amec_health.h"
+#include "gpu.h"
+#include "gpu_structs.h"
+#include "gpu_service_codes.h"
+
+#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) ) // Number of calls in 1s, assuming this task is called every other tick
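+// (For illustration only: assuming MICS_PER_TICK were 500us, this would work out to
+// 1,000,000 / (500 * 2) = 1000 calls, so counting this many calls approximates a
+// 1 second interval between the starts of temperature reads.)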
+
+// Time in seconds to ignore errors from the start of GPU SM
+// Right now this time must include PRST and GPU init time
+// this may be reduced after adding in OS interlock for PRST
+#define GPU_COMM_ESTAB_TIMEOUT_SECONDS 600
+
+#define MAX_CONSECUTIVE_GPU_RESETS 3
+#define MAX_GPU_RESET_STATE_RETRY 3
+#define MAX_RESET_STATE_NOT_DONE_COUNT 100
+#define MAX_GPU_READ_ATTEMPT 3
+#define GPU_I2C_ENGINE PIB_I2C_ENGINE_C
+
+extern data_cnfg_t * G_data_cnfg;
+
+// This is the global GPU task SM state; each task within the GPU SM may have its own "state"
+// to allow several calls to complete the task
+gpuState_e G_gpu_state = GPU_STATE_IDLE;
+
+bool G_gpu_monitoring_allowed = FALSE; // Set to true if GPU is present
+bool G_gpu_i2c_reset_required = FALSE;
+uint32_t G_gpu_reset_cause = 0;
+uint64_t G_gpu_sm_start_time = 0;
+
+// GPE Requests
+GpeRequest G_gpu_op_request;
+
+// GPE arguments
+GPE_BUFFER(gpu_sm_args_t G_gpu_op_req_args);
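+// (Assumption: GPE_BUFFER is expected to place this argument structure in memory the GPE1
+// engine can access, so the 405 and GPE1 share it across GPU IPC requests.)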
+
+gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};
+
+uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed
+bool G_gpu_read_issued = false;
+
+// Read OCC_MISC register to see if an I2C interrupt was generated for
+// the specified engine.
+bool check_for_i2c_interrupt(const uint8_t i_engine);
+
+// Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
+uint8_t get_first_gpu(void)
+{
+ uint8_t first_gpu = 0xFF; // default no GPUs present/functional
+ uint8_t i = 0;
+
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) )
+ {
+ first_gpu = i;
+ break;
+ }
+ }
+ return first_gpu;
+}
+
+// Get GPU number for next present non-failed GPU from G_current_gpu_id
+// returns 0xFF if there is no next GPU i.e. wrapped back to first GPU
+uint8_t get_next_gpu(void)
+{
+ uint8_t next_gpu = G_current_gpu_id;
+
+ if(G_current_gpu_id != 0xFF)
+ {
+ do
+ {
+ if(++next_gpu == MAX_NUM_GPU_PER_DOMAIN)
+ {
+ next_gpu = 0;
+ }
+ if( (GPU_PRESENT(next_gpu)) && (!g_amec->gpu[next_gpu].status.disabled) )
+ {
+ break;
+ }
+ }while(next_gpu != G_current_gpu_id);
+ }
+
+ if(next_gpu == get_first_gpu())
+ {
+ next_gpu = 0xFF;
+ }
+
+ return next_gpu;
+}
+
+// Get GPU number for a GPU that needs to be checked if driver is loaded
+// returns 0xFF if no GPU needs to be checked
+uint8_t gpu_id_need_driver_check(void)
+{
+ uint8_t gpu_id = 0xFF; // default none needs checking
+ uint8_t i = 0;
+
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkDriverLoaded))
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+ return gpu_id;
+}
+
+uint8_t gpu_id_need_memory_temp_capability_check(void)
+{
+ uint8_t gpu_id = 0xFF; // default none needs checking
+ uint8_t i = 0;
+
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ if((GPU_PRESENT(i)) && (g_amec->gpu[i].status.checkMemTempSupport))
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+ return gpu_id;
+}
+
+// Find first functional GPU with memory temp capability
+// returns 0xFF if no functional GPU has memory temp capability
+uint8_t get_first_mem_temp_capable_gpu(void)
+{
+ uint8_t first_gpu = 0xFF; // default no GPU with mem temp capability
+ uint8_t i = 0;
+
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ if( (!g_amec->gpu[i].status.disabled) &&
+ (g_amec->gpu[i].status.memTempSupported) )
+ {
+ first_gpu = i;
+ break;
+ }
+ }
+ return first_gpu;
+}
+
+// Get GPU number for next functional GPU from G_current_gpu_id with mem temp capability
+// returns 0xFF if there is no next GPU i.e. wrapped back to first GPU with mem temp
+uint8_t get_next_mem_temp_capable_gpu(void)
+{
+ uint8_t next_gpu = G_current_gpu_id;
+
+ if(G_current_gpu_id != 0xFF)
+ {
+ do
+ {
+ if(++next_gpu == MAX_NUM_GPU_PER_DOMAIN)
+ {
+ next_gpu = 0;
+ }
+ if( (!g_amec->gpu[next_gpu].status.disabled) &&
+ (g_amec->gpu[next_gpu].status.memTempSupported) )
+ {
+ break;
+ }
+ }while(next_gpu != G_current_gpu_id);
+ }
+
+ if(next_gpu == get_first_mem_temp_capable_gpu())
+ {
+ next_gpu = 0xFF;
+ }
+
+ return next_gpu;
+}
+
+
+// Get GPU number for a GPU that needs power limits read
+// returns 0xFF if no GPU needs power limits read
+uint8_t gpu_id_need_power_limits(void)
+{
+ uint8_t gpu_id = 0xFF; // default none
+ uint8_t i = 0;
+
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ // to read power limits requires that the driver is loaded
+ if( (g_amec->gpu[i].status.driverLoaded) &&
+ (g_amec->gpu[i].pcap.check_pwr_limit))
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+ return gpu_id;
+}
+
+// Get GPU number for a GPU that needs power limit set
+// returns 0xFF if no GPU needs power limit set
+uint8_t gpu_id_need_set_power_limit(void)
+{
+ uint8_t gpu_id = 0xFF; // default none
+ uint8_t i = 0;
+
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ // to set power limit requires that the driver is loaded
+ if( (g_amec->gpu[i].status.driverLoaded) &&
+ (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) )
+ {
+ gpu_id = i;
+ break;
+ }
+ }
+ return gpu_id;
+}
+
+// Disable GPU monitoring for all GPUs
+void disable_all_gpus(void)
+{
+ uint8_t i = 0;
+
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_monitoring_allowed = FALSE;
+
+ // mark all GPUs as disabled
+ for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+ {
+ g_amec->gpu[i].status.disabled = TRUE;
+ }
+}
+
+// Create GPU IPC requests
+void gpu_ipc_init()
+{
+ errlHndl_t l_err = NULL;
+ int rc = 0;
+
+ do
+ {
+ // Initialize IPC request for GPU operation requests
+ GPU_DBG("gpu_ipc_init: Creating GPE1 IPC request for GPU op requests");
+ rc = gpe_request_create(&G_gpu_op_request,
+ &G_async_gpe_queue1,
+ IPC_ST_GPU_SM_FUNCID,
+ &G_gpu_op_req_args,
+ SSX_WAIT_FOREVER,
+ NULL, // no callback/arg
+ NULL,
+ ASYNC_CALLBACK_IMMEDIATE);
+ if (rc)
+ {
+ TRAC_ERR("gpu_ipc_init: Failed to create GPE1 IPC request for GPU op req (rc=%d)", rc);
+ break;
+ }
+ }
+ while(0);
+
+ if (rc)
+ {
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_INIT
+ * @reasoncode SSX_GENERIC_FAILURE
+ * @userdata1 return code
+ * @userdata4 OCC_NO_EXTENDED_RC
+ * @devdesc Failed to create GPE1 GPU IPC request
+ */
+ l_err = createErrl(GPU_MID_INIT,
+ SSX_GENERIC_FAILURE,
+ OCC_NO_EXTENDED_RC,
+ ERRL_SEV_PREDICTIVE,
+ NULL, // trace buffer
+ DEFAULT_TRACE_SIZE,
+ rc,
+ 0);
+
+ REQUEST_RESET(l_err);
+
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_monitoring_allowed = FALSE;
+ }
+}
+
+// Called after a failure for a specified GPU. The error will
+// be counted and, if the threshold is reached, an error will be logged with
+// the GPU as a callout; a flag is then set to force a reset
+void mark_gpu_failed(const gpu_sm_args_t *i_arg)
+{
+ uint32_t gpu_id = i_arg->gpu_id;
+
+ // ignore all errors if haven't reached timeout for comm established
+ if( (false == g_amec->gpu[gpu_id].status.readOnce) &&
+ (DURATION_IN_S_UNTIL_NOW_FROM(G_gpu_sm_start_time) < GPU_COMM_ESTAB_TIMEOUT_SECONDS) )
+ {
+ // do nothing at this time
+ return;
+ }
+ if((false == g_amec->gpu[gpu_id].status.disabled) &&
+ (true == g_amec->gpu[gpu_id].status.readOnce))
+ {
+ INTR_TRAC_ERR("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
+ "(ffdc 0x%08X%08X)",
+ gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
+ WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
+ }
+
+ if( ( ++g_amec->gpu[gpu_id].status.errorCount > MAX_CONSECUTIVE_GPU_RESETS) &&
+ (false == g_amec->gpu[gpu_id].status.disabled) &&
+ (true == g_amec->gpu[gpu_id].status.readOnce))
+ {
+ G_gpu_state = GPU_STATE_IDLE;
+ // Disable this GPU, collect FFDC and log error
+ g_amec->gpu[gpu_id].status.disabled = true;
+
+ INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
+ gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);
+ errlHndl_t l_err = NULL;
+ /*
+ * @errortype
+ * @moduleid GPU_MID_MARK_GPU_FAILED
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPE returned rc code
+ * @userdata4 ERC_GPU_COMPLETE_FAILURE
+ * @devdesc GPU failure
+ */
+ l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
+ GPU_FAILURE,
+ ERC_GPU_COMPLETE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ i_arg->error.rc,
+ 0);
+
+ addUsrDtlsToErrl(l_err,
+ (uint8_t*)&i_arg->error.ffdc,
+ sizeof(i_arg->error.ffdc),
+ ERRL_STRUCT_VERSION_1,
+ ERRL_USR_DTL_BINARY_DATA);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ commitErrl(&l_err);
+ }
+
+ // Reset GPU
+ G_gpu_i2c_reset_required = true;
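+ // record the cause: GPU id in the top byte, GPE return code in the low 16 bits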
+ G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF);
+} // end mark_gpu_failed()
+
+// Schedule a GPE request for GPU operation
+bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
+{
+ bool l_scheduled = false;
+ bool scheduleRequest = true;
+ errlHndl_t err = NULL;
+
+ GPU_DBG(">>schedule_gpu_req(op 0x%02X)", i_operation);
+
+ if (!async_request_is_idle(&G_gpu_op_request.request))
+ {
+ INTR_TRAC_INFO("E>schedule_gpu_req: prior request (op 0x%02X) not idle when scheduling 0x%02X (tick=%d)",
+ G_gpu_op_req_args.operation, i_operation, GPU_TICK);
+ }
+ else
+ {
+ // Ready for next request
+ G_gpu_op_req_args = i_new_args;
+ switch(i_operation)
+ {
+ // Init
+ case GPU_REQ_INIT:
+ break;
+
+ // Read GPU memory temp capability
+ case GPU_REQ_READ_CAPS_START:
+ case GPU_REQ_READ_CAPS_STOP:
+ case GPU_REQ_READ_CAPS:
+ break;
+
+ // Read GPU memory temp
+ case GPU_REQ_READ_TEMP_START:
+ case GPU_REQ_READ_TEMP_STOP:
+ case GPU_REQ_READ_TEMP:
+ break;
+
+ // Read GPU core temp
+ case GPU_REQ_READ_TEMP_SIMPLE_START:
+ case GPU_REQ_READ_TEMP_SIMPLE_STOP:
+ case GPU_REQ_READ_TEMP_SIMPLE:
+ break;
+
+ // I2C reset
+ case GPU_REQ_RESET:
+ break;
+
+ default:
+ INTR_TRAC_ERR("schedule_gpu_req: Invalid GPU request operation: 0x%02X", i_operation);
+ /*
+ * @errortype
+ * @moduleid GPU_MID_GPU_SCHED_REQ
+ * @reasoncode GPU_FAILURE
+ * @userdata1 operation
+ * @userdata2 0
+ * @userdata4 ERC_GPU_INVALID_GPU_OPERATION
+ * @devdesc Invalid GPU request operation
+ */
+ err = createErrl(GPU_MID_GPU_SCHED_REQ,
+ GPU_FAILURE,
+ ERC_GPU_INVALID_GPU_OPERATION,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ i_operation,
+ 0);
+
+ commitErrl(&err);
+ scheduleRequest = false;
+
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_monitoring_allowed = FALSE;
+ break;
+ }
+
+ if (scheduleRequest)
+ {
+ // Clear errors and init common arguments for GPE
+ G_gpu_op_req_args.error.error = 0;
+ G_gpu_op_req_args.operation = i_operation;
+ G_gpu_op_req_args.gpu_id = G_current_gpu_id;
+
+ GPU_DBG("schedule_gpu_req: Scheduling GPE1 GPU operation 0x%02X (tick %d)", i_operation, GPU_TICK);
+ int l_rc = gpe_request_schedule(&G_gpu_op_request);
+ if (0 == l_rc)
+ {
+ l_scheduled = true;
+ }
+ else
+ {
+ INTR_TRAC_ERR("schedule_gpu_req: schedule failed w/rc=0x%08X (%d us)",
+ l_rc, (int) ((ssx_timebase_get())/(SSX_TIMEBASE_FREQUENCY_HZ/1000000)));
+ /*
+ * @errortype
+ * @moduleid GPU_MID_GPU_SCHED_REQ
+ * @reasoncode SSX_GENERIC_FAILURE
+ * @userdata1 GPE schedule returned code
+ * @userdata2 GPU operation
+ * @userdata4 ERC_GPU_SCHEDULE_FAILURE
+ * @devdesc Failed to schedule GPU operation request
+ */
+ err = createErrl(GPU_MID_GPU_SCHED_REQ,
+ SSX_GENERIC_FAILURE,
+ ERC_GPU_SCHEDULE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ l_rc,
+ i_operation);
+ commitErrl(&err);
+
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_monitoring_allowed = FALSE;
+ }
+ }
+ }
+
+ return l_scheduled;
+
+} // end schedule_gpu_req()
+
+// Function Specification
+//
+// Name: gpu_reset_sm
+//
+// Description: GPU Reset State Machine. This is not called per GPU; if any per-GPU handling is
+// needed, this function must handle it and must not indicate that the reset is complete
+// until all present GPUs are ready
+//
+// End Function Specification
+bool gpu_reset_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the reset AND initialization is complete
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_state_retry_count = 0;
+ static uint8_t L_consec_reset_failure_count = 0;
+ static gpuResetState_e L_reset_state = GPU_RESET_STATE_NEW; // 1st state for a reset
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // check if the previous state was successfully scheduled and completed successfully
+ if( (L_reset_state != GPU_RESET_STATE_NEW) &&
+ (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // Check if failure was due to GPE image not having GPU support
+ if(G_gpu_op_req_args.error.rc == GPE_RC_NO_GPU_SUPPORT)
+ {
+ // No GPU Support, log error and disable all GPUs
+ INTR_TRAC_ERR("gpu_reset_sm: GPE image doesn't support GPUs!");
+
+ /*
+ * @errortype
+ * @moduleid GPU_MID_GPU_RESET_SM
+ * @reasoncode GPU_FAILURE
+ * @userdata1 0
+ * @userdata2 0
+ * @userdata4 ERC_GPU_NO_GPE_SUPPORT
+ * @devdesc GPE1 image doesn't support GPU communication
+ */
+ errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
+ GPU_FAILURE,
+ ERC_GPU_NO_GPE_SUPPORT,
+ ERRL_SEV_UNRECOVERABLE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ 0,
+ 0);
+ commitErrl(&err);
+
+ disable_all_gpus();
+
+ L_reset_state = GPU_RESET_STATE_NEW;
+ return FALSE; // GPUs are not ready for communication
+ }
+ else
+ {
+ // Stay in current state if haven't reached state retry count
+ if(L_state_retry_count < MAX_GPU_RESET_STATE_RETRY)
+ {
+ // INC state retry count and retry current state
+ L_state_retry_count++;
+ }
+ else // this reset attempt failed
+ {
+ // Stop trying if reached max resets
+ if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS)
+ {
+ INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X",
+ L_reset_state);
+
+ /*
+ * @errortype
+ * @moduleid GPU_MID_GPU_RESET_SM
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU reset state
+ * @userdata2 0
+ * @userdata4 ERC_GPU_RESET_FAILURE
+ * @devdesc Failure resetting GPU interface
+ */
+ errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
+ GPU_FAILURE,
+ ERC_GPU_RESET_FAILURE,
+ ERRL_SEV_UNRECOVERABLE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ L_reset_state,
+ 0);
+ commitErrl(&err);
+
+ disable_all_gpus();
+
+ L_reset_state = GPU_RESET_STATE_NEW;
+ return FALSE; // GPUs are not ready for communication
+ }
+ else // try the reset again from the beginning
+ {
+ L_consec_reset_failure_count++;
+ L_state_retry_count = 0;
+ L_reset_state = GPU_RESET_STATE_RESET_MASTER;
+ }
+ } // else reset attempt failed
+ } // else GPE supports GPU
+ }// if previous state failed
+ else // success on last state go to next state and process it
+ {
+ L_state_retry_count = 0;
+ L_reset_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_reset_state)
+ {
+ case GPU_RESET_STATE_RESET_MASTER:
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER;
+ L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
+ break;
+
+ case GPU_RESET_STATE_RESET_SLAVE:
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV;
+ L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
+ break;
+
+ case GPU_RESET_STATE_RESET_SLAVE_WAIT:
+ // Delay to allow reset to complete
+ GPU_DBG("gpu_reset_sm: waiting during slave port 4 reset");
+ break;
+
+ case GPU_RESET_STATE_RESET_SLAVE_COMPLETE:
+ G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE;
+ L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
+ break;
+
+ case GPU_RESET_STATE_INIT:
+ // Notify GPE which GPUs are present
+ G_new_gpu_req_args.data[0] = (GPU_PRESENT(ID_GPU0)) ? GPU_STATE_PRESENT : 0;
+ G_new_gpu_req_args.data[1] = (GPU_PRESENT(ID_GPU1)) ? GPU_STATE_PRESENT : 0;
+ G_new_gpu_req_args.data[2] = (GPU_PRESENT(ID_GPU2)) ? GPU_STATE_PRESENT : 0;
+ // Setup I2C Interrupt Mask Register and Mode
+ L_scheduled = schedule_gpu_req(GPU_REQ_INIT, G_new_gpu_req_args);
+ break;
+
+ case GPU_RESET_STATE_INIT_COMPLETE:
+ // Reset and init is complete ready to start sending commands to the GPUs
+ l_complete = TRUE;
+ L_consec_reset_failure_count = 0;
+ // next time this is called will be to start a new reset
+ L_reset_state = GPU_RESET_STATE_NEW;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_reset_sm: INVALID STATE: 0x%02X when reset is required", L_reset_state);
+ L_reset_state = GPU_RESET_STATE_NEW;
+ break;
+ } // switch L_reset_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_reset_sm: Scheduled reset state 0x%02X", L_reset_state);
+ }
+ // check if the state was expected to have a schedule. Only new and slave wait
+ // don't schedule; for all other states the schedule must have failed
+ else if( (L_reset_state != GPU_RESET_STATE_NEW) &&
+ (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) )
+ {
+ INTR_TRAC_ERR("gpu_reset_sm: failed to schedule state 0x%02X", L_reset_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_reset_sm: NOT idle for state 0x%02X", L_reset_state);
+ }
+
+ return l_complete;
+} // end gpu_reset_sm()
+
+// Function Specification
+//
+// Name: gpu_read_temp_sm
+//
+// Description: Called from gpu_task_sm to read GPU core temperature of G_current_gpu_id
+// This function should only return that complete is TRUE when the temperature
+// read is complete (or determined failed) and ready to start reading a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read and
+// verified G_gpu_op_request is idle to allow scheduling
+// End Function Specification
+bool gpu_read_temp_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ uint16_t l_temp = 0;
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_read_failure_count = 0;
+ static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_read_temp_state != GPU_STATE_READ_TEMP_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // If reached retry count give up on this GPU
+ if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ mark_gpu_failed(&G_gpu_op_req_args);
+
+ L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+ else
+ {
+ // INC failure count and retry current state
+ L_read_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_read_failure_count = 0;
+ L_read_temp_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_read_temp_state)
+ {
+ case GPU_STATE_READ_TEMP_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_SIMPLE_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_TEMP_STOP:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_SIMPLE_STOP, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_TEMP_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_SIMPLE, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_TEMP_COMPLETE:
+ if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
+ (0 != G_gpu_op_req_args.data[0]) ) // TODO: check for valid temp?
+ {
+ g_amec->gpu[G_current_gpu_id].status.readOnce = true;
+ TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d",
+ G_current_gpu_id, CURRENT_TICK);
+ // comm is now established; set flags so capability and driver checks will take place
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
+ }
+ // Update sensor
+ l_temp = G_gpu_op_req_args.data[0] >> 24;
+ sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);
+
+ // Clear all past errors
+ g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
+
+ // check if there is an overtemp that hasn't been reported
+ if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
+ (l_temp > G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
+ (!g_amec->gpu[G_current_gpu_id].status.overtempError) )
+ {
+ g_amec->gpu[G_current_gpu_id].status.overtempError = TRUE;
+
+ INTR_TRAC_ERR("gpu_read_temp: GPU%d OT! temp[%d]",
+ G_current_gpu_id, l_temp);
+
+ // Log an OT error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_TEMP
+ * @reasoncode GPU_ERROR_TEMP
+ * @userdata1 GPU ID
+ * @userdata2 GPU core temperature
+ * @userdata4 OCC_NO_EXTENDED_RC
+ * @devdesc GPU core has reached error temperature
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_TEMP,
+ GPU_ERROR_TEMP,
+ OCC_NO_EXTENDED_RC,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ l_temp);
+
+ // Callout the over temperature procedure
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_COMPONENT_ID,
+ ERRL_COMPONENT_ID_OVER_TEMPERATURE,
+ ERRL_CALLOUT_PRIORITY_HIGH);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+
+ } // if OT error
+
+ // Done with this GPU ready to move to new one
+ L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_read_temp_sm: INVALID STATE: 0x%02X", L_read_temp_state);
+ L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_read_temp_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_read_temp_sm: Scheduled read temp state 0x%02X at tick %d",
+ L_read_temp_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_read_temp_sm: failed to schedule state 0x%02X", L_read_temp_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_read_temp_sm: NOT idle for state 0x%02X", L_read_temp_state);
+ }
+
+ return l_complete;
+} // end gpu_read_temp_sm()
+
+// Function Specification
+//
+// Name: gpu_read_mem_temp_capability_sm
+//
+// Description: Called from gpu_task_sm to read GPU memory temp capability of G_current_gpu_id
+// This function should only return that complete is TRUE when the capability
+// read is complete (or determined failed) and ready to start reading a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read
+//
+// End Function Specification
+bool gpu_read_mem_temp_capability_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_read_failure_count = 0;
+ static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_read_cap_state != GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // If reached retry count give up on this read
+ if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // log error that memory temp capability couldn't be determined
+ // memory temp support will be left as not supported
+ INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d", G_current_gpu_id);
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 0
+ * @userdata4 ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
+ * @devdesc Failure to read GPU memory temp capability
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
+ GPU_FAILURE,
+ ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ 0);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+
+ L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+ else
+ {
+ // INC failure count and retry current state
+ L_read_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_read_failure_count = 0;
+ L_read_cap_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_read_cap_state)
+ {
+ case GPU_STATE_READ_MEM_TEMP_CAPABLE_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_CAPABLE_STOP:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_STOP, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_CAPABLE_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
+ // Update capability
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;
+
+ // Done with this GPU ready to move to new one
+ L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_read_mem_temp_capable: INVALID STATE: 0x%02X", L_read_cap_state);
+ L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_read_cap_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_read_mem_temp_capable: Scheduled read temp capability state 0x%02X at tick %d",
+ L_read_cap_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_read_mem_temp_capable: failed to schedule state 0x%02X", L_read_cap_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_read_mem_temp_capable: NOT idle for state 0x%02X", L_read_cap_state);
+ }
+
+ return l_complete;
+} // end gpu_read_mem_temp_capability_sm()
+
+// Function Specification
+//
+// Name: gpu_read_memory_temp_sm
+//
+// Description: Called from gpu_task_sm to read GPU memory temperature of G_current_gpu_id
+// This function should only return that complete is TRUE when the temperature
+// read is complete (or determined failed) and ready to start reading a different GPU
+//
+// Pre-Req: Caller must have G_current_gpu_id set for GPU to read
+//
+// End Function Specification
+bool gpu_read_memory_temp_sm()
+{
+ bool l_complete = FALSE; // only return TRUE when the read is complete or failed
+ uint16_t l_temp = 0;
+ static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
+ static uint8_t L_read_failure_count = 0;
+ static gpuReadMemTempState_e L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW; // 1st state for reading temp
+
+ if (async_request_is_idle(&G_gpu_op_request.request))
+ {
+ // If not starting a new read then need to check status of current state before moving on
+ // stay in current state if the schedule failed or the state isn't finished/failed
+ if( (L_read_temp_state != GPU_STATE_READ_MEM_TEMP_NEW) &&
+ (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
+ {
+ // If reached retry count give up on this read
+ if(L_read_failure_count > MAX_GPU_READ_ATTEMPT)
+ {
+ // INC memory error count and check if reached timeout threshold for new mem temp
+ uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount++;
+ if((max_read_timeout) && (max_read_timeout != 0xFF) &&
+ (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
+ {
+ // Disable memory temp reading for this GPU and log error
+ g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
+ // set the sensor to 0xFFFF so the BMC knows there is an error for fan control
+ sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), 0xFFFF);
+
+ INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
+ G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+ // Log error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP
+ * @reasoncode GPU_FAILURE
+ * @userdata1 GPU ID
+ * @userdata2 number consecutive read mem temp failures
+ * @userdata4 ERC_GPU_READ_MEM_TEMP_TIMEOUT
+ * @devdesc Timeout reading new GPU memory temperature
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
+ GPU_FAILURE,
+ ERC_GPU_READ_MEM_TEMP_TIMEOUT,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+
+ } // if timeout error
+
+ L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+ return TRUE; // Done with this GPU, let GPU SM move to next
+ }
+ else
+ {
+ // INC failure count and retry current state
+ L_read_failure_count++;
+ }
+ }
+ else // success on last state go to next state and process it
+ {
+ L_read_failure_count = 0;
+ L_read_temp_state++;
+ }
+
+ L_scheduled = FALSE; // default nothing scheduled
+
+ switch (L_read_temp_state)
+ {
+ case GPU_STATE_READ_MEM_TEMP_START:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_STOP:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_STOP, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_READ:
+ L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP, G_new_gpu_req_args);
+ break;
+
+ case GPU_STATE_READ_MEM_TEMP_COMPLETE:
+ // Update sensor
+ l_temp = G_gpu_op_req_args.data[0] >> 24;
+ sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
+
+ // Clear past errors
+ g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
+
+ // check if there is an overtemp that hasn't been reported
+ if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].error) &&
+ (l_temp > G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].error) &&
+ (!g_amec->gpu[G_current_gpu_id].status.memOvertempError) )
+ {
+ g_amec->gpu[G_current_gpu_id].status.memOvertempError = TRUE;
+
+ INTR_TRAC_ERR("gpu_read_memory_temp: GPU%d memory OT! temp[%d]",
+ G_current_gpu_id, l_temp);
+
+ // Log an OT error
+ /* @
+ * @errortype
+ * @moduleid GPU_MID_GPU_READ_MEM_TEMP
+ * @reasoncode GPU_MEMORY_ERROR_TEMP
+ * @userdata1 GPU ID
+ * @userdata2 GPU memory temperature
+ * @userdata4 OCC_NO_EXTENDED_RC
+ * @devdesc GPU memory has reached error temperature
+ *
+ */
+ errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
+ GPU_MEMORY_ERROR_TEMP,
+ OCC_NO_EXTENDED_RC,
+ ERRL_SEV_PREDICTIVE,
+ NULL,
+ DEFAULT_TRACE_SIZE,
+ G_current_gpu_id,
+ l_temp);
+
+ // Callout the over temperature procedure
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_COMPONENT_ID,
+ ERRL_COMPONENT_ID_OVER_TEMPERATURE,
+ ERRL_CALLOUT_PRIORITY_HIGH);
+
+ // Callout the GPU if have sensor ID for it
+ if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
+ {
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
+ ERRL_CALLOUT_PRIORITY_MED);
+ }
+
+ // Commit Error
+ commitErrl(&l_err);
+
+ } // if OT error
+
+ // Done with this GPU ready to move to new one
+ L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+ l_complete = TRUE;
+ break;
+
+ default:
+ INTR_TRAC_ERR("gpu_read_memory_temp_sm: INVALID STATE: 0x%02X", L_read_temp_state);
+ L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
+ l_complete = TRUE;
+ break;
+ } // switch L_read_temp_state
+
+ if(L_scheduled)
+ {
+ GPU_DBG("gpu_read_memory_temp_sm: Scheduled read temp state 0x%02X at tick %d",
+ L_read_temp_state, GPU_TICK);
+ }
+ else if(!l_complete) // if not complete there must have been a failure on the schedule
+ {
+ INTR_TRAC_ERR("gpu_read_memory_temp_sm: failed to schedule state 0x%02X", L_read_temp_state);
+ }
+
+ } // if async_request_is_idle
+ else
+ {
+ INTR_TRAC_ERR("gpu_read_memory_temp_sm: NOT idle for state 0x%02X", L_read_temp_state);
+ }
+
+ return l_complete;
+} // end gpu_read_memory_temp_sm()
+
+
+// Function Specification
+//
+// Name: gpu_sm_handle_idle_state
+//
+// Description: Called when GPU SM is idle to determine what state (if any) should
+// be done next
+// End Function Specification
+bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_needed)
+{
+ bool l_new_state = FALSE; // return TRUE if there is a new state for GPU communication
+ uint8_t l_gpu_id = 0;
+
+ do
+ {
+ // Check for next state in order of priority
+
+ // 1. Need to set a power limit on a GPU?
+ l_gpu_id = gpu_id_need_set_power_limit();
+ if(l_gpu_id != 0xFF)
+ {
+ // Found a GPU that needs a power limit set
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_SET_PWR_LIMIT;
+ l_new_state = TRUE;
+ break;
+ }
+
+ // 2. check if Host needs lock
+ if (!check_and_update_i2c_lock(GPU_I2C_ENGINE))
+ {
+ // We don't own the lock anymore
+ // can't do anything until we get ownership back
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ l_new_state = FALSE;
+ break;
+ }
+
+ // 3. Need to check if driver is loaded?
+ l_gpu_id = gpu_id_need_driver_check();
+ if(l_gpu_id != 0xFF)
+ {
+ // Found a GPU that needs driver checked
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_CHECK_DRIVER_LOADED;
+ l_new_state = TRUE;
+ break;
+ }
+
+ // 4. Need to read power limits?
+ l_gpu_id = gpu_id_need_power_limits();
+ if(l_gpu_id != 0xFF)
+ {
+ // Found a GPU that needs power limits read
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_READ_PWR_LIMIT;
+ l_new_state = TRUE;
+ break;
+ }
+
+ // 5. Need to read memory temps?
+ if(i_mem_temp_needed)
+ {
+ // first check if there is a GPU that needs memory temp capability checked
+ l_gpu_id = gpu_id_need_memory_temp_capability_check();
+ if(l_gpu_id != 0xFF)
+ {
+ // Determine memory temp capability for this GPU
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_CHECK_MEM_TEMP_CAPABLE;
+ l_new_state = TRUE;
+ break;
+ }
+ else
+ {
+ // memory temp capability checking is done; start reading memory temp from capable GPUs
+ l_gpu_id = get_first_mem_temp_capable_gpu();
+ if(l_gpu_id != 0xFF)
+ {
+ // Read memory temp for this GPU
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_READ_MEMORY_TEMP;
+ l_new_state = TRUE;
+ break;
+ }
+ }
+ }
+
+ // 6. Time to start new temperature reads?
+ if(i_read_temp_start_needed)
+ {
+ // Start reading core temp from first present and functional GPU
+ l_gpu_id = get_first_gpu();
+ if(l_gpu_id != 0xFF)
+ {
+ // Read core temp for this GPU
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_READ_TEMP;
+ l_new_state = TRUE;
+ break;
+ }
+ else // no functional GPUs
+ {
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ G_gpu_monitoring_allowed = FALSE;
+ l_new_state = FALSE; // No new state for GPU communication
+ break;
+ }
+ }
+
+ // Else nothing stay idle
+ }while(0);
+
+ return l_new_state;
+}
+
+// Function Specification
+//
+// Name: task_gpu_sm
+//
+// Description: GPU State Machine - Called from tick table to manage GPUs
+//
+// Task Flags: RTL_FLAG_ACTIVE
+//
+// End Function Specification
+void task_gpu_sm(struct task *i_self)
+{
+ bool l_start_next_state = FALSE;
+ bool l_next_state = FALSE;
+ uint8_t l_gpu_id = 0;
+
+ static bool L_occ_owns_lock = FALSE;
+ static bool L_gpu_first_run = TRUE;
+ static uint16_t L_numCallsForTempRead = 0; // # of calls since last temp read was started
+ static bool L_read_temp_start_needed = FALSE; // set to true when it is time to start reading GPU temps
+ static bool L_mem_temp_needed = FALSE; // set to true after reading GPU core temps to read GPU memory temps
+
+ // GPU monitoring is enabled if GPUs are present and will be disabled if no GPUs
+ // are functional or GPU I2C interface is broken
+ if(G_gpu_monitoring_allowed)
+ {
+ // Initialize the IPC commands if this is our first run
+ if(L_gpu_first_run)
+ {
+ gpu_ipc_init();
+ G_gpu_sm_start_time = ssx_timebase_get(); // used for timeout establishing comm
+ L_gpu_first_run = FALSE;
+ }
+
+ // Check if time to start reading temperatures
+ // GPU temperatures (core and memory) are only used for fan control, which happens every 1s
+ // so there is no need to read the GPU temperatures any faster than every 1s
+ if(!L_read_temp_start_needed)
+ {
+ L_numCallsForTempRead++;
+ if(L_numCallsForTempRead >= GPU_TEMP_READ_1S)
+ {
+ L_read_temp_start_needed = TRUE;
+ }
+ }
+
+ // make sure OCC owns the lock in order to send commands to the GPU
+ if( (L_occ_owns_lock == FALSE) || (G_gpu_state == GPU_STATE_NO_LOCK) )
+ {
+ // Check if host gave up the I2C lock
+ L_occ_owns_lock = check_and_update_i2c_lock(GPU_I2C_ENGINE);
+ if (L_occ_owns_lock)
+ {
+ // We now own the lock; start with the reset and init state
+ G_gpu_state = GPU_STATE_RESET;
+ }
+ else
+ {
+ // Don't own the lock; can't do anything this time
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ }
+ }
+
+ // Process GPE response for what was scheduled on the last call
+ // and if that state finished schedule GPE job to start next state
+ // This means that this state machine can be run twice
+ do
+ {
+ if(l_start_next_state)
+ {
+ // This is the start of the 2nd pass through the state machine; set the flag so we don't go through here a 3rd time
+ l_next_state = TRUE;
+ }
+
+ // make sure previous action didn't disable GPU monitoring
+ if(!G_gpu_monitoring_allowed)
+ {
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ L_occ_owns_lock = FALSE;
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ }
+
+ switch(G_gpu_state)
+ {
+ case GPU_STATE_RESET:
+ // Call the GPU Reset SM
+ if (gpu_reset_sm())
+ {
+ // Reset complete and GPUs are ready for communication
+ // Start first with reading core temp of first functional GPU
+ L_numCallsForTempRead = 0; // to track start of next temp reading in 1s
+ L_read_temp_start_needed = FALSE; // start is no longer needed
+ l_gpu_id = get_first_gpu();
+ if(l_gpu_id != 0xFF)
+ {
+ // Read core temp for this GPU
+ G_current_gpu_id = l_gpu_id;
+ G_gpu_state = GPU_STATE_READ_TEMP;
+ l_start_next_state = TRUE;
+ }
+ else // no functional GPUs
+ {
+ // release I2C lock to the host for this engine and stop monitoring
+ occ_i2c_lock_release(GPU_I2C_ENGINE);
+ L_occ_owns_lock = FALSE;
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ G_gpu_monitoring_allowed = FALSE;
+ l_start_next_state = FALSE;
+ }
+ }
+
+ break;
+
+ case GPU_STATE_READ_TEMP:
+ // Call the read core GPU temperature SM for the current GPU being processed
+ if(gpu_read_temp_sm())
+ {
+ // Temp read complete for this GPU, move to next GPU
+ // or memory temps if all GPU core temps were read
+ l_gpu_id = get_next_gpu();
+ if(l_gpu_id == 0xFF)
+ {
+ // Done reading core temps, now read GPU memory temps
+ // set state to IDLE first to check if a higher priority
+ // action is needed before starting to read memory temps
+ L_mem_temp_needed = TRUE;
+ G_gpu_state = GPU_STATE_IDLE;
+ }
+ else
+ {
+ // Stay in temperature read state and read temp for next GPU
+ G_current_gpu_id = l_gpu_id;
+ }
+
+ l_start_next_state = TRUE;
+ }
+
+ break;
+
+ case GPU_STATE_READ_MEMORY_TEMP:
+ // Call the read GPU memory temperature SM for the current GPU being processed
+ if(gpu_read_memory_temp_sm())
+ {
+ // Temp read complete for this GPU, move to next GPU
+ // or idle if all GPU memory temps were read
+ l_gpu_id = get_next_mem_temp_capable_gpu();
+ if(l_gpu_id == 0xFF)
+ {
+ // Done reading memory temps
+ G_gpu_state = GPU_STATE_IDLE;
+ }
+ else
+ {
+ // Stay in memory read state and read memory temp for next GPU
+ G_current_gpu_id = l_gpu_id;
+ }
+
+ l_start_next_state = TRUE;
+ }
+
+ break;
+
+ case GPU_STATE_CHECK_MEM_TEMP_CAPABLE:
+ // Check if current GPU has memory temperature capability
+ if(gpu_read_mem_temp_capability_sm())
+ {
+ // Capability check complete for this GPU, go to IDLE state
+ // to let IDLE SM decide what to do next
+ g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
+ G_gpu_state = GPU_STATE_IDLE;
+ l_start_next_state = TRUE;
+ }
+ break;
+
+ case GPU_STATE_CHECK_DRIVER_LOADED:
+ // Check if driver is loaded for current GPU
+ if(1) // TODO
+ {
+ // Driver check complete for this GPU, go to IDLE state
+ // to let IDLE SM decide what to do next
+ g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE;
+ g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE;
+ G_gpu_state = GPU_STATE_IDLE;
+ l_start_next_state = TRUE;
+ }
+ break;
+
+ case GPU_STATE_READ_PWR_LIMIT:
+ // Read power limits for current GPU
+ if(1) // TODO
+ {
+ // Read power limits complete for this GPU, go to IDLE state
+ // to let IDLE SM decide what to do next
+ g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
+ G_gpu_state = GPU_STATE_IDLE;
+ l_start_next_state = TRUE;
+ }
+ break;
+
+ case GPU_STATE_SET_PWR_LIMIT:
+ // Set power limit on current GPU
+ if(1) // TODO
+ {
+ // Set power limit complete for this GPU, go to IDLE state
+ // to let IDLE SM decide what to do next
+ G_gpu_state = GPU_STATE_IDLE;
+ l_start_next_state = TRUE;
+ }
+ break;
+
+ case GPU_STATE_NO_LOCK:
+ // Host owns the I2C engine. Need to wait until we get the lock
+ l_start_next_state = FALSE;
+ break;
+
+ default:
+ // Nothing happened on last call
+ G_gpu_state = GPU_STATE_IDLE;
+ break;
+ } // switch G_gpu_state
+
+ // check if the previous action requires a reset
+ if(G_gpu_i2c_reset_required)
+ {
+ G_gpu_i2c_reset_required = FALSE;
+ G_gpu_state = GPU_STATE_RESET;
+ l_start_next_state = TRUE;
+ break;
+ }
+ else if(G_gpu_state == GPU_STATE_IDLE)
+ {
+ // time to decide what to do next
+ l_start_next_state = gpu_sm_handle_idle_state(L_read_temp_start_needed, L_mem_temp_needed);
+ if(l_start_next_state)
+ {
+ if(G_gpu_state == GPU_STATE_READ_TEMP)
+ {
+ // new state is to read core temps; reset the temperature reading timer
+ L_numCallsForTempRead = 0;
+ L_read_temp_start_needed = FALSE; // start no longer needed
+ }
+ else if(G_gpu_state == GPU_STATE_READ_MEMORY_TEMP)
+ {
+ // new state is to start reading memory temps; clear the mem temp needed flag
+ L_mem_temp_needed = FALSE;
+ }
+ }
+ }
+ }while((l_start_next_state) && (!l_next_state));
+ } // GPU monitoring enabled
+} // end task_gpu_sm()
diff --git a/src/occ_405/gpu/gpu.h b/src/occ_405/gpu/gpu.h
new file mode 100644
index 0000000..91d081b
--- /dev/null
+++ b/src/occ_405/gpu/gpu.h
@@ -0,0 +1,100 @@
+/* IBM_PROLOG_BEGIN_TAG */
+/* This is an automatically generated prolog. */
+/* */
+/* $Source: src/occ_405/gpu/gpu.h $ */
+/* */
+/* OpenPOWER OnChipController Project */
+/* */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
+/* [+] International Business Machines Corp. */
+/* */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
+/* implied. See the License for the specific language governing */
+/* permissions and limitations under the License. */
+/* */
+/* IBM_PROLOG_END_TAG */
+
+#ifndef _GPU_H
+#define _GPU_H
+
+#include <occ_common.h>
+#include <trac_interface.h>
+#include <errl.h>
+#include <rtls.h>
+#include "gpu_structs.h"
+
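+// GPU_TICK: assumed to be the index of the current tick within a MAX_NUM_TICKS
+// window, used to stagger GPU work across control loop ticks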
+#define GPU_TICK (CURRENT_TICK % MAX_NUM_TICKS)
+
+// States for the GPU state machine (task_gpu_sm)
+typedef enum
+{
+ GPU_STATE_RESET = 0x00, // Reset and initialize interface
+ GPU_STATE_READ_TEMP = 0x10, // Read GPU core temperature
+ GPU_STATE_READ_MEMORY_TEMP = 0x20, // Read GPU memory temperature
+ GPU_STATE_CHECK_MEM_TEMP_CAPABLE = 0x30, // Read memory temperature capability
+ GPU_STATE_CHECK_DRIVER_LOADED = 0x40, // Check if Driver loaded
+ GPU_STATE_READ_PWR_LIMIT = 0x50, // Read Power Limits
+ GPU_STATE_SET_PWR_LIMIT = 0x60, // Set Power Limit
+ GPU_STATE_IDLE = 0xFE, // Ok to schedule new task
+ GPU_STATE_NO_LOCK = 0xFF // Host owns, no communication allowed
+} gpuState_e;
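+// Note: each sub-state enum below extends its parent state's value range
+// (e.g. 0x1x for READ_TEMP, 0x2x for READ_MEMORY_TEMP, 0x3x for CHECK_MEM_TEMP_CAPABLE)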
+
+// States for the GPU reset state machine (gpu_reset_sm)
+typedef enum
+{
+ GPU_RESET_STATE_NEW = 0x01, // new reset attempt
+ GPU_RESET_STATE_RESET_MASTER = 0x02, // Reset master
+ GPU_RESET_STATE_RESET_SLAVE = 0x03, // Start of slave port 4 reset
+ GPU_RESET_STATE_RESET_SLAVE_WAIT = 0x04,
+ GPU_RESET_STATE_RESET_SLAVE_COMPLETE = 0x05,
+ GPU_RESET_STATE_INIT = 0x06,
+ GPU_RESET_STATE_INIT_COMPLETE = 0x07,
+} gpuResetState_e;
+
+// States for reading GPU core temperature (gpu_read_temp_sm)
+typedef enum
+{
+ GPU_STATE_READ_TEMP_NEW = 0x11, // new temp read
+ GPU_STATE_READ_TEMP_START = 0x12, // start write temp reg
+ GPU_STATE_READ_TEMP_STOP = 0x13, // stop write/begin read
+ GPU_STATE_READ_TEMP_READ = 0x14, // read temperature
+ GPU_STATE_READ_TEMP_COMPLETE = 0x15, // store temperature read
+} gpuReadTempState_e;
+
+// States for reading GPU memory temperature (gpu_read_mem_temp_sm)
+typedef enum
+{
+ GPU_STATE_READ_MEM_TEMP_NEW = 0x21,
+ GPU_STATE_READ_MEM_TEMP_START = 0x22,
+ GPU_STATE_READ_MEM_TEMP_STOP = 0x23,
+ GPU_STATE_READ_MEM_TEMP_READ = 0x24,
+ GPU_STATE_READ_MEM_TEMP_COMPLETE = 0x25,
+} gpuReadMemTempState_e;
+
+// States for checking GPU memory temperature capability (gpu_read_mem_temp_capability_sm)
+typedef enum
+{
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW = 0x31,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_START = 0x32,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_STOP = 0x33,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_READ = 0x34,
+ GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE = 0x35,
+} gpuReadMemTempCapableState_e;
+
+// GPU IPC initialization
+void gpu_ipc_init();
+
+// GPU state machine
+void task_gpu_sm(struct task *i_self);
+
+
+#endif //_GPU_H
diff --git a/src/occ_405/gpu/gpu_service_codes.h b/src/occ_405/gpu/gpu_service_codes.h
new file mode 100755
index 0000000..41cb3f9
--- /dev/null
+++ b/src/occ_405/gpu/gpu_service_codes.h
@@ -0,0 +1,44 @@
+/* IBM_PROLOG_BEGIN_TAG */
+/* This is an automatically generated prolog. */
+/* */
+/* $Source: src/occ_405/gpu/gpu_service_codes.h $ */
+/* */
+/* OpenPOWER OnChipController Project */
+/* */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
+/* [+] International Business Machines Corp. */
+/* */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
+/* implied. See the License for the specific language governing */
+/* permissions and limitations under the License. */
+/* */
+/* IBM_PROLOG_END_TAG */
+
+#ifndef _GPU_SERVICE_CODES_H_
+#define _GPU_SERVICE_CODES_H_
+
+#include <comp_ids.h>
+
+enum gpuModuleId
+{
+ GPU_MID_INIT = GPU_COMP_ID | 0x00,
+ GPU_MID_GPU_SM = GPU_COMP_ID | 0x01,
+ GPU_MID_MARK_GPU_FAILED = GPU_COMP_ID | 0x02,
+ GPU_MID_GPU_SCHED_REQ = GPU_COMP_ID | 0x03,
+ GPU_MID_GPU_SCHED_RSP = GPU_COMP_ID | 0x04,
+ GPU_MID_GPU_RESET_SM = GPU_COMP_ID | 0x05,
+ GPU_MID_GPU_READ_TEMP = GPU_COMP_ID | 0x06,
+ GPU_MID_GPU_READ_MEM_TEMP = GPU_COMP_ID | 0x07,
+ GPU_MID_GPU_READ_MEM_TEMP_CAPABLE = GPU_COMP_ID | 0x08,
+};
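+// These module IDs (GPU_COMP_ID | function number) identify the failing
+// function when GPU error logs are created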
+
+#endif /* #ifndef _GPU_SERVICE_CODES_H_ */
diff --git a/src/occ_405/img_defs.mk b/src/occ_405/img_defs.mk
index 9ef7a30..c68c91c 100644
--- a/src/occ_405/img_defs.mk
+++ b/src/occ_405/img_defs.mk
@@ -235,6 +235,7 @@ APP_INCLUDES = -I$(IMAGE_SRCDIR)/rtls \
-I$(IMAGE_SRCDIR)/amec \
-I$(IMAGE_SRCDIR)/cent \
-I$(IMAGE_SRCDIR)/dimm \
+ -I$(IMAGE_SRCDIR)/gpu \
-I$(IMAGE_SRCDIR)/mem \
-I$(IMAGE_SRCDIR)/lock \
-I$(IMAGE_SRCDIR)/wof \
diff --git a/src/occ_405/incl/comp_ids.h b/src/occ_405/incl/comp_ids.h
index e6270d6..b6e61a7 100755
--- a/src/occ_405/incl/comp_ids.h
+++ b/src/occ_405/incl/comp_ids.h
@@ -96,5 +96,9 @@
#define PGPE_COMP_ID 0x1200
#define PGPE_COMP_NAME "PGPE"
+// GPU Interface
+#define GPU_COMP_ID 0x1300
+#define GPU_COMP_NAME "GPU"
+
#endif
diff --git a/src/occ_405/incl/occ_common.h b/src/occ_405/incl/occ_common.h
index 626b744..d646442 100755
--- a/src/occ_405/incl/occ_common.h
+++ b/src/occ_405/incl/occ_common.h
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2011,2016 */
+/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -320,6 +320,10 @@ enum
#define DURATION_IN_MS_UNTIL_NOW_FROM(start_time) \
(uint32_t) ((ssx_timebase_get() - (SsxTimebase) start_time) / ( SSX_TIMEBASE_FREQUENCY_HZ / 1000 ))
+// Convert the duration from start_time until now (in SsxTimebase ticks) to seconds.
+#define DURATION_IN_S_UNTIL_NOW_FROM(start_time) \
+ (uint32_t) ((ssx_timebase_get() - (SsxTimebase) start_time) / SSX_TIMEBASE_FREQUENCY_HZ )
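+// Usage sketch (L_start_time is a hypothetical SsxTimebase captured earlier):
+//   uint32_t l_elapsed_sec = DURATION_IN_S_UNTIL_NOW_FROM(L_start_time);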
+
// Skip this typedef in x86 environment
#ifndef OCC_X86_PARSER
typedef uint32_t size_t ;
diff --git a/src/occ_405/occLinkInputFile b/src/occ_405/occLinkInputFile
index 123a0dc..97f2f9a 100644
--- a/src/occ_405/occLinkInputFile
+++ b/src/occ_405/occLinkInputFile
@@ -43,6 +43,7 @@ INPUT ( amec_amester.o
dpss.o
errl.o
ffdc.o
+ gpu.o
homer.o
ll_ffdc.o
lock.o
diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h
index b036921..310e76a 100644
--- a/src/occ_405/occ_service_codes.h
+++ b/src/occ_405/occ_service_codes.h
@@ -86,6 +86,14 @@ enum occReasonCode
PCAP_THROTTLE_POWER_LIMIT = 0x61,
/// Firmware Failure: equivalent to assertion failures
INTERNAL_FW_FAILURE = 0xA0,
+
+ /// Error with GPU tasks
+ GPU_FAILURE = 0xA1,
+ /// GPU core reached error threshold
+ GPU_ERROR_TEMP = 0xA2,
+ /// GPU memory reached error threshold
+ GPU_MEMORY_ERROR_TEMP = 0xA3,
+
/// Failure within the OCC Complex of the processor
INTERNAL_HW_FAILURE = 0xB0,
/// OCC GPE halted due to checkstop
@@ -135,6 +143,7 @@ enum occReasonCode
INVALID_FREQUENCY = 0xDE,
WOF_RE_ENABLED = 0xDF,
+
// NOTE: 0xE0 - 0xEF can NOT be used these are reserved for critical
// OCC errors. (H)TMGT will be looking for 0xEy ERRL_RC in cmd response RC
// and create an OCC error log with OCC component ID and 0xEy RC if found
@@ -272,6 +281,16 @@ enum occExtReasonCode
ERC_SMGR_NO_VALID_MODE_TRANSITION_CALL = 0x00E0,
ERC_SMGR_NO_VALID_STATE_TRANSITION_CALL = 0x00E1,
+ ERC_GPU_COMPLETE_FAILURE = 0x00F0,
+ ERC_GPU_SCHEDULE_FAILURE = 0x00F1,
+ ERC_GPU_RESET_FAILURE = 0x00F2,
+ ERC_GPU_RESET_TIMEOUT = 0x00F3,
+ ERC_GPU_READ_TEMP_TIMEOUT = 0x00F4,
+ ERC_GPU_READ_MEM_TEMP_TIMEOUT = 0x00F5,
+ ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE = 0x00F6,
+ ERC_GPU_INVALID_GPU_OPERATION = 0x00F7,
+ ERC_GPU_NO_GPE_SUPPORT = 0x00FF,
+
ERC_STATE_FROM_ALL_TO_STB_FAILURE = 0x0123,
ERC_STATE_FROM_ACT_TO_CHR_FAILURE = 0x0124,
ERC_STATE_FROM_CHR_TO_ACT_FAILURE = 0x0125,
diff --git a/src/occ_405/sensor/sensor_enum.h b/src/occ_405/sensor/sensor_enum.h
index 4d2d483..62745d3 100755
--- a/src/occ_405/sensor/sensor_enum.h
+++ b/src/occ_405/sensor/sensor_enum.h
@@ -689,6 +689,9 @@ enum e_gsid
TEMPGPU0,
TEMPGPU1,
TEMPGPU2,
+ TEMPGPU0MEM,
+ TEMPGPU1MEM,
+ TEMPGPU2MEM,
// ------------------------------------------------------
// Partition Sensors
diff --git a/src/occ_405/sensor/sensor_info.c b/src/occ_405/sensor/sensor_info.c
index 99cd069..0592908 100755
--- a/src/occ_405/sensor/sensor_info.c
+++ b/src/occ_405/sensor/sensor_info.c
@@ -35,6 +35,7 @@
#define AMEEFP_16MS_IN_HZ AMEFP(625,-1) // 62.5 Hz
#define AMEEFP_32MS_IN_HZ AMEFP(3125,-2) // 31.25 Hz
#define AMEEFP_64MS_IN_HZ AMEFP(15625,-3) // 15.625 Hz
+#define AMEEFP_1S_IN_HZ AMEFP(1,0) // 1.0 Hz
#define AMEEFP_3S_IN_HZ AMEFP(333,-3) // 0.333 Hz
#define AMEFP_SCALE_0_16384 AMEFP(610352,-8) // scalar so that digital 16384=100%
@@ -376,10 +377,13 @@ const sensor_info_t G_sensor_info[] =
SENSOR_INFO_T_ENTRY( TEMPCENT, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_8TH_TICK_HZ, AMEFP( 1, 0) ),
SENSOR_INFO_T_ENTRY( TEMPDIMMTHRM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_MEM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_128TH_TICK_HZ, AMEFP( 1, 0) ),
- /* ==GPUSensors== NameString Units Type Location Number Freq ScaleFactor */
- SENSOR_INFO_T_ENTRY( TEMPGPU0, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_EVERY_128TH_TICK_HZ, AMEFP( 1, 0) ),
- SENSOR_INFO_T_ENTRY( TEMPGPU1, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_EVERY_128TH_TICK_HZ, AMEFP( 1, 0) ),
- SENSOR_INFO_T_ENTRY( TEMPGPU2, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_EVERY_128TH_TICK_HZ, AMEFP( 1, 0) ),
+ /* ==GPUSensors== NameString Units Type Location Number Freq ScaleFactor */
+ SENSOR_INFO_T_ENTRY( TEMPGPU0, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_1S_IN_HZ, AMEFP( 1, 0) ),
+ SENSOR_INFO_T_ENTRY( TEMPGPU1, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_1S_IN_HZ, AMEFP( 1, 0) ),
+ SENSOR_INFO_T_ENTRY( TEMPGPU2, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_1S_IN_HZ, AMEFP( 1, 0) ),
+ SENSOR_INFO_T_ENTRY( TEMPGPU0MEM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_1S_IN_HZ, AMEFP( 1, 0) ),
+ SENSOR_INFO_T_ENTRY( TEMPGPU1MEM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_1S_IN_HZ, AMEFP( 1, 0) ),
+ SENSOR_INFO_T_ENTRY( TEMPGPU2MEM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_GPU, AMEC_SENSOR_NONUM, AMEEFP_1S_IN_HZ, AMEFP( 1, 0) ),
/* ==PartSummarySensors== NameString Units Type Location Number Freq ScaleFactor */
SENSOR_INFO_T_ENTRY( UTILSLCG000, "%\0", AMEC_SENSOR_TYPE_UTIL, AMEC_SENSOR_LOC_LPAR, AMEC_SENSOR_NONUM, AMEEFP_EVERY_8TH_TICK_HZ, AMEFP_SCALE_0_16384),
diff --git a/src/occ_405/sensor/sensor_table.c b/src/occ_405/sensor/sensor_table.c
index 11fffd9..b4128a3 100755
--- a/src/occ_405/sensor/sensor_table.c
+++ b/src/occ_405/sensor/sensor_table.c
@@ -424,9 +424,12 @@ const sensor_ptr_t G_amec_sensor_list[] =
// ------------------------------------------------------
// GPU Sensors
// ------------------------------------------------------
- SENSOR_PTR(TEMPGPU0, &g_amec_sys.proc[0].tempgpu0),
- SENSOR_PTR(TEMPGPU1, &g_amec_sys.proc[0].tempgpu1),
- SENSOR_PTR(TEMPGPU2, &g_amec_sys.proc[0].tempgpu2),
+ SENSOR_PTR(TEMPGPU0, &g_amec_sys.gpu[0].tempgpu),
+ SENSOR_PTR(TEMPGPU1, &g_amec_sys.gpu[1].tempgpu),
+ SENSOR_PTR(TEMPGPU2, &g_amec_sys.gpu[2].tempgpu),
+ SENSOR_PTR(TEMPGPU0MEM, &g_amec_sys.gpu[0].tempgpumem),
+ SENSOR_PTR(TEMPGPU1MEM, &g_amec_sys.gpu[1].tempgpumem),
+ SENSOR_PTR(TEMPGPU2MEM, &g_amec_sys.gpu[2].tempgpumem),
// ------------------------------------------------------
// Partition Sensors
@@ -620,6 +623,9 @@ const minisensor_ptr_t G_amec_mini_sensor_list[] INIT_SECTION =
MINI_SENSOR_PTR( TEMPGPU0, NULL),
MINI_SENSOR_PTR( TEMPGPU1, NULL),
MINI_SENSOR_PTR( TEMPGPU2, NULL),
+ MINI_SENSOR_PTR( TEMPGPU0MEM, NULL),
+ MINI_SENSOR_PTR( TEMPGPU1MEM, NULL),
+ MINI_SENSOR_PTR( TEMPGPU2MEM, NULL),
// ------------------------------------------------------
// Partition Sensors
diff --git a/src/occ_405/topfiles.mk b/src/occ_405/topfiles.mk
index faae172..78a7c3a 100644
--- a/src/occ_405/topfiles.mk
+++ b/src/occ_405/topfiles.mk
@@ -65,6 +65,7 @@ TOP-C-SOURCES = amec/amec_analytics.c \
dimm/dimm.c \
dimm/dimm_control.c \
errl/errl.c \
+ gpu/gpu.c \
homer.c \
lock/lock.c \
main.c \
diff --git a/src/occ_gpe1/ipc_func_tables.c b/src/occ_gpe1/ipc_func_tables.c
index 0e43fad..d694e3e 100644
--- a/src/occ_gpe1/ipc_func_tables.c
+++ b/src/occ_gpe1/ipc_func_tables.c
@@ -23,14 +23,38 @@
/* */
/* IBM_PROLOG_END_TAG */
#include "ipc_api.h"
+#include "ipc_async_cmd.h"
#include "gpe1_dimm.h"
+#include "gpu_structs.h"
void gpe_dimm_control(ipc_msg_t* cmd, void* arg);
void gpe1_nop(ipc_msg_t* cmd, void* arg);
void gpe_reset_mem_deadman(ipc_msg_t* cmd, void* arg);
void gpe_24x7(ipc_msg_t* cmd, void* arg);
void gpe_mem_power_control(ipc_msg_t* cmd, void* arg);
+void gpe_gpu_sm(ipc_msg_t* cmd, void* arg)
+{
+ // No GPU support. The 405 should only be calling this on systems with OCC GPU
+ // support; those systems require a different OCC GPE1 image with GPU support.
+ // Being called here indicates an OCC image build issue.
+ // Return error so the 405 can log an error and disable GPU monitoring.
+ int rc;
+ ipc_async_cmd_t *async_cmd = (ipc_async_cmd_t*)cmd;
+ gpu_sm_args_t *args = (gpu_sm_args_t*)async_cmd->cmd_data;
+ // set error return code for no GPU support
+ args->error.rc = GPE_RC_NO_GPU_SUPPORT;
+ PK_TRACE("E>gpu_sm: No GPU support!");
+
+ // Send back an IPC response of success (the IPC operation itself succeeded);
+ // the 405 will handle the no-support rc set in the error field
+ rc = ipc_send_rsp(cmd, IPC_RC_SUCCESS);
+ if(rc)
+ {
+ PK_TRACE("E>gpu_sm: Failed to send response back. Halting GPE1", rc);
+ pk_halt();
+ }
+}
// Function table for multi target (common) functions
IPC_MT_FUNC_TABLE_START
@@ -52,7 +76,7 @@ IPC_HANDLER(gpe1_nop, 0) // 2 - IPC_ST_GPE1_NOP
IPC_HANDLER(gpe_reset_mem_deadman, 0) // 3 - IPC_ST_RESET_MEM_DEADMAN
IPC_HANDLER(gpe_24x7, 0) // 4 - IPC_ST_24_X_7_FUNCID
IPC_HANDLER(gpe_mem_power_control, 0) // 5 - IPC_ST_MEM_POWER_CONTROL_FUNCID
-IPC_HANDLER_DEFAULT // 6
+IPC_HANDLER(gpe_gpu_sm, 0) // 6 - IPC_ST_GPU_SM_FUNCID
IPC_HANDLER_DEFAULT // 7
IPC_HANDLER_DEFAULT // 8
IPC_HANDLER_DEFAULT // 9