/* IBM_PROLOG_BEGIN_TAG                                                   */
/* This is an automatically generated prolog.                             */
/*                                                                        */
/* $Source: src/occ_405/gpu/gpu.c $                                       */
/*                                                                        */
/* OpenPOWER OnChipController Project                                     */
/*                                                                        */
/* Contributors Listed Below - COPYRIGHT 2011,2017                        */
/* [+] International Business Machines Corp.                              */
/*                                                                        */
/*                                                                        */
/* Licensed under the Apache License, Version 2.0 (the "License");        */
/* you may not use this file except in compliance with the License.       */
/* You may obtain a copy of the License at                                */
/*                                                                        */
/*     http://www.apache.org/licenses/LICENSE-2.0                         */
/*                                                                        */
/* Unless required by applicable law or agreed to in writing, software    */
/* distributed under the License is distributed on an "AS IS" BASIS,      */
/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or        */
/* implied. See the License for the specific language governing           */
/* permissions and limitations under the License.                         */
/*                                                                        */
/* IBM_PROLOG_END_TAG                                                     */

#ifdef GPU_DEBUG
  #define GPU_DBG(frmt,args...)   DBG_PRINT(frmt,##args)
#else
  #define GPU_DBG(frmt,args...)
#endif

#include <ssx.h>
#include <occhw_async.h>

#include <trac_interface.h>
#include <trac.h>
#include <occ_common.h>
#include <comp_ids.h>
#include <occ_service_codes.h>
#include <state.h>
#include <occ_sys_config.h>
#include "sensor.h"
#include "amec_sys.h"
#include "lock.h"
#include "common.h"
#include "amec_health.h"
#include "gpu.h"
#include "gpu_structs.h"
#include "gpu_service_codes.h"
#include "i2c.h"

// Number calls with assumption the GPU SM task is called every other tick
#define GPU_TEMP_READ_1S  ( 1000000 / (MICS_PER_TICK * 2) )
#define GPU_TIMEOUT ( 10000000 / (MICS_PER_TICK *2) )

#define GPU_TICKS_TO_100MS ( 100000 / (MICS_PER_TICK * 2) )
#define GPU_TICKS_TO_1S ( 1000000 / (MICS_PER_TICK * 2) )

// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time
#define GPU_INIT_ERROR_COUNT 300  // approximately 300 seconds

#define MAX_CONSECUTIVE_GPU_RESETS      5
#define MAX_GPU_RESET_STATE_RETRY 3
#define MAX_RESET_STATE_NOT_DONE_COUNT 100
#define MAX_GPU_READ_ATTEMPT    3
#define GPU_ERRORS_BEFORE_I2C_RESET 5

// consecutive error counts for GPU command failures before error is logged if GPU is not in reset
#define GPU_CHECK_DRIVER_ERROR_COUNT 5
#define GPU_READ_MEM_CAP_ERROR_COUNT 5
#define GPU_READ_PWR_LIMIT_ERROR_COUNT 5
#define GPU_SET_PWR_LIMIT_ERROR_COUNT 5

#define GPU_I2C_ENGINE      PIB_I2C_ENGINE_C

extern data_cnfg_t * G_data_cnfg;
extern PWR_READING_TYPE  G_pwr_reading_type;

// this is the global GPU task sm state each task within the GPU SM may have its own "state"
// to allow several calls to complete the task
gpuState_e G_gpu_state = GPU_STATE_IDLE;

bool     G_gpu_monitoring_allowed = FALSE;   // Set to true if GPU is present
bool     G_gpu_i2c_reset_required = FALSE;
uint32_t G_gpu_reset_cause = 0;

// GPE Requests
GpeRequest G_gpu_op_request;
GpeRequest G_gpu_init_request;

// GPE arguments
GPE_BUFFER(gpu_sm_args_t  G_gpu_op_req_args);
GPE_BUFFER(gpu_init_args_t G_gpu_init_args);

gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};

uint8_t G_current_gpu_id = 0;   // ID 0..2 of GPU currently being processed

gpuTimingTable_t G_gpu_tick_times;

void update_gpu_tick_sensor(gpuTimingSensor_t *sensor, uint32_t ticks)
{
    if(ticks > sensor->max)
    {
        sensor->max = ticks;
    }

    if(ticks > GPU_TICKS_TO_1S)
    {
        sensor->count_1s++;
    }
    else if( (ticks > GPU_TICKS_TO_100MS) )
    {
        sensor->count_100ms++;
    }
    else
    {
        sensor->count_lt100ms++;
    }

    sensor->count++;
    sensor->accum += ticks;
    sensor->avg = sensor->accum / sensor->count;
}

// Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
uint8_t get_first_gpu(void)
{
    uint8_t first_gpu = 0xFF;  // default no GPUs present/functional
    uint8_t i = 0;

    for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
    {
        if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) )
        {
           first_gpu = i;
           break;
        }
    }
    return first_gpu;
}

// Get GPU number for next present non-failed GPU from G_current_gpu_id
// returns 0xFF if there is no next GPU i.e. wrapped back to first GPU
uint8_t get_next_gpu(void)
{
    uint8_t next_gpu = G_current_gpu_id;

    if(G_current_gpu_id != 0xFF)
    {
        do
        {
           if(++next_gpu == MAX_NUM_GPU_PER_DOMAIN)
           {
              next_gpu = 0;
           }
           if( (GPU_PRESENT(next_gpu)) && (!g_amec->gpu[next_gpu].status.disabled) )
           {
              break;
           }
        }while(next_gpu != G_current_gpu_id);
    }

    if(next_gpu == get_first_gpu())
    {
        next_gpu = 0xFF;
    }

    return next_gpu;
}

// Get GPU number for a GPU that needs to be checked if driver is loaded
// returns 0xFF if no GPU needs to be checked
uint8_t gpu_id_need_driver_check(void)
{
    uint8_t gpu_id = 0xFF;  // default none needs checking
    uint8_t i = 0;

    // checking for driver loaded can be repeated until driver is loaded which may never happen
    // to avoid infinite loop checking the same GPU over and over we will use a static and each call
    // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs
    static uint8_t L_current_gpu_id = 0;

    if(L_current_gpu_id == 0xFF)
    {
       // checked all GPUs once, do not check any this time and start over with GPU 0 on next call
       L_current_gpu_id = 0;
    }
    else
    {
       for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
       {
           // only check for driver after i2c comm (readOnce) has been established
           if((GPU_PRESENT(i)) && (!g_amec->gpu[i].status.disabled) &&
              (g_amec->gpu[i].status.readOnce) && (g_amec->gpu[i].status.checkDriverLoaded))
           {
              gpu_id = i;
              break;
           }
       }

       //  setup L_current_gpu_id for next call based on what is happening this time
       if(gpu_id == 0xFF)
       {
          // no GPU needs checking start back at 0 next time
          L_current_gpu_id = 0;
       }
       else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) )
       {
          // last GPU is having driver checked do not check any next time
          L_current_gpu_id = 0xFF;
       }
       else
       {
          // next time look at the next GPU ID first
          L_current_gpu_id = gpu_id + 1;
       }
    }

    return gpu_id;
}

uint8_t gpu_id_need_memory_temp_capability_check(void)
{
    uint8_t gpu_id = 0xFF;  // default none needs checking
    uint8_t i = 0;

    // checking for memory temp capability will be repeated until memory temp is capable which may never happen
    // to avoid infinite loop checking the same GPU over and over we will use a static and each call
    // will start looking at the next GPU, after all GPUs checked will allow none before re-checking all GPUs
    static uint8_t L_current_gpu_id = 0;

    if(L_current_gpu_id == 0xFF)
    {
       // checked all GPUs once, do not check any this time and start over with GPU 0 on next call
       L_current_gpu_id = 0;
    }
    else
    {
       for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
       {
           // driver must be loaded for memory temp capability
           if( (!g_amec->gpu[i].status.disabled) && (g_amec->gpu[i].status.driverLoaded) &&
               (g_amec->gpu[i].status.checkMemTempSupport) )
           {
              gpu_id = i;
              break;
           }
       }

       //  setup L_current_gpu_id for next call based on what is happening this time
       if(gpu_id == 0xFF)
       {
          // no GPU needs checking start back at 0 next time
          L_current_gpu_id = 0;
       }
       else if(gpu_id == (MAX_NUM_GPU_PER_DOMAIN - 1) )
       {
          // last GPU is having memory capability checked do not check any next time
          L_current_gpu_id = 0xFF;
       }
       else
       {
          // next time look at the next GPU ID first
          L_current_gpu_id = gpu_id + 1;
       }
    }
    return gpu_id;
}

// Find first functional GPU with memory temp capability
// returns 0xFF if no functional GPU has memory temp capability
uint8_t get_first_mem_temp_capable_gpu(void)
{
    uint8_t first_gpu = 0xFF;  // default no GPU with mem temp capability
    uint8_t i = 0;

    for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
    {
        if( (!g_amec->gpu[i].status.disabled) &&
            (g_amec->gpu[i].status.memTempSupported) )  // memTempSupported implies that driver is loaded
        {
           first_gpu = i;
           break;
        }
    }
    return first_gpu;
}

// Get GPU number for next functional GPU from G_current_gpu_id with mem temp capability
// returns 0xFF if there is no next GPU i.e. wrapped back to first GPU with mem temp
uint8_t get_next_mem_temp_capable_gpu(void)
{
    uint8_t next_gpu = G_current_gpu_id;

    if(G_current_gpu_id != 0xFF)
    {
        do
        {
           if(++next_gpu == MAX_NUM_GPU_PER_DOMAIN)
           {
              next_gpu = 0;
           }
           if( (!g_amec->gpu[next_gpu].status.disabled) &&
               (g_amec->gpu[next_gpu].status.memTempSupported) )  // memTempSupported implies that driver is loaded
           {
              break;
           }
        }while(next_gpu != G_current_gpu_id);
    }

    if(next_gpu == get_first_mem_temp_capable_gpu())
    {
        next_gpu = 0xFF;
    }
    else if( (next_gpu != 0xFF) && (!g_amec->gpu[next_gpu].status.memTempSupported) )
    {
        next_gpu = 0xFF;
    }

    return next_gpu;
}


// Get GPU number for a GPU that needs power limits read
// returns 0xFF if no GPU needs power limits read
uint8_t gpu_id_need_power_limits(void)
{
    uint8_t gpu_id = 0xFF;  // default none
    uint8_t i = 0;

    static uint8_t L_current_gpu_id = 0;

    if(0xFF == L_current_gpu_id)
    {
        // We attempted to read power limits for all GPUs
        // do not check any this time and start over with GPU 0 on next call
        L_current_gpu_id = 0;
    }
    else
    {
        for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
        {
            // to read power limits requires that the driver is loaded
            if( (g_amec->gpu[i].status.driverLoaded) &&
                (g_amec->gpu[i].pcap.check_pwr_limit))
            {
                // If there is no power capping support skip reading power limits
                if(G_pwr_reading_type == PWR_READING_TYPE_NONE)
                {
                    g_amec->gpu[i].pcap.check_pwr_limit = false;
                }
                else
                {
                    gpu_id = i;
                    break;
                }
            }
        }

        if(0xFF == gpu_id)
        {
            // We don't need to read power limits from any GPUs at the moment
            L_current_gpu_id = 0;
        }
        else if( (MAX_NUM_GPU_PER_DOMAIN - 1) == gpu_id)
        {
            // We're reading from last GPU, do not check any next time
            L_current_gpu_id = 0xFF;
        }
        else
        {
            // Next time look at next GPU ID first
            L_current_gpu_id = gpu_id + 1;
        }
    }

    return gpu_id;
}

// Get GPU number for a GPU that needs power limit set
// returns 0xFF if no GPU needs power limit set
uint8_t gpu_id_need_set_power_limit(void)
{
    uint8_t gpu_id = 0xFF;  // default none
    uint8_t i = 0;

    static uint8_t L_current_gpu_id = 0;

    if(0xFF == L_current_gpu_id)
    {
        // We've checked to see if all GPUs need a power cap set, start over
        // with GPU 0 next time
        L_current_gpu_id = 0;
    }
    else
    {
        for (i=L_current_gpu_id; i<MAX_NUM_GPU_PER_DOMAIN; i++)
        {
            // to set power limit requires that the driver is loaded and power limits were read
            if( (g_amec->gpu[i].status.driverLoaded) && (g_amec->gpu[i].pcap.pwr_limits_read) &&
                (!g_amec->gpu[i].pcap.set_failed) && (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != 0) &&
                (g_amec->gpu[i].pcap.gpu_desired_pcap_mw != g_amec->gpu[i].pcap.gpu_requested_pcap_mw) )
            {
               gpu_id = i;
               break;
            }
        }

        if(0xFF == gpu_id)
        {
            // no GPU needs checking start back at 0 next time
            L_current_gpu_id = 0;
        }
        else if( (MAX_NUM_GPU_PER_DOMAIN - 1) == gpu_id )
        {
            // Last GPU is being set do not check any next time
            L_current_gpu_id = 0xFF;
        }
        else
        {
            // next time look at the next GPU ID first
            L_current_gpu_id = gpu_id + 1;
        }
    }

    return gpu_id;
}

// For the given GPU clear status/data that requires GPU driver to be loaded
void clear_gpu_driver_status(uint8_t i_gpu_num)
{
    g_amec->gpu[i_gpu_num].status.checkDriverLoaded = false;
    g_amec->gpu[i_gpu_num].status.driverLoaded = false;

    // Reading memory temperature requires driver to be loaded.
    g_amec->gpu[i_gpu_num].status.checkMemTempSupport = false;
    g_amec->gpu[i_gpu_num].status.memTempSupported = false;
    g_amec->gpu[i_gpu_num].status.memErrorCount = 0;

    // Power capping requires driver to be loaded.  Clear GPU power limits
    g_amec->gpu[i_gpu_num].pcap.check_pwr_limit = false;
    g_amec->gpu[i_gpu_num].pcap.pwr_limits_read = false;
    g_amec->gpu[i_gpu_num].pcap.set_failed = false;
    g_amec->gpu[i_gpu_num].pcap.gpu_min_pcap_mw = 0;
    g_amec->gpu[i_gpu_num].pcap.gpu_max_pcap_mw = 0;
    g_amec->gpu[i_gpu_num].pcap.gpu_requested_pcap_mw = 0;
    g_amec->gpu[i_gpu_num].pcap.gpu_default_pcap_mw = 0;
    //amec will need to recalculate after power limits are read to handle any clipping with new GPU min/max
    g_amec->gpu[i_gpu_num].pcap.gpu_desired_pcap_mw = 0;
}

// Handles GPU not able to process request due to driver load or un-load
void handle_driver_change(void)
{
    // Clear out driver status while driver change completes and is determined if loaded/un-loaded
    clear_gpu_driver_status(G_current_gpu_id);

    // memory temp only available when driver is loaded. clear error and set not available
    g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
    g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;

    // when driver change is complete we must re-query to see if driver is loaded or not
    g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true;
}

// For all GPUs read GPU reset status and take action if reset status has changed
void update_gpu_reset_status(void)
{
    uint8_t gpu_num = 0;

    // GPU reset status is in the OCC FLAGS register and is updated by OPAL
    // Read the current reset status for all GPUs.  A reset status of '1' indicates NOT in reset
    ocb_occflg_t occ_flags = {0};
    occ_flags.value = in32(OCB_OCCFLG);
    bool not_in_reset[3] = {occ_flags.fields.gpu0_reset_status,
                            occ_flags.fields.gpu1_reset_status,
                            occ_flags.fields.gpu2_reset_status};

    // reset status of '0' (IN reset) is the default
    // the OCC will still try to read GPU when IN reset but will not log errors
    // this is so we still communicate with the GPUs without OPAL support to indicate
    // a GPU is not in reset.
    // Full OCC support below for when OPAL starts updating the reset status
    for (gpu_num=0; gpu_num<MAX_NUM_GPU_PER_DOMAIN; gpu_num++)
    {
        if(not_in_reset[gpu_num] != g_amec->gpu[gpu_num].status.notReset)
        {
            INTR_TRAC_IMP("update_gpu_reset_status: GPU%d NOT in reset is now = %d",
                           gpu_num,
                           not_in_reset[gpu_num]);

            // There has been a change to the reset status clear everything out except for errors logged so we don't log again
            clear_gpu_driver_status(gpu_num);
            g_amec->gpu[gpu_num].status.errorCount = 0;
            g_amec->gpu[gpu_num].status.retryCount = 0;

            // readOnce of false will force comm to be established and once established then checkDriverLoaded will get set
            g_amec->gpu[gpu_num].status.readOnce = false;

            if(not_in_reset[gpu_num])
            {
                // GPU was taken out of reset clear disabled to allow communication again
                g_amec->gpu[gpu_num].status.disabled = false;
            }
            else
            {
                // GPU was put in reset.  Clear temperature sensor errors and set to not available
                g_amec->gpu[gpu_num].status.coreTempFailure = false;
                g_amec->gpu[gpu_num].status.coreTempNotAvailable = true;
                g_amec->gpu[gpu_num].status.memTempFailure = false;
                g_amec->gpu[gpu_num].status.memTempNotAvailable = true;
            }

            g_amec->gpu[gpu_num].status.notReset = not_in_reset[gpu_num];

        }  // if GPU reset status changed
    }  // for each GPU
}  // end update_gpu_reset_status()

// Disable GPU monitoring for all GPUs
void disable_all_gpus(void)
{
    uint8_t i = 0;

    // release I2C lock to the host for this engine and stop monitoring
    occ_i2c_lock_release(GPU_I2C_ENGINE);
    G_gpu_monitoring_allowed = FALSE;

    // mark all GPUs as disabled
    for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
    {
        g_amec->gpu[i].status.disabled = TRUE;
    }
}


// schedule request to init gpu info on gpe1
void schedule_gpe_gpu_init_req()
{
    errlHndl_t err = NULL;
    int rc = 0;

    memset(&G_gpu_init_args, 0, sizeof(G_gpu_init_args));
    // Need to add gpu i2c info
    // G_gpu_init_args.gpu_i2c


    rc = gpe_request_schedule(&G_gpu_init_request);
    if (rc)
    {
        INTR_TRAC_ERR
            ("schedule_gpe_gpu_init_req: gpe gpu init schedule failed w/rc=0x%08X", rc);
        /*
         * @errortype
         * @moduleid    GPU_MID_GPE_GPU_INIT_SCHED_REQ
         * @reasoncode  SSX_GENERIC_FAILURE
         * @userdata1   GPE schedule returned code
         * @userdata4   ERC_GPU_SCHEDULE_FAILURE
         * @devdesc     Failed to schedule GPE GPU initial request
         */
        err = createErrl(GPU_MID_GPE_GPU_INIT_SCHED_REQ,
                         SSX_GENERIC_FAILURE,
                         ERC_GPU_SCHEDULE_FAILURE,
                         ERRL_SEV_PREDICTIVE,
                         NULL,
                         DEFAULT_TRACE_SIZE,
                         rc,
                         0);
        commitErrl(&err);

        // release I2C lock to the host for this engine and stop monitoring
        occ_i2c_lock_release(GPU_I2C_ENGINE);
        G_gpu_monitoring_allowed = FALSE;
    }

}


// Create GPU IPC requests
void gpu_ipc_init()
{
    errlHndl_t l_err = NULL;
    int rc = 0;

    do
    {
        // Initialize IPC request for GPU operation requests
        GPU_DBG("gpu_ipc_init: Creating GPE1 IPC request for GPU op requests");
        rc = gpe_request_create(&G_gpu_op_request,
                                &G_async_gpe_queue1,
                                IPC_ST_GPU_SM_FUNCID,
                                &G_gpu_op_req_args,
                                SSX_WAIT_FOREVER,
                                NULL, // no callback/arg
                                NULL,
                                ASYNC_CALLBACK_IMMEDIATE);
        if (rc)
        {
            TRAC_ERR("gpu_ipc_init: Failed to create GPE1 IPC request for GPU op req (rc=%d)", rc);
            break;
        }

        // Initialize GPU support on GPE1
        GPU_DBG("gpu_ipc_init: Creating GPE1 IPC request for GPU initialization");
        rc = gpe_request_create(&G_gpu_init_request,
                                &G_async_gpe_queue1,
                                IPC_ST_GPE_GPU_INIT_FUNCID,
                                &G_gpu_init_args,
                                SSX_WAIT_FOREVER,
                                NULL, // no callback
                                NULL, // no args
                                ASYNC_CALLBACK_IMMEDIATE);
        if (rc)
        {
            TRAC_ERR("gpu_ipc_init: Failed to create GPE1 GPU init request. (rc=%d)", rc);
            break;
        }
    }
    while(0);

    if (rc)
    {
        /* @
         * @errortype
         * @moduleid    GPU_MID_INIT
         * @reasoncode  SSX_GENERIC_FAILURE
         * @userdata1   return code
         * @userdata4   OCC_NO_EXTENDED_RC
         * @devdesc     Failed to create GPE1 GPU IPC request
         */
        l_err = createErrl(GPU_MID_INIT,
                           SSX_GENERIC_FAILURE,
                           OCC_NO_EXTENDED_RC,
                           ERRL_SEV_PREDICTIVE,
                           NULL, // trace buffer
                           DEFAULT_TRACE_SIZE,
                           rc,
                           0);

        REQUEST_RESET(l_err);

        // release I2C lock to the host for this engine and stop monitoring
        occ_i2c_lock_release(GPU_I2C_ENGINE);
        G_gpu_monitoring_allowed = FALSE;
    }
    else
    {
        // if (redundant ps policy is set)
        if (G_sysConfigData.system_type.non_redund_ps == false)
        {
            // gpe gpu init only needs to be done once, so do it here.
            schedule_gpe_gpu_init_req();
        }
    }
}

// Called after a failure reading core temp for a specified GPU.  The error will
// be counted and if threshold is reached, an error will be created with
// the GPU as a callout if the GPU is not in reset
void mark_gpu_failed(const gpu_sm_args_t *i_arg)
{
    uint32_t gpu_id = i_arg->gpu_id;

    do
    {
        if((false == g_amec->gpu[gpu_id].status.disabled) &&
           (true == g_amec->gpu[gpu_id].status.readOnce))
        {
            GPU_DBG("mark_gpu_failed: GPU%d failed in op/rc/count=0x%06X "
                    "(ffdc 0x%08X%08X)",
                    gpu_id, (i_arg->operation << 16) | (i_arg->error.rc << 8) | g_amec->gpu[gpu_id].status.errorCount,
                    WORD_HIGH(i_arg->error.ffdc), WORD_LOW(i_arg->error.ffdc));
        }

        // Always inc retry count for I2C reset regardless of if GPU is in reset or not
        g_amec->gpu[gpu_id].status.retryCount++;

        // Only inc error count if it is known that GPU is NOT in reset
        // NOTE: Default is IN reset so this will only be true when OPAL/OS supports telling the OCC reset status
        // if OS never tells the OCC reset status the OCC will never disable or log a comm error
        if(g_amec->gpu[gpu_id].status.notReset)
        {
            // INC count and check if reached error threshold
            if( ++g_amec->gpu[gpu_id].status.errorCount > GPU_INIT_ERROR_COUNT)
            {
               // set that GPU temperature readings failed
               g_amec->gpu[gpu_id].status.memTempFailure = true;
               g_amec->gpu[gpu_id].status.memTempNotAvailable = true;
               g_amec->gpu[gpu_id].status.coreTempFailure = true;
               g_amec->gpu[gpu_id].status.coreTempNotAvailable = true;

               // Disable this GPU.  GPU will get re-enabled if detected that GPU is put in reset and then taken out
               g_amec->gpu[gpu_id].status.disabled = true;

               INTR_TRAC_ERR("mark_gpu_failed: disabling GPU%d due to %d consecutive errors (op=%d)",
                             gpu_id, g_amec->gpu[gpu_id].status.errorCount, i_arg->operation);

               if(g_amec->gpu[gpu_id].status.commErrorLogged == false)
               {

                  errlHndl_t l_err = NULL;
                  /*
                   * @errortype
                   * @moduleid    GPU_MID_MARK_GPU_FAILED
                   * @reasoncode  GPU_FAILURE
                   * @userdata1   GPE returned rc code
                   * @userdata4   ERC_GPU_COMPLETE_FAILURE
                   * @devdesc     GPU failure
                   */
                  l_err = createErrl(GPU_MID_MARK_GPU_FAILED,
                                     GPU_FAILURE,
                                     ERC_GPU_COMPLETE_FAILURE,
                                     ERRL_SEV_PREDICTIVE,
                                     NULL,
                                     DEFAULT_TRACE_SIZE,
                                     i_arg->error.rc,
                                     0);
                  addUsrDtlsToErrl(l_err,
                                   (uint8_t*)&i_arg->error.ffdc,
                                   sizeof(i_arg->error.ffdc),
                                   ERRL_STRUCT_VERSION_1,
                                   ERRL_USR_DTL_BINARY_DATA);

                  // Callout the GPU if have sensor ID for it
                  if(G_sysConfigData.gpu_sensor_ids[gpu_id])
                  {
                     addCalloutToErrl(l_err,
                                      ERRL_CALLOUT_TYPE_GPU_ID,
                                      G_sysConfigData.gpu_sensor_ids[gpu_id],
                                      ERRL_CALLOUT_PRIORITY_MED);
                  }

                  commitErrl(&l_err);
                  g_amec->gpu[gpu_id].status.commErrorLogged = true;

               } // if !commErrorLogged

            } // if errorCount > threshold

        } // if notReset

    } while(0);

    // Do an I2C reset if reached retry count
    // don't want to do I2C reset every time since could be that this GPU really is in reset and
    // while resetting I2C we are unable to read other GPUs that may not be in reset
    if( g_amec->gpu[gpu_id].status.retryCount > GPU_ERRORS_BEFORE_I2C_RESET)
    {
       g_amec->gpu[gpu_id].status.retryCount = 0;
       G_gpu_i2c_reset_required = true;
       G_gpu_reset_cause = gpu_id<<24 | (i_arg->error.rc & 0xFFFF);
    }

} // end mark_gpu_failed()

// Schedule a GPE request for GPU operation
bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)
{
    bool l_scheduled = false;
    bool scheduleRequest = true;
    errlHndl_t err = NULL;

    GPU_DBG(">>schedule_gpu_req(op 0x%02X)", i_operation);

    if (!async_request_is_idle(&G_gpu_op_request.request))
    {
        INTR_TRAC_INFO("E>schedule_gpu_req: prior request (op 0x%02X) not idle when scheduling 0x%02X (tick=%d)",
                       G_gpu_op_req_args.operation, i_operation, GPU_TICK);
    }
    else
    {
        // Ready for next request
        G_gpu_op_req_args = i_new_args;
        switch(i_operation)
        {
            // Init
            case GPU_REQ_INIT:
                break;

            // Read GPU memory temp capability
            case GPU_REQ_READ_CAPS_START:
            case GPU_REQ_READ_CAPS_2:
            case GPU_REQ_READ_CAPS_3:
            case GPU_REQ_READ_CAPS_FINISH:
                break;

            // Read GPU core temp
            case GPU_REQ_READ_TEMP_START:
            case GPU_REQ_READ_TEMP_FINISH:
                break;

            // Read GPU memory temp
            case GPU_REQ_READ_MEM_TEMP_START:
            case GPU_REQ_READ_MEM_TEMP_2:
            case GPU_REQ_READ_MEM_TEMP_3:
            case GPU_REQ_READ_MEM_TEMP_FINISH:
                break;

            // Check if driver is loaded
            case GPU_REQ_CHECK_DRIVER_START:
            case GPU_REQ_CHECK_DRIVER_2:
            case GPU_REQ_CHECK_DRIVER_3:
            case GPU_REQ_CHECK_DRIVER_FINISH:
                break;

            // Read GPU Power Limit
            case GPU_REQ_GET_PWR_LIMIT_1_START:
            case GPU_REQ_GET_PWR_LIMIT_1_2:
            case GPU_REQ_GET_PWR_LIMIT_1_3:
            case GPU_REQ_GET_PWR_LIMIT_1_FINISH:
            case GPU_REQ_GET_PWR_LIMIT_2_START:
            case GPU_REQ_GET_PWR_LIMIT_2_2:
            case GPU_REQ_GET_PWR_LIMIT_2_FINISH:
            case GPU_REQ_GET_PWR_LIMIT_3_START:
            case GPU_REQ_GET_PWR_LIMIT_3_2:
            case GPU_REQ_GET_PWR_LIMIT_3_3:
            case GPU_REQ_GET_PWR_LIMIT_3_FINISH:
            case GPU_REQ_GET_PWR_LIMIT_4_START:
            case GPU_REQ_GET_PWR_LIMIT_4_2:
            case GPU_REQ_GET_PWR_LIMIT_4_3:
            case GPU_REQ_GET_PWR_LIMIT_4_FINISH:
            case GPU_REQ_GET_PWR_LIMIT_5_START:
            case GPU_REQ_GET_PWR_LIMIT_5_2:
            case GPU_REQ_GET_PWR_LIMIT_5_3:
            case GPU_REQ_GET_PWR_LIMIT_5_FINISH:
                break;

            // Set GPU Power Limit
            case GPU_REQ_SET_PWR_LIMIT_1_START:
            case GPU_REQ_SET_PWR_LIMIT_1_1:
            case GPU_REQ_SET_PWR_LIMIT_1_2:
            case GPU_REQ_SET_PWR_LIMIT_1_FINISH:
            case GPU_REQ_SET_PWR_LIMIT_2_START:
            case GPU_REQ_SET_PWR_LIMIT_2_1:
            case GPU_REQ_SET_PWR_LIMIT_2_2:
            case GPU_REQ_SET_PWR_LIMIT_2_FINISH:
            case GPU_REQ_SET_PWR_LIMIT_3_START:
            case GPU_REQ_SET_PWR_LIMIT_3_2:
            case GPU_REQ_SET_PWR_LIMIT_3_3:
            case GPU_REQ_SET_PWR_LIMIT_3_FINISH:
            case GPU_REQ_SET_PWR_LIMIT_4_START:
            case GPU_REQ_SET_PWR_LIMIT_4_2:
            case GPU_REQ_SET_PWR_LIMIT_4_FINISH:
                break;

            // I2C reset
            case GPU_REQ_RESET:
                break;

            default:
                INTR_TRAC_ERR("schedule_gpu_req: Invalid GPU request operation: 0x%02X", i_operation);
                /*
                 * @errortype
                 * @moduleid    GPU_MID_GPU_SCHED_REQ
                 * @reasoncode  GPU_FAILURE
                 * @userdata1   operation
                 * @userdata2   0
                 * @userdata4   ERC_GPU_INVALID_GPU_OPERATION
                 * @devdesc     Invalid GPU request operation
                 */
                err = createErrl(GPU_MID_GPU_SCHED_REQ,
                                 GPU_FAILURE,
                                 ERC_GPU_INVALID_GPU_OPERATION,
                                 ERRL_SEV_PREDICTIVE,
                                 NULL,
                                 DEFAULT_TRACE_SIZE,
                                 i_operation,
                                 0);

                commitErrl(&err);
                scheduleRequest = false;

                // release I2C lock to the host for this engine and stop monitoring
                occ_i2c_lock_release(GPU_I2C_ENGINE);
                G_gpu_monitoring_allowed = FALSE;
                break;
        }

        if (scheduleRequest)
        {
            // Clear errors and init common arguments for GPE
            G_gpu_op_req_args.error.error = 0;
            G_gpu_op_req_args.gpu_rc = 0;
            G_gpu_op_req_args.operation = i_operation;
            G_gpu_op_req_args.gpu_id = G_current_gpu_id;

            GPU_DBG("schedule_gpu_req: Scheduling GPE1 GPU operation 0x%02X (tick %d)", i_operation, GPU_TICK);
            int l_rc = gpe_request_schedule(&G_gpu_op_request);
            if (0 == l_rc)
            {
                l_scheduled = true;
            }
            else
            {
                INTR_TRAC_ERR("schedule_gpu_req: schedule failed w/rc=0x%08X (%d us)",
                              l_rc, (int) ((ssx_timebase_get())/(SSX_TIMEBASE_FREQUENCY_HZ/1000000)));
                /*
                 * @errortype
                 * @moduleid    GPU_MID_GPU_SCHED_REQ
                 * @reasoncode  SSX_GENERIC_FAILURE
                 * @userdata1   GPE schedule returned code
                 * @userdata2   GPU operation
                 * @userdata4   ERC_GPU_SCHEDULE_FAILURE
                 * @devdesc     Failed to schedule GPU operation request
                 */
                err = createErrl(GPU_MID_GPU_SCHED_REQ,
                                 SSX_GENERIC_FAILURE,
                                 ERC_GPU_SCHEDULE_FAILURE,
                                 ERRL_SEV_PREDICTIVE,
                                 NULL,
                                 DEFAULT_TRACE_SIZE,
                                 l_rc,
                                 i_operation);
                commitErrl(&err);

                // release I2C lock to the host for this engine and stop monitoring
                occ_i2c_lock_release(GPU_I2C_ENGINE);
                G_gpu_monitoring_allowed = FALSE;
            }
        }
    }

    return l_scheduled;

} // end schedule_gpu_req()

// Function Specification
//
// Name:  gpu_reset_sm
//
// Description: GPU Reset State Machine.  This is not called per GPU if any handling is needed
//               per GPU this function must handle and not indicate that reset is complete
//               until all present GPUs are ready
//
// End Function Specification
bool gpu_reset_sm()
{
    bool l_complete = FALSE;   // only return TRUE when the reset AND initialization is complete
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_state_retry_count = 0;
    static uint8_t L_consec_reset_failure_count = 0;
    static gpuResetState_e L_reset_state = GPU_RESET_STATE_NEW;  // 1st state for a reset

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
        // check if the previous state was successfully scheduled and success/done
        if( (L_reset_state != GPU_RESET_STATE_NEW) &&
            (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) &&
            (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
        {
            // Check if failure was due to GPE image not having GPU support
            if(G_gpu_op_req_args.error.rc == GPE_RC_NO_GPU_SUPPORT)
            {
                // No GPU Support, log error and disable all GPUs
                INTR_TRAC_ERR("gpu_reset_sm: GPE image doesn't support GPUs!");

                /*
                 * @errortype
                 * @moduleid    GPU_MID_GPU_RESET_SM
                 * @reasoncode  GPU_NO_GPE_SUPPORT
                 * @userdata1   0
                 * @userdata2   0
                 * @userdata4   ERC_GPU_NO_GPE_SUPPORT
                 * @devdesc     GPE1 image doesn't support GPU communication
                 */
                errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
                                         GPU_NO_GPE_SUPPORT,
                                         ERC_GPU_NO_GPE_SUPPORT,
                                         ERRL_SEV_UNRECOVERABLE,
                                         NULL,
                                         DEFAULT_TRACE_SIZE,
                                         0,
                                         0);
                commitErrl(&err);

                disable_all_gpus();

                L_reset_state = GPU_RESET_STATE_NEW;
                return FALSE;  // GPUs are not ready for communication
            }
            else
            {
                // Stay in current state if haven't reached state retry count
                if(L_state_retry_count < MAX_GPU_RESET_STATE_RETRY)
                {
                    // INC state retry count and retry current state
                    L_state_retry_count++;
                }
                else  // this reset attempt failed
                {
                    // Stop trying if reached max resets
                    if(L_consec_reset_failure_count > MAX_CONSECUTIVE_GPU_RESETS)
                    {
                        INTR_TRAC_ERR("gpu_reset_sm: Max Resets reached failed at state 0x%02X",
                                      L_reset_state);

                        /*
                         * @errortype
                         * @moduleid    GPU_MID_GPU_RESET_SM
                         * @reasoncode  GPU_FAILURE
                         * @userdata1   GPU reset state
                         * @userdata2   0
                         * @userdata4   ERC_GPU_RESET_FAILURE
                         * @devdesc     Failure resetting GPU interface
                         */
                        errlHndl_t err = createErrl(GPU_MID_GPU_RESET_SM,
                                               GPU_FAILURE,
                                               ERC_GPU_RESET_FAILURE,
                                               ERRL_SEV_UNRECOVERABLE,
                                               NULL,
                                               DEFAULT_TRACE_SIZE,
                                               L_reset_state,
                                               0);
                        commitErrl(&err);

                        disable_all_gpus();

                        L_reset_state = GPU_RESET_STATE_NEW;
                        return FALSE;  // GPUs are not ready for communication
                    }
                    else  // try the reset again from the beginning
                    {
                        L_consec_reset_failure_count++;
                        L_state_retry_count = 0;
                        L_reset_state = GPU_RESET_STATE_NEW;
                    }
                }  // else reset attempt failed
            }  // else GPE supports GPU
        }// if previous state failed
        else // success on last state go to next state and process it
        {
            L_state_retry_count = 0;
            L_reset_state++;
        }

        L_scheduled = FALSE;  // default nothing scheduled

        switch (L_reset_state)
        {
            case GPU_RESET_STATE_INIT_BUS:
                // Setup I2C Interrupt Mask Register
                L_scheduled = schedule_gpu_req(GPU_REQ_INIT, G_new_gpu_req_args);
                break;

            case GPU_RESET_STATE_RESET_MASTER:
                G_new_gpu_req_args.data[0] = GPU_RESET_REQ_MASTER;
                L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
                break;

            case GPU_RESET_STATE_RESET_SLAVE:
                G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV;
                L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
                break;

            case GPU_RESET_STATE_RESET_SLAVE_WAIT:
                // Delay to allow reset to complete
                GPU_DBG("gpu_reset_sm: waiting during slave port 4 reset");
                break;

            case GPU_RESET_STATE_RESET_SLAVE_COMPLETE:
                G_new_gpu_req_args.data[0] = GPU_RESET_REQ_SLV_COMPLETE;
                L_scheduled = schedule_gpu_req(GPU_REQ_RESET, G_new_gpu_req_args);
                break;

            case GPU_RESET_STATE_RESET_FINISH:
                // Reset and init is complete ready to start sending commands to the GPUs
                l_complete = TRUE;
                L_consec_reset_failure_count = 0;
                // next time this is called will be to start a new reset
                L_reset_state = GPU_RESET_STATE_NEW;
                break;

           default:
               INTR_TRAC_ERR("gpu_reset_sm: INVALID STATE: 0x%02X when reset is required", L_reset_state);
               L_reset_state = GPU_RESET_STATE_NEW;
               break;
       } // switch L_reset_state

        if(L_scheduled)
        {
            GPU_DBG("gpu_reset_sm: Scheduled reset state 0x%02X", L_reset_state);
        }
        // check if the state was expected to have a schedule. Only new and slave wait
        // don't schedule for all other states the schedule must have failed
        else if( (L_reset_state != GPU_RESET_STATE_NEW) &&
                 (L_reset_state != GPU_RESET_STATE_RESET_SLAVE_WAIT) )
        {
            INTR_TRAC_ERR("gpu_reset_sm: failed to schedule state 0x%02X", L_reset_state);
        }

    } // if async_request_is_idle
    else
    {
        INTR_TRAC_ERR("gpu_reset_sm: NOT idle for state 0x%02X", L_reset_state);
    }

    return l_complete;
} // end gpu_reset_sm()

// Function Specification
//
// Name:  gpu_check_driver_loaded_sm
//
// Description: Called from gpu_task_sm to check if driver is loaded for G_current_gpu_id
//              This function should only return that complete is TRUE when the check
//              is complete (or determined failed) and ready for a different GPU
//
// Pre-Req:  Caller must have G_current_gpu_id set for GPU to check
//
// End Function Specification
bool gpu_check_driver_loaded_sm()
{
    bool l_complete = FALSE;   // only return TRUE when the read is complete or failed
    bool l_new_driver_loaded = FALSE;
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_check_driver_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
    static uint8_t L_state_failure_count = 0;
    static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
    static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};

    static uint32_t L_num_ticks = 0;

    L_num_ticks++;

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
       // If not starting a new read then need to check status of current state before moving on
       // stay in current state if the schedule failed or the state isn't finished/failed
       if( (L_check_driver_state != GPU_STATE_CHECK_DRIVER_LOADED_NEW) &&
           (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
       {
          // Check if failure was due to driver change
          if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
          {
             handle_driver_change();
             // Request can't be processed by GPU at this time so we are done with this GPU
             // setup to start new request
             L_state_failure_count = 0;
             L_check_driver_failure_count[G_current_gpu_id] = 0; // clear driver failure count since there's a driver change
             L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
             return TRUE;  // Done with this GPU, let GPU SM move to next
          }

          // If reached state retry count give up on this read
          else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
          {
             // if GPU is not in reset then INC error count and check if reached threshold
             if(g_amec->gpu[G_current_gpu_id].status.notReset)
             {
                if(++L_check_driver_failure_count[G_current_gpu_id] > GPU_CHECK_DRIVER_ERROR_COUNT)
                {
                    INTR_TRAC_ERR("gpu_check_driver_loaded: Failed to check driver loaded for GPU%d RC: 0x%02X",
                                   G_current_gpu_id,
                                   G_gpu_op_req_args.gpu_rc);

                    // give up checking driver loaded for this GPU
                    // It will be retried if detected that GPU is put in reset and then taken out
                    g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false;
                    L_check_driver_failure_count[G_current_gpu_id] = 0;

                    // without driver loaded cannot read memory temp, mark memory temp as failed
                    g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
                    g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;

                    // log one time error that driver loaded couldn't be determined
                    if(!L_error_logged[G_current_gpu_id])
                    {
                       L_error_logged[G_current_gpu_id] = TRUE;

                       // Log error
                       /* @
                        * @errortype
                        * @moduleid    GPU_MID_GPU_CHECK_DRIVER_LOADED
                        * @reasoncode  GPU_FAILURE
                        * @userdata1   GPU ID
                        * @userdata2   GPU RC
                        * @userdata4   ERC_GPU_CHECK_DRIVER_LOADED_FAILURE
                        * @devdesc     Failure to check GPU driver loaded
                        *
                        */
                       errlHndl_t l_err = createErrl(GPU_MID_GPU_CHECK_DRIVER_LOADED,
                                                     GPU_FAILURE,
                                                     ERC_GPU_CHECK_DRIVER_LOADED_FAILURE,
                                                     ERRL_SEV_PREDICTIVE,
                                                     NULL,
                                                     DEFAULT_TRACE_SIZE,
                                                     G_current_gpu_id,
                                                     G_gpu_op_req_args.gpu_rc);

                       // Callout the GPU if have sensor ID for it
                       if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                       {
                          addCalloutToErrl(l_err,
                                           ERRL_CALLOUT_TYPE_GPU_ID,
                                           G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                           ERRL_CALLOUT_PRIORITY_MED);
                       }

                       // Commit Error
                       commitErrl(&l_err);
                    } // if error not logged
                } // if reached error count
             } // if notReset

             L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
             L_state_failure_count = 0;
             return TRUE;  // Done with this GPU, let GPU SM move to next
          } // if reached state retry count
          else
          {
             // INC failure count and retry current state
             L_state_failure_count++;
          }
       }
       else // success on last state go to next state and process it
       {
          L_state_failure_count = 0;
          L_check_driver_state++;
       }

       L_scheduled = FALSE;  // default nothing scheduled

       switch (L_check_driver_state)
       {
           case GPU_STATE_CHECK_DRIVER_LOADED_START:
               L_num_ticks = 1;
               L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args);
               break;

           case GPU_STATE_CHECK_DRIVER_LOADED_2:
               L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_2, G_new_gpu_req_args);
               break;

           case GPU_STATE_CHECK_DRIVER_LOADED_3:
               L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_3, G_new_gpu_req_args);
               break;

           case GPU_STATE_CHECK_DRIVER_LOADED_READ:
               L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_FINISH, G_new_gpu_req_args);
               break;

           case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE:
                // Update GPU tick timing table
                update_gpu_tick_sensor(&G_gpu_tick_times.checkdriver[G_current_gpu_id], L_num_ticks);

               // Update driver loaded
               l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01;
               if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded)
               {
                  // Driver loaded status changed
                  GPU_DBG("gpu_check_driver_loaded: GPU%d driver loaded changed to %d",
                          G_current_gpu_id,
                          l_new_driver_loaded);

                  if(l_new_driver_loaded)
                  {
                     // Driver is now loaded do checking that required driver to be loaded
                     g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = true;
                     g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
                     // done checking for driver to be loaded
                     g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = false;
                  }
                  else
                  {
                     // Driver is no longer loaded
                     clear_gpu_driver_status(G_current_gpu_id);

                     // memory temp only available when driver is loaded
                     // clear error and set not available
                     g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
                     g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;

                     // Need to keep query for driver loaded to detect when driver is loaded
                     g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = true;
                  }

                  g_amec->gpu[G_current_gpu_id].status.driverLoaded = l_new_driver_loaded;
               }

               // Done with this GPU ready to move to new one
               L_check_driver_failure_count[G_current_gpu_id] = 0;
               L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
               l_complete = TRUE;
               break;

           default:
               INTR_TRAC_ERR("gpu_check_driver_loaded: INVALID STATE: 0x%02X", L_check_driver_state);
               L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
               l_complete = TRUE;
               break;
       } // switch L_check_driver_state

       if(L_scheduled)
       {
          GPU_DBG("gpu_check_driver_loaded: Scheduled check driver loaded state 0x%02X at tick %d",
                   L_check_driver_state, GPU_TICK);
       }
       else if(!l_complete)  // if not complete there must have been a failure on the schedule
       {
          INTR_TRAC_ERR("gpu_check_driver_loaded: failed to schedule state 0x%02X", L_check_driver_state);
       }

    } // if async_request_is_idle
    else
    {
       INTR_TRAC_ERR("gpu_check_driver_loaded: NOT idle for state 0x%02X", L_check_driver_state);
    }

    return l_complete;
} // end gpu_check_driver_loaded_sm()

// Function Specification
//
// Name:  gpu_read_pwr_limit_sm
//
// Description: Called from gpu_task_sm to read GPU power limits for G_current_gpu_id
//              This function should only return that complete is TRUE when the read
//              is complete (or determined failed) and ready for a different GPU
//
// Pre-Req:  Caller must have G_current_gpu_id set for GPU to read
//
// End Function Specification
bool gpu_read_pwr_limit_sm()
{
    bool l_complete = FALSE;   // only return TRUE when the read is complete or failed
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_read_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
    static uint8_t L_state_failure_count = 0;
    static gpuReadPwrLimitState_e L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
    static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
    static uint32_t L_attempts = 0;

    static uint32_t L_last_min[MAX_NUM_GPU_PER_DOMAIN] = {0};
    static uint32_t L_last_max[MAX_NUM_GPU_PER_DOMAIN] = {0};

    static uint32_t L_num_ticks = 0;

    L_num_ticks++;

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
        // If not starting a new read then need to check status of current state before moving on
        // stay in current state if the schedule failed or the state isn't finished/failed
        if( (L_read_pwr_limit_state != GPU_STATE_READ_PWR_LIMIT_NEW) &&
            (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
        {
            // Repeat step 2 until it succeeds. More details on this in GPE code.
            if( (L_read_pwr_limit_state == GPU_STATE_READ_PWR_LIMIT_2_FINISH) &&
                (GPE_RC_NOT_COMPLETE == G_gpu_op_req_args.error.rc) &&
                (L_attempts <= GPU_TIMEOUT) )
            {
                L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_2_START;
                L_state_failure_count = 0;
                L_attempts++;
            }
            // Check if failure was due to driver change
            else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
            {
                handle_driver_change();
                // Request can't be processed by GPU at this time so we are done with this GPU
                // setup to start new request
                L_state_failure_count = 0;
                L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
                L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
                L_attempts = 0;
                return TRUE;  // Done with this GPU, let GPU SM move to next
            }

            // If reached retry count give up on this read
            else if( (L_state_failure_count > MAX_GPU_READ_ATTEMPT) ||
                     (L_attempts > GPU_TIMEOUT) )
            {
                if(L_attempts > GPU_TIMEOUT)
                {
                    // give up trying to read power limits for this GPU
                    // It will be retried if detected that GPU is put in reset and then taken out
                    g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false;
                    L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;
                }
                // if GPU is not in reset then INC error count and check if reached threshold
                if(g_amec->gpu[G_current_gpu_id].status.notReset)
                {
                    if(++L_read_pwr_limit_failure_count[G_current_gpu_id] > GPU_READ_PWR_LIMIT_ERROR_COUNT)
                    {
                        INTR_TRAC_ERR("gpu_read_pwr_limit_sm: Failed to read power limits for GPU%d RC: 0x%02X",
                                       G_current_gpu_id,
                                       G_gpu_op_req_args.gpu_rc);

                        // give up trying to read power limits for this GPU
                        // It will be retried if detected that GPU is put in reset and then taken out
                        g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = false;
                        L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;

                        // log one time error that power limits could not be read
                        if(!L_error_logged[G_current_gpu_id])
                        {
                            L_error_logged[G_current_gpu_id] = TRUE;

                            // Log error
                            /* @
                             * @errortype
                             * @moduleid    GPU_MID_GPU_READ_PWR_LIMIT
                             * @reasoncode  GPU_FAILURE
                             * @userdata1   GPU ID
                             * @userdata2   GPU RC
                             * @userdata4   ERC_GPU_READ_PWR_LIMIT_FAILURE
                             * @devdesc     Failure to read GPU power limits
                             *
                             */
                            errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_PWR_LIMIT,
                                                          GPU_FAILURE,
                                                          ERC_GPU_READ_PWR_LIMIT_FAILURE,
                                                          ERRL_SEV_PREDICTIVE,
                                                          NULL,
                                                          DEFAULT_TRACE_SIZE,
                                                          G_current_gpu_id,
                                                          G_gpu_op_req_args.gpu_rc);

                            // Callout the GPU if have sensor ID for it
                            if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                            {
                                addCalloutToErrl(l_err,
                                                 ERRL_CALLOUT_TYPE_GPU_ID,
                                                 G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                                 ERRL_CALLOUT_PRIORITY_MED);
                            }

                            // Commit Error
                            commitErrl(&l_err);
                        } // if error not logged
                    } // if reached error count
                } // if notReset

                L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
                L_state_failure_count = 0;
                L_attempts = 0;
                return TRUE;  // Done with this GPU, let GPU SM move to next
            } // if reached retry count
            else
            {
                // INC failure count and retry current state
                L_state_failure_count++;
            }
        }
        else // success on last state go to next state and process it
        {
            L_state_failure_count = 0;
            L_read_pwr_limit_state++;
        }

        L_scheduled = FALSE;  // default nothing scheduled

        switch (L_read_pwr_limit_state)
        {
            // Step 1
            case GPU_STATE_READ_PWR_LIMIT_1_START:
                L_num_ticks = 1;
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_1_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_1_3:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_3, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_1_FINISH:
                L_attempts = 0;
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args);
                break;

            // Step 2
            case GPU_STATE_READ_PWR_LIMIT_2_START:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_2_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_2_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_2_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_2_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args);
                break;

            // Step 3
            case GPU_STATE_READ_PWR_LIMIT_3_START:
                GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap for GPU%d", L_attempts, G_current_gpu_id);
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_3_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_3_3:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_3, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_3_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_FINISH, G_new_gpu_req_args);
                break;

            // Step 4
            case GPU_STATE_READ_PWR_LIMIT_4_START:
                G_new_gpu_req_args.data[0] = G_gpu_op_req_args.data[0];
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_4_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_4_3:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_3, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_4_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_4_FINISH, G_new_gpu_req_args);
                break;

            // Step 5
            case GPU_STATE_READ_PWR_LIMIT_5_START:
                G_new_gpu_req_args.data[0] = G_gpu_op_req_args.data[0];
                G_new_gpu_req_args.data[1] = G_gpu_op_req_args.data[1];
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_5_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_5_3:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_3, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_5_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_5_FINISH, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_PWR_LIMIT_COMPLETE:
                update_gpu_tick_sensor(&G_gpu_tick_times.getpcap[G_current_gpu_id], L_num_ticks);

                g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
                // Update power limits
                g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE;
                g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw = G_gpu_op_req_args.data[0];
                g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw =  G_gpu_op_req_args.data[1];
                g_amec->gpu[G_current_gpu_id].pcap.gpu_default_pcap_mw =  G_gpu_op_req_args.data[2];

                if( (g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw != L_last_min[G_current_gpu_id]) ||
                    (g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw != L_last_max[G_current_gpu_id]) )
                {
                    L_last_min[G_current_gpu_id] = g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw;
                    L_last_max[G_current_gpu_id] = g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw;
                    TRAC_IMP("gpu_read_pwr_limit: GPU%d min=0x%08XmW max=0x%08XmW",
                             G_current_gpu_id,
                             g_amec->gpu[G_current_gpu_id].pcap.gpu_min_pcap_mw,
                             g_amec->gpu[G_current_gpu_id].pcap.gpu_max_pcap_mw);
                }

                // Done with this GPU ready to move to new one
                L_read_pwr_limit_failure_count[G_current_gpu_id] = 0;
                L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
                L_attempts = 0;
                l_complete = TRUE;
                break;

            default:
                INTR_TRAC_ERR("gpu_read_pwr_limit: INVALID STATE: 0x%02X", L_read_pwr_limit_state);
                L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
                l_complete = TRUE;
                break;
        } // switch L_read_pwr_limit_state

        if(L_scheduled)
        {
            GPU_DBG("gpu_read_pwr_limit: Scheduled check driver loaded state 0x%02X at tick %d",
                     L_read_pwr_limit_state, GPU_TICK);
        }
        else if(!l_complete)  // if not complete there must have been a failure on the schedule
        {
            INTR_TRAC_ERR("gpu_read_pwr_limit: failed to schedule state 0x%02X", L_read_pwr_limit_state);
        }

    } // if async_request_is_idle
    else
    {
       INTR_TRAC_ERR("gpu_read_pwr_limit: NOT idle for state 0x%02X", L_read_pwr_limit_state);
    }

    return l_complete;
} // end gpu_read_pwr_limit_sm()

// Function Specification
//
// Name:  gpu_set_pwr_limit_sm
//
// Description: Called from gpu_task_sm to set GPU power limit for G_current_gpu_id
//              This function should only return that complete is TRUE when the set
//              is complete (or determined failed) and ready for a different GPU
//
// Pre-Req:  Caller must have G_current_gpu_id set for GPU to read
//
// End Function Specification
bool gpu_set_pwr_limit_sm()
{
    bool l_complete = FALSE;   // only return TRUE when complete or failed
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_state_failure_count = 0;
    static uint8_t L_set_pwr_limit_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
    static gpuSetPwrLimitState_e L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
    static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};
    static uint32_t L_attempts = 0;

    static uint32_t L_last_pcap[MAX_NUM_GPU_PER_DOMAIN] = {0};

    static uint32_t L_num_ticks = 0;

    L_num_ticks++;

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
        // If not starting a new set limit then need to check status of current state before moving on
        // stay in current state if the schedule failed or the state isn't finished/failed
        if( (L_set_pwr_limit_state != GPU_STATE_SET_PWR_LIMIT_NEW) &&
            (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
        {
            // Repeat step 4 until it succeeds. More details on this in GPE code.
            if( (L_set_pwr_limit_state == GPU_STATE_SET_PWR_LIMIT_4_FINISH) &&
                (GPE_RC_NOT_COMPLETE == G_gpu_op_req_args.error.rc) &&
                (L_attempts <= GPU_TIMEOUT) )
            {
                L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_4_START;
                L_state_failure_count = 0;
                L_attempts++;
            }
            // Check if failure was due to driver change
            else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
            {
                handle_driver_change();
                // Request can't be processed by GPU at this time so we are done with this GPU
                // setup to start new request
                L_state_failure_count = 0;
                L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
                L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
                L_attempts = 0;
                return TRUE;  // Done with this GPU, let GPU SM move to next
            }
            // If reached retry count give up on this read
            else if( (L_state_failure_count > MAX_GPU_READ_ATTEMPT) ||
                     (L_attempts > GPU_TIMEOUT) )
            {

                if(L_attempts > GPU_TIMEOUT)
                {
                    // give up trying to set power limit for this GPU
                    // It will be retried if detected that GPU is put in reset and then taken out or driver change
                    g_amec->gpu[G_current_gpu_id].pcap.set_failed = true;
                    L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
                }
                // if GPU is not in reset then INC error count and check if reached threshold
                if(g_amec->gpu[G_current_gpu_id].status.notReset)
                {
                    if(++L_set_pwr_limit_failure_count[G_current_gpu_id] > GPU_SET_PWR_LIMIT_ERROR_COUNT)
                    {
                        INTR_TRAC_ERR("gpu_set_pwr_limit: Failed to set power limit %d for GPU%d RC: 0x%02X",
                                       G_gpu_op_req_args.data[0],
                                       G_current_gpu_id,
                                       G_gpu_op_req_args.gpu_rc);

                        // give up trying to set power limit for this GPU
                        // It will be retried if detected that GPU is put in reset and then taken out or driver change
                        g_amec->gpu[G_current_gpu_id].pcap.set_failed = true;
                        L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;

                        // log error that power limit could not be set
                        if(!L_error_logged[G_current_gpu_id])
                        {
                            L_error_logged[G_current_gpu_id] = TRUE;

                            // Log error
                            /* @
                             * @errortype
                             * @moduleid    GPU_MID_GPU_SET_PWR_LIMIT
                             * @reasoncode  GPU_FAILURE
                             * @userdata1   GPU ID
                             * @userdata2   GPU RC
                             * @userdata4   ERC_GPU_SET_PWR_LIMIT_FAILURE
                             * @devdesc     Failure to set GPU power limit
                             *
                             */
                            errlHndl_t l_err = createErrl(GPU_MID_GPU_SET_PWR_LIMIT,
                                                          GPU_FAILURE,
                                                          ERC_GPU_SET_PWR_LIMIT_FAILURE,
                                                          ERRL_SEV_PREDICTIVE,
                                                          NULL,
                                                          DEFAULT_TRACE_SIZE,
                                                          G_current_gpu_id,
                                                          G_gpu_op_req_args.gpu_rc);

                            // Callout the GPU if have sensor ID for it
                            if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                            {
                                addCalloutToErrl(l_err,
                                                 ERRL_CALLOUT_TYPE_GPU_ID,
                                                 G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                                 ERRL_CALLOUT_PRIORITY_MED);
                            }

                            // Commit Error
                            commitErrl(&l_err);
                        } // if error not logged
                    } // if reached error count
                } // if notReset

                L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
                L_state_failure_count = 0;
                L_attempts = 0;
                return TRUE;  // Done with this GPU, let GPU SM move to next
            } // if reached retry count
            else
            {
                // INC failure count and retry current state
                L_state_failure_count++;
            }
        }
        else // success on last state go to next state and process it
        {
            L_state_failure_count = 0;
            L_set_pwr_limit_state++;
        }

        L_scheduled = FALSE;  // default nothing scheduled

        switch (L_set_pwr_limit_state)
        {
            // Step 1
            case GPU_STATE_SET_PWR_LIMIT_1_START:
                L_num_ticks = 1;
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_1_1:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_1, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_1_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_1_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args);
                break;

            // Step 2
            case GPU_STATE_SET_PWR_LIMIT_2_START:
                // send the desired GPU power cap to the GPE to send to GPU
                GPU_DBG("gpu_set_pwr_limit_sm: setting power limit to %dmW on GPU%d",
                         g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw, G_current_gpu_id);
                G_new_gpu_req_args.data[1] = g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw;
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_2_1:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_1, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_2_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_2_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args);
                break;

            // Step 3
            case GPU_STATE_SET_PWR_LIMIT_3_START:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_3_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_3_3:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_3, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_3_FINISH:
                L_attempts = 0;
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_3_FINISH, G_new_gpu_req_args);
                break;

            // Step 4
            case GPU_STATE_SET_PWR_LIMIT_4_START:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_4_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_4_2:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_4_2, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_4_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_4_FINISH, G_new_gpu_req_args);
                break;

            case GPU_STATE_SET_PWR_LIMIT_COMPLETE:
                update_gpu_tick_sensor(&G_gpu_tick_times.setpcap[G_current_gpu_id], L_num_ticks);
                GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap for GPU%d", L_attempts, G_current_gpu_id);
                // Update the requested power limit since it was successfully sent
                // NOTE: want this value to be sent back from the GPE to know what was set in case AMEC
                // has caluclated a new desired pcap while this one was already in process of being set
                g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw = (uint32_t) G_gpu_op_req_args.data[0];
                if(g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw != L_last_pcap[G_current_gpu_id])
                {
                    L_last_pcap[G_current_gpu_id] = g_amec->gpu[G_current_gpu_id].pcap.gpu_requested_pcap_mw;
                    TRAC_IMP("gpu_set_pwr_limit_sm: successfully set power limit to %dmW on GPU%d",
                              g_amec->gpu[G_current_gpu_id].pcap.gpu_desired_pcap_mw, G_current_gpu_id);
                }

                // Done with this GPU ready to move to new one
                L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
                L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
                L_attempts = 0;
                l_complete = TRUE;
                break;

            default:
                INTR_TRAC_ERR("gpu_set_pwr_limit: INVALID STATE: 0x%02X", L_set_pwr_limit_state);
                L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
                l_complete = TRUE;
                break;
        } // switch L_set_pwr_limit_state

        if(L_scheduled)
        {
            GPU_DBG("gpu_set_pwr_limit: Scheduled set power cap state 0x%02X at tick %d",
                    L_set_pwr_limit_state, GPU_TICK);
        }
        else if(!l_complete)  // if not complete there must have been a failure on the schedule
        {
            INTR_TRAC_ERR("gpu_set_pwr_limit: failed to schedule state 0x%02X", L_set_pwr_limit_state);
        }
    } // if async_request_is_idle
    else
    {
        INTR_TRAC_ERR("gpu_set_pwr_limit: NOT idle for state 0x%02X", L_set_pwr_limit_state);
    }

    return l_complete;
} // end gpu_set_pwr_limit_sm()

// Function Specification
//
// Name:  gpu_read_temp_sm
//
// Description: Called from gpu_task_sm to read GPU core temperature of G_current_gpu_id
//              This function should only return that complete is TRUE when the temperature
//              read is complete (or determined failed) and ready to start reading a different GPU
//
// Pre-Req:     Caller must have G_current_gpu_id set for GPU to read and
//              verified G_gpu_op_request is idle to allow scheduling
// End Function Specification
bool gpu_read_temp_sm()
{
    bool l_complete = FALSE;   // only return TRUE when the read is complete or failed
    uint16_t l_temp = 0;
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_read_failure_count = 0; // Used for I2C errors
    static bool L_trace_success = FALSE;
    static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW;  // 1st state for reading temp

    static uint32_t L_num_ticks = 0;

    L_num_ticks++;

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
        // If not starting a new read then need to check status of current state before moving on
        // stay in current state if the schedule failed or the state isn't finished/failed
        if( (L_read_temp_state != GPU_STATE_READ_TEMP_NEW) &&
             (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
        {
            // If reached retry count give up on this GPU
            if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
                (GPE_RC_I2C_ERROR == G_gpu_op_req_args.error.rc) )
            {
                mark_gpu_failed(&G_gpu_op_req_args);

                L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
                L_read_failure_count = 0;
                return TRUE;  // Done with this GPU, let GPU SM move to next
            }
            else
            {
                // INC failure count and retry current state
                L_read_failure_count++;
            }
        }
        else // success on last state go to next state and process it
        {
            L_read_failure_count = 0;
            L_read_temp_state++;
        }

        L_scheduled = FALSE;  // default nothing scheduled

        switch (L_read_temp_state)
        {
            case GPU_STATE_READ_TEMP_START:
                L_num_ticks = 1;
                L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_TEMP_FINISH:
                L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_FINISH, G_new_gpu_req_args);
                break;

            case GPU_STATE_READ_TEMP_COMPLETE:
                update_gpu_tick_sensor(&G_gpu_tick_times.coretemp[G_current_gpu_id], L_num_ticks);
                if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
                    (0 != G_gpu_op_req_args.data[0]) )
                {
                    g_amec->gpu[G_current_gpu_id].status.readOnce = true;

                    // Only trace this once
                    if(FALSE == L_trace_success)
                    {
                        TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d",
                                   G_current_gpu_id, CURRENT_TICK);
                        L_trace_success = TRUE;
                    }

                    // comm is now established update for capability checking to take place
                    g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
                }
                // Update sensor
                l_temp = G_gpu_op_req_args.data[0];
                sensor_update(AMECSENSOR_PTR(TEMPGPU0 + G_current_gpu_id), l_temp);

                // Clear all past errors
                g_amec->gpu[G_current_gpu_id].status.coreTempFailure = false;
                g_amec->gpu[G_current_gpu_id].status.coreTempNotAvailable = false;
                g_amec->gpu[G_current_gpu_id].status.errorCount = 0;
                g_amec->gpu[G_current_gpu_id].status.retryCount = 0;

                // check if there is an overtemp that hasn't been reported
                if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
                   (l_temp > G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU].error) &&
                   (!g_amec->gpu[G_current_gpu_id].status.overtempError) )
                {
                    g_amec->gpu[G_current_gpu_id].status.overtempError = TRUE;

                    INTR_TRAC_ERR("gpu_read_temp: GPU%d OT! temp[%d]",
                                   G_current_gpu_id, l_temp);

                    // Log an OT error
                    /* @
                     * @errortype
                     * @moduleid    GPU_MID_GPU_READ_TEMP
                     * @reasoncode  GPU_ERROR_TEMP
                     * @userdata1   GPU ID
                     * @userdata2   GPU memory temperature
                     * @userdata4   OCC_NO_EXTENDED_RC
                     * @devdesc     GPU memory has reached error temperature
                     *
                     */
                    errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_TEMP,
                                                  GPU_ERROR_TEMP,
                                                  OCC_NO_EXTENDED_RC,
                                                  ERRL_SEV_PREDICTIVE,
                                                  NULL,
                                                  DEFAULT_TRACE_SIZE,
                                                  G_current_gpu_id,
                                                  l_temp);

                    // Callout the over temperature procedure
                    addCalloutToErrl(l_err,
                                     ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                     ERRL_COMPONENT_ID_OVER_TEMPERATURE,
                                     ERRL_CALLOUT_PRIORITY_HIGH);

                    // Callout the GPU if have sensor ID for it
                    if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                    {
                        addCalloutToErrl(l_err,
                                         ERRL_CALLOUT_TYPE_GPU_ID,
                                         G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                         ERRL_CALLOUT_PRIORITY_MED);
                    }

                    // Commit Error
                    commitErrl(&l_err);

                } // if OT error

                // Done with this GPU ready to move to new one
                L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
                l_complete = TRUE;
                break;

            default:
                INTR_TRAC_ERR("gpu_read_temp_sm: INVALID STATE: 0x%02X", L_read_temp_state);
                L_read_temp_state = GPU_STATE_READ_TEMP_NEW;
                l_complete = TRUE;
               break;
        } // switch L_read_temp_state

        if(L_scheduled)
        {
            GPU_DBG("gpu_read_temp_sm: Scheduled read temp state 0x%02X at tick %d",
                    L_read_temp_state, GPU_TICK);
        }
        else if(!l_complete)  // if not complete there must have been a failure on the schedule
        {
            INTR_TRAC_ERR("gpu_read_temp_sm: failed to schedule state 0x%02X", L_read_temp_state);
        }

    } // if async_request_is_idle
    else
    {
        INTR_TRAC_ERR("gpu_read_temp_sm: NOT idle for state 0x%02X", L_read_temp_state);
    }

    return l_complete;
} // end gpu_read_temp_sm()

// Function Specification
//
// Name:  gpu_read_mem_temp_capability_sm
//
// Description: Called from gpu_task_sm to read GPU memory temp capability of G_current_gpu_id
//              This function should only return that complete is TRUE when the capability
//              read is complete (or determined failed) and ready to start reading a different GPU
//
// Pre-Req:  Caller must have G_current_gpu_id set for GPU to read
//
// End Function Specification
bool gpu_read_mem_temp_capability_sm()
{
    bool l_complete = FALSE;   // only return TRUE when the read is complete or failed
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_read_mem_cap_failure_count[MAX_NUM_GPU_PER_DOMAIN] = {0};
    static uint8_t L_state_failure_count = 0;
    static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
    static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};

    static uint32_t L_num_ticks = 0;

    L_num_ticks++;

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
       // If not starting a new read then need to check status of current state before moving on
       // stay in current state if the schedule failed or the state isn't finished/failed
       if( (L_read_cap_state != GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW) &&
           (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
       {
          // Check if failure was due to driver change
          if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
          {
             handle_driver_change();
             // Request can't be processed by GPU at this time so we are done with this GPU
             // setup to start new request
             L_state_failure_count = 0;
             L_read_mem_cap_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
             L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
             return TRUE;  // Done with this GPU, let GPU SM move to next
          }
          // If reached state retry count give up on this read
          else if(L_state_failure_count > MAX_GPU_READ_ATTEMPT)
          {
             // if GPU is not in reset then INC error count and check if reached threshold
             if(g_amec->gpu[G_current_gpu_id].status.notReset)
             {
                if(++L_read_mem_cap_failure_count[G_current_gpu_id] > GPU_READ_MEM_CAP_ERROR_COUNT)
                {
                    INTR_TRAC_ERR("gpu_read_mem_temp_capable: Failed to read capability for GPU%d RC: 0x%02X",
                                   G_current_gpu_id,
                                   G_gpu_op_req_args.gpu_rc);

                    // give up trying to read mem temp capability for this GPU
                    // It will be retried if detected that GPU driver is re-loaded
                    g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
                    L_read_mem_cap_failure_count[G_current_gpu_id] = 0;

                    // cannot determine memory temp capability, mark memory temp as failed
                    g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
                    g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;

                    // log one time error that memory temp capability couldn't be determined
                    if(!L_error_logged[G_current_gpu_id])
                    {
                       L_error_logged[G_current_gpu_id] = TRUE;

                       // Log error
                       /* @
                        * @errortype
                        * @moduleid    GPU_MID_GPU_READ_MEM_TEMP_CAPABLE
                        * @reasoncode  GPU_FAILURE
                        * @userdata1   GPU ID
                        * @userdata2   GPU RC
                        * @userdata4   ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE
                        * @devdesc     Failure to read memory temp capability
                        *
                        */
                       errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP_CAPABLE,
                                                     GPU_FAILURE,
                                                     ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE,
                                                     ERRL_SEV_PREDICTIVE,
                                                     NULL,
                                                     DEFAULT_TRACE_SIZE,
                                                     G_current_gpu_id,
                                                     G_gpu_op_req_args.gpu_rc);

                       // Callout the GPU if have sensor ID for it
                       if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                       {
                          addCalloutToErrl(l_err,
                                           ERRL_CALLOUT_TYPE_GPU_ID,
                                           G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                           ERRL_CALLOUT_PRIORITY_MED);
                       }

                       // Commit Error
                       commitErrl(&l_err);
                    } // if error not logged
                } // if reached error count
             } // if notReset

             L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
             L_state_failure_count = 0;
             return TRUE;  // Done with this GPU, let GPU SM move to next
          } // if reached state retry count
          else
          {
             // INC failure count and retry current state
             L_state_failure_count++;
          }
       }
       else // success on last state go to next state and process it
       {
          L_state_failure_count = 0;
          L_read_cap_state++;
       }

       L_scheduled = FALSE;  // default nothing scheduled

       switch (L_read_cap_state)
       {
           case GPU_STATE_READ_MEM_TEMP_CAPABLE_START:
               L_num_ticks = 1;
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args);
               break;

           case GPU_STATE_READ_MEM_TEMP_CAPABLE_2:
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_2, G_new_gpu_req_args);
               break;

           case GPU_STATE_READ_MEM_TEMP_CAPABLE_3:
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_3, G_new_gpu_req_args);
               break;

           case GPU_STATE_READ_MEM_TEMP_CAPABLE_READ:
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_FINISH, G_new_gpu_req_args);
               break;

           case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
               update_gpu_tick_sensor(&G_gpu_tick_times.capabilities[G_current_gpu_id], L_num_ticks);
               // Update capability
               g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;

               if(g_amec->gpu[G_current_gpu_id].status.memTempSupported)
               {
                  // mem temp is supported no need to re-check capability
                  g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = FALSE;
               }
               else
               {
                  // Need to keep query for mem temp capability to detect if ever changes to capable
                  g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
               }

               // Done with this GPU ready to move to new one
               L_read_mem_cap_failure_count[G_current_gpu_id] = 0;
               L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
               l_complete = TRUE;
               break;

           default:
               INTR_TRAC_ERR("gpu_read_mem_temp_capable: INVALID STATE: 0x%02X", L_read_cap_state);
               L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
               l_complete = TRUE;
               break;
       } // switch L_read_cap_state

       if(L_scheduled)
       {
          GPU_DBG("gpu_read_mem_temp_capable: Scheduled read temp capability state 0x%02X at tick %d",
                   L_read_cap_state, GPU_TICK);
       }
       else if(!l_complete)  // if not complete there must have been a failure on the schedule
       {
          INTR_TRAC_ERR("gpu_read_mem_temp_capable: failed to schedule state 0x%02X", L_read_cap_state);
       }

    } // if async_request_is_idle
    else
    {
       INTR_TRAC_ERR("gpu_read_mem_temp_capable: NOT idle for state 0x%02X", L_read_cap_state);
    }

    return l_complete;
} // end gpu_read_mem_temp_capability_sm()

// Function Specification
//
// Name:  gpu_read_memory_temp_sm
//
// Description: Called from gpu_task_sm to read GPU memory temperature of G_current_gpu_id
//              This function should only return that complete is TRUE when the temperature
//              read is complete (or determined failed) and ready to start reading a different GPU
//
// Pre-Req:  Caller must have G_current_gpu_id set for GPU to read
//
// End Function Specification
bool gpu_read_memory_temp_sm()
{
    bool l_complete = FALSE;   // only return TRUE when the read is complete or failed
    uint16_t l_temp = 0;
    static bool L_scheduled = FALSE;  // indicates if a GPU GPE request was scheduled
    static uint8_t L_read_failure_count = 0;
    static gpuReadMemTempState_e L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;  // 1st state for reading temp

    static uint32_t L_num_ticks = 0;

    L_num_ticks++;

    if (async_request_is_idle(&G_gpu_op_request.request))
    {
       // If not starting a new read then need to check status of current state before moving on
       // stay in current state if the schedule failed or the state isn't finished/failed
       if( (L_read_temp_state != GPU_STATE_READ_MEM_TEMP_NEW) &&
           (!L_scheduled || (GPE_RC_SUCCESS != G_gpu_op_req_args.error.rc)) )
       {
          // Check if failure was due to driver change
          if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
          {
             handle_driver_change();
             // Request can't be processed by GPU at this time so we are done with this GPU
             // setup to start new request
             L_read_failure_count = 0;
             g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;
             L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
             return TRUE;  // Done with this GPU, let GPU SM move to next
          }

          // If reached retry count or GPU indicated cmd not supported then give up on this read
          else if( (L_read_failure_count > MAX_GPU_READ_ATTEMPT) ||
                   (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
          {
             // if GPU is not in reset or the GPU responded with command not supported then
             // INC memory error count and check if reached timeout for new mem temp
             if( (g_amec->gpu[G_current_gpu_id].status.notReset) ||
                 (G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED) )
             {
                g_amec->gpu[G_current_gpu_id].status.memErrorCount++;

                uint8_t max_read_timeout = G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].max_read_timeout;
                if((max_read_timeout) && (max_read_timeout != 0xFF) &&
                   (g_amec->gpu[G_current_gpu_id].status.memErrorCount >= max_read_timeout) )
                {
                    // Disable memory temp reading for this GPU and log error
                    g_amec->gpu[G_current_gpu_id].status.memTempSupported = FALSE;
                    // so BMC knows there is an error for fan control set failure
                    g_amec->gpu[G_current_gpu_id].status.memTempFailure = true;
                    g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = true;

                    INTR_TRAC_ERR("gpu_read_memory_temp: disabling memory temp for GPU%d due to %d consecutive errors",
                                    G_current_gpu_id, g_amec->gpu[G_current_gpu_id].status.memErrorCount);

                    if(g_amec->gpu[G_current_gpu_id].status.commErrorLogged == false)
                    {
                       INTR_TRAC_ERR("notReset: %d rc: 0x%0X", g_amec->gpu[G_current_gpu_id].status.notReset,
                                     G_gpu_op_req_args.error.rc);
                       // Log error
                       /* @
                        * @errortype
                        * @moduleid    GPU_MID_GPU_READ_MEM_TEMP
                        * @reasoncode  GPU_FAILURE
                        * @userdata1   GPU ID
                        * @userdata2   GPU RC
                        * @userdata4   ERC_GPU_READ_MEM_TEMP_TIMEOUT
                        * @devdesc     Timeout reading new GPU memory temperature
                        *
                        */
                       errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
                                                     GPU_FAILURE,
                                                     ERC_GPU_READ_MEM_TEMP_TIMEOUT,
                                                     ERRL_SEV_PREDICTIVE,
                                                     NULL,
                                                     DEFAULT_TRACE_SIZE,
                                                     G_gpu_op_req_args.gpu_rc,
                                                     g_amec->gpu[G_current_gpu_id].status.memErrorCount);

                       // Callout the GPU if have sensor ID for it
                       if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                       {
                          addCalloutToErrl(l_err,
                                           ERRL_CALLOUT_TYPE_GPU_ID,
                                           G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                           ERRL_CALLOUT_PRIORITY_MED);
                       }

                       // Commit Error
                       commitErrl(&l_err);
                       g_amec->gpu[G_current_gpu_id].status.commErrorLogged = true;
                    } // if !commErrorLogged
                } // if timeout error
                else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_CMD_NOT_SUPPORTED)
                {
                    // GPU indicated command not supported, re-check mem temp capability
                    // if we try to read mem temp again that means mem temp was reported capable
                    // and if this continues to fail eventually an error will be logged above at timeout
                    g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = true;
                    g_amec->gpu[G_current_gpu_id].status.memTempSupported = false;
                }
             } // if notReset or command not supported

             // setup to start new request
             L_read_failure_count = 0;
             L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
             return TRUE;  // Done with this GPU, let GPU SM move to next
          }  // else if failure count exceeded or command not supported
          else
          {
             // INC failure count and retry current state
             L_read_failure_count++;
          }
       }
       else // success on last state go to next state and process it
       {
          L_read_failure_count = 0;
          L_read_temp_state++;
       }

       L_scheduled = FALSE;  // default nothing scheduled

       switch (L_read_temp_state)
       {
           case GPU_STATE_READ_MEM_TEMP_START:
               L_num_ticks = 1;
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
               break;

           case GPU_STATE_READ_MEM_TEMP_2:
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_2, G_new_gpu_req_args);
               break;

           case GPU_STATE_READ_MEM_TEMP_3:
               L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_3, G_new_gpu_req_args);
               break;

            case GPU_STATE_READ_MEM_TEMP_READ:
                L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_FINISH, G_new_gpu_req_args);
                break;

           case GPU_STATE_READ_MEM_TEMP_COMPLETE:
               update_gpu_tick_sensor(&G_gpu_tick_times.memtemp[G_current_gpu_id], L_num_ticks);
               // Update sensor
               l_temp = G_gpu_op_req_args.data[0];
               sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);

               // Clear past errors
               g_amec->gpu[G_current_gpu_id].status.memTempFailure = false;
               g_amec->gpu[G_current_gpu_id].status.memTempNotAvailable = false;
               g_amec->gpu[G_current_gpu_id].status.memErrorCount = 0;

               // check if there is an overtemp that hasn't been reported
               if((G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].error) &&
                  (l_temp > G_data_cnfg->thrm_thresh.data[DATA_FRU_GPU_MEM].error) &&
                  (!g_amec->gpu[G_current_gpu_id].status.memOvertempError) )
               {
                   g_amec->gpu[G_current_gpu_id].status.memOvertempError = TRUE;

                   INTR_TRAC_ERR("gpu_read_memory_temp: GPU%d memory OT! temp[%d]",
                                  G_current_gpu_id, l_temp);

                   // Log an OT error
                   /* @
                    * @errortype
                    * @moduleid    GPU_MID_GPU_READ_MEM_TEMP
                    * @reasoncode  GPU_MEMORY_ERROR_TEMP
                    * @userdata1   GPU ID
                    * @userdata2   GPU memory temperature
                    * @userdata4   OCC_NO_EXTENDED_RC
                    * @devdesc     GPU memory has reached error temperature
                    *
                    */
                   errlHndl_t l_err = createErrl(GPU_MID_GPU_READ_MEM_TEMP,
                                                 GPU_MEMORY_ERROR_TEMP,
                                                 OCC_NO_EXTENDED_RC,
                                                 ERRL_SEV_PREDICTIVE,
                                                 NULL,
                                                 DEFAULT_TRACE_SIZE,
                                                 G_current_gpu_id,
                                                 l_temp);

                   // Callout the over temperature procedure
                   addCalloutToErrl(l_err,
                                    ERRL_CALLOUT_TYPE_COMPONENT_ID,
                                    ERRL_COMPONENT_ID_OVER_TEMPERATURE,
                                    ERRL_CALLOUT_PRIORITY_HIGH);

                   // Callout the GPU if have sensor ID for it
                   if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                   {
                      addCalloutToErrl(l_err,
                                       ERRL_CALLOUT_TYPE_GPU_ID,
                                       G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                       ERRL_CALLOUT_PRIORITY_MED);
                   }

                   // Commit Error
                   commitErrl(&l_err);

               } // if OT error

               // Done with this GPU ready to move to new one
               L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
               l_complete = TRUE;
               break;

           default:
               INTR_TRAC_ERR("gpu_read_memory_temp_sm: INVALID STATE: 0x%02X", L_read_temp_state);
               L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW;
               l_complete = TRUE;
               break;
       } // switch L_read_temp_state

       if(L_scheduled)
       {
          GPU_DBG("gpu_read_memory_temp_sm: Scheduled read temp state 0x%02X at tick %d",
                   L_read_temp_state, GPU_TICK);
       }
       else if(!l_complete)  // if not complete there must have been a failure on the schedule
       {
          INTR_TRAC_ERR("gpu_read_memory_temp_sm: failed to schedule state 0x%02X", L_read_temp_state);
       }

    } // if async_request_is_idle
    else
    {
       INTR_TRAC_ERR("gpu_read_memory_temp_sm: NOT idle for state 0x%02X", L_read_temp_state);
    }

    return l_complete;
} // end gpu_read_memory_temp_sm()


// Function Specification
//
// Name:  gpu_sm_handle_idle_state
//
// Description: Called when GPU SM is idle to determine what state (if any) should
//               be done next
// End Function Specification
bool gpu_sm_handle_idle_state(bool i_read_temp_start_needed, bool i_mem_temp_needed)
{
    bool l_new_state = FALSE;  // return TRUE if there is a new state for GPU communication
    uint8_t l_gpu_id = 0;

    do
    {
        // Check for next state in order of priority

        // 1.  Need to set a power limit on a GPU?
        l_gpu_id = gpu_id_need_set_power_limit();
        if(l_gpu_id != 0xFF)
        {
            // Found a GPU that needs a power limit set
            G_current_gpu_id = l_gpu_id;
            G_gpu_state = GPU_STATE_SET_PWR_LIMIT;
            l_new_state = TRUE;
            break;
        }

        // 2.  check if Host needs lock
        if (!check_and_update_i2c_lock(GPU_I2C_ENGINE))
        {
            // We don't own the lock anymore
            // can't do anything until we get ownership back
            G_gpu_state = GPU_STATE_NO_LOCK;
            l_new_state = FALSE;
            break;
        }

        // 3.  Time to start new temperature reads?
        if(i_read_temp_start_needed)
        {
            // Start reading core temp from first present and functional GPU
            l_gpu_id = get_first_gpu();
            if(l_gpu_id != 0xFF)
            {
               // Read core temp for this GPU
               G_current_gpu_id = l_gpu_id;
               G_gpu_state = GPU_STATE_READ_TEMP;
               l_new_state = TRUE;
               break;
            }
            else  // no functional GPUs
            {
               // release I2C lock to the host for this engine and stop monitoring
               occ_i2c_lock_release(GPU_I2C_ENGINE);
               G_gpu_state = GPU_STATE_NO_LOCK;
               G_gpu_monitoring_allowed = FALSE;
               l_new_state = FALSE;  // No new state for GPU communication
               break;
            }
        }
        // 4.  Need to check if driver is loaded?
        l_gpu_id = gpu_id_need_driver_check();
        if(l_gpu_id != 0xFF)
        {
            // Found a GPU that needs driver checked
            G_current_gpu_id = l_gpu_id;
            G_gpu_state = GPU_STATE_CHECK_DRIVER_LOADED;
            l_new_state = TRUE;
            break;
        }

        // 5.  Need to read power limits?
        l_gpu_id = gpu_id_need_power_limits();
        if(l_gpu_id != 0xFF)
        {
            // Found a GPU that needs power limits read
            G_current_gpu_id = l_gpu_id;
            G_gpu_state = GPU_STATE_READ_PWR_LIMIT;
            l_new_state = TRUE;
            break;
        }

        // 6.  Need to read memory temps?
        if(i_mem_temp_needed)
        {
            // first check if there is a GPU that needs memory temp capability checked
            l_gpu_id = gpu_id_need_memory_temp_capability_check();
            if(l_gpu_id != 0xFF)
            {
                // Determine memory temp capability for this GPU
                G_current_gpu_id = l_gpu_id;
                G_gpu_state = GPU_STATE_CHECK_MEM_TEMP_CAPABLE;
                l_new_state = TRUE;
                break;
            }
            else
            {
               // memory temp capability checking is done start reading memory temp from capable GPUs
               l_gpu_id = get_first_mem_temp_capable_gpu();
               if(l_gpu_id != 0xFF)
               {
                  // Read memory temp for this GPU
                  G_current_gpu_id = l_gpu_id;
                  G_gpu_state = GPU_STATE_READ_MEMORY_TEMP;
                  l_new_state = TRUE;
                  break;
               }
            }
        }

        // Else nothing stay idle
    }while(0);

    return l_new_state;
}

// Function Specification
//
// Name:  task_gpu_sm
//
// Description: GPU State Machine - Called from tick table to manage GPUs
//
// Task Flags: RTL_FLAG_ACTIVE
//
// End Function Specification
void task_gpu_sm(struct task *i_self)
{
    bool l_start_next_state = FALSE;
    bool l_next_state = FALSE;
    uint8_t l_gpu_id = 0;

    static bool L_occ_owns_lock = FALSE;
    static bool L_gpu_first_run = TRUE;
    static uint16_t L_numCallsForTempRead = 0;  // # of calls since last temp read was started
    static bool L_read_temp_start_needed = FALSE;  // set to true when it is time to start reading GPU temps
    static bool L_mem_temp_needed = FALSE;  // set to true after rading GPU core temp to read GPU memory temp

    // GPU monitoring is enabled if GPUs are present and will be disabled if no GPUs
    // are functional or GPU I2C interface is broken
    if(G_gpu_monitoring_allowed)
    {
        // Read and update reset status for all GPUs
        update_gpu_reset_status();

        // Initialize the IPC commands if this is our first run
        if(L_gpu_first_run)
        {
            gpu_ipc_init();
            L_gpu_first_run = FALSE;
        }

        // Check if time to start reading temperatures
        // GPU temperatures (core and memory) are only used for fan control which happens every 1s
        // so there is no need to read the GPU temperatures any faster than every 1s
        if(!L_read_temp_start_needed)
        {
            L_numCallsForTempRead++;
            if(L_numCallsForTempRead >= GPU_TEMP_READ_1S)
            {
                 L_read_temp_start_needed = TRUE;
                 L_mem_temp_needed = FALSE;  // will get set to TRUE when core temp reads finish
            }
        }

        // make sure OCC owns the lock in order to send commands to the GPU
        if( (L_occ_owns_lock == FALSE) || (G_gpu_state == GPU_STATE_NO_LOCK) )
        {

            // Check if host gave up the I2C lock
            L_occ_owns_lock = check_and_update_i2c_lock(GPU_I2C_ENGINE);
            if (L_occ_owns_lock)
            {
                // We now own the lock start with reset and init state
                G_gpu_state = GPU_STATE_RESET;
            }
            else
            {
                // Don't own the lock can't do anything this time
                G_gpu_state = GPU_STATE_NO_LOCK;
            }
        }

        // Process GPE response for what was scheduled on the last call
        // and if that state finished schedule GPE job to start next state
        // This means that this state machine can be ran twice
        do
        {
           if(l_start_next_state)
           {
               // This is start of 2nd time processing state set next so we don't go thru here a 3rd time
               l_next_state = TRUE;
           }

           // make sure previous action didn't disable GPU monitoring
           if(!G_gpu_monitoring_allowed)
           {
              // release I2C lock to the host for this engine and stop monitoring
              occ_i2c_lock_release(GPU_I2C_ENGINE);
              L_occ_owns_lock = FALSE;
              G_gpu_state = GPU_STATE_NO_LOCK;
           }

           switch(G_gpu_state)
           {
               case GPU_STATE_RESET:
                   // Call the GPU Reset SM
                   if (gpu_reset_sm())
                   {
                      // Reset complete and GPUs are ready for communication
                      // Start first with reading core temp of first functional GPU
                      L_numCallsForTempRead = 0;  // to track start of next temp reading in 1s
                      L_read_temp_start_needed = FALSE;  // start is no longer needed
                      L_mem_temp_needed = FALSE;  // will get set to TRUE when core temp reads finish
                      l_gpu_id = get_first_gpu();
                      if(l_gpu_id != 0xFF)
                      {
                          // Read core temp for this GPU
                          G_current_gpu_id = l_gpu_id;
                          G_gpu_state = GPU_STATE_READ_TEMP;
                          l_start_next_state = TRUE;
                      }
                      else  // no functional GPUs
                      {
                         // release I2C lock to the host for this engine and stop monitoring
                         occ_i2c_lock_release(GPU_I2C_ENGINE);
                         L_occ_owns_lock = FALSE;
                         G_gpu_state = GPU_STATE_NO_LOCK;
                         G_gpu_monitoring_allowed = FALSE;
                         l_start_next_state = FALSE;
                      }
                   }

                   break;

               case GPU_STATE_READ_TEMP:
                   // Call the read core GPU temperature SM for the current GPU being processed
                   if(gpu_read_temp_sm())
                   {
                      // Temp read complete for this GPU, move to next GPU
                      // or memory temps if all GPU core temps were read
                      l_gpu_id = get_next_gpu();
                      if(l_gpu_id == 0xFF)
                      {
                         // Done reading core temps, now read GPU memory temps
                         // set state to IDLE first to check if a higher priority
                         // action is needed before starting to read memory temps
                         L_mem_temp_needed = TRUE;
                         G_gpu_state = GPU_STATE_IDLE;
                      }
                      else
                      {
                         // Stay in temperature read state and read temp for next GPU
                         G_current_gpu_id = l_gpu_id;
                      }

                      l_start_next_state = TRUE;
                   }

                   break;

               case GPU_STATE_READ_MEMORY_TEMP:
                   // Call the read GPU memory temperature SM for the current GPU being processed
                   if(gpu_read_memory_temp_sm())
                   {
                      // Temp read complete for this GPU, move to next GPU
                      // or idle if all GPU memory temps were read
                      l_gpu_id = get_next_mem_temp_capable_gpu();
                      if(l_gpu_id == 0xFF)
                      {
                         // Done reading memory temps
                         G_gpu_state = GPU_STATE_IDLE;
                      }
                      else
                      {
                         // Stay in memory read state and read memory temp for next GPU
                         G_current_gpu_id = l_gpu_id;
                      }

                      l_start_next_state = TRUE;
                   }

                   break;

               case GPU_STATE_CHECK_MEM_TEMP_CAPABLE:
                   // Check if current GPU has memory temperature capability
                   if(gpu_read_mem_temp_capability_sm())
                   {
                      // Capability check complete for this GPU, go to IDLE state
                      // to let IDLE SM decide what to do next
                      G_gpu_state = GPU_STATE_IDLE;
                      l_start_next_state = TRUE;
                   }
                   break;

               case GPU_STATE_CHECK_DRIVER_LOADED:
                   // Check if driver is loaded for current GPU
                   if(gpu_check_driver_loaded_sm())
                   {
                      // Driver check complete for this GPU,
                      // NOTE: Do not set status.checkDriverLoaded to false here, if driver is
                      // not loaded we need to keep checking for driver to be loaded this is decided
                      // inside gpu_check_driver_loaded_sm()

                      // go to IDLE state to let IDLE SM decide what to do next
                      G_gpu_state = GPU_STATE_IDLE;
                      l_start_next_state = TRUE;
                   }
                   break;

               case GPU_STATE_READ_PWR_LIMIT:
                   // Read power limits for current GPU
                   if(gpu_read_pwr_limit_sm())
                   {
                      // Read power limits complete for this GPU, go to IDLE state
                      // to let IDLE SM decide what to do next
                      G_gpu_state = GPU_STATE_IDLE;
                      l_start_next_state = TRUE;
                   }
                   break;

               case GPU_STATE_SET_PWR_LIMIT:
                   // Set power limit on current GPU
                   if(gpu_set_pwr_limit_sm())
                   {
                      // Set power limit complete for this GPU, go to IDLE state
                      // to let IDLE SM decide what to do next
                      G_gpu_state = GPU_STATE_IDLE;
                      l_start_next_state = TRUE;
                   }
                   break;

               case GPU_STATE_NO_LOCK:
                   // Host owns the I2C engine.  Need to wait until we get the lock
                   l_start_next_state = FALSE;
                   break;

               default:
                   // Nothing happened on last call
                   G_gpu_state = GPU_STATE_IDLE;
                   break;
           } // switch G_gpu_state

           // check if the previous action requires a reset
           if(G_gpu_i2c_reset_required)
           {
               G_gpu_i2c_reset_required = FALSE;

               // before starting the reset check if OPAL needs the lock
               L_occ_owns_lock = check_and_update_i2c_lock(GPU_I2C_ENGINE);
               if (L_occ_owns_lock)
               {
                   // We still own the lock start the reset
                   G_gpu_state = GPU_STATE_RESET;
                   l_start_next_state = TRUE;
               }
               else
               {
                   // We don't own the lock, the reset will happen when we get the lock back
                   G_gpu_state = GPU_STATE_NO_LOCK;
                   l_start_next_state = FALSE;
               }
               break;
           }
           else if(G_gpu_state == GPU_STATE_IDLE)
           {
               // time to decide what to do next
               l_start_next_state = gpu_sm_handle_idle_state(L_read_temp_start_needed, L_mem_temp_needed);
               if(l_start_next_state)
               {
                  if(G_gpu_state == GPU_STATE_READ_TEMP)
                  {
                     // new state to read core temp reset temperature reading timer
                     L_numCallsForTempRead = 0;
                     L_read_temp_start_needed = FALSE; // start no longer needed
                     L_mem_temp_needed = FALSE;  // will get set to TRUE when core temp reads finish
                  }
                  else if(G_gpu_state == GPU_STATE_READ_MEMORY_TEMP)
                  {
                     // new state to start reading memory temps, reset mem temp needed
                     L_mem_temp_needed = FALSE;
                  }
               }
           }
        }while((l_start_next_state) && (!l_next_state));
    } // GPU monitoring enabled
} // end task_gpu_sm()