diff options
author | mbroyles <mbroyles@us.ibm.com> | 2017-08-06 19:08:00 -0500 |
---|---|---|
committer | William A. Bryan <wilbryan@us.ibm.com> | 2017-08-14 15:18:26 -0400 |
commit | 8a335d83ed938f05f95ca1cfdbbb5292053ed51f (patch) | |
tree | bd2b38c6df596f3d3bf9f70f8a54a8a205e4e2e1 /src/occ_405/amec | |
parent | 71b5f68da8b725f9c5251261b41fd824e652e491 (diff) | |
download | talos-occ-8a335d83ed938f05f95ca1cfdbbb5292053ed51f.tar.gz talos-occ-8a335d83ed938f05f95ca1cfdbbb5292053ed51f.zip |
Initial 405 GPU support
Change-Id: I6e957ca1aa643d257274e99957df5b15ac8c889b
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44254
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Diffstat (limited to 'src/occ_405/amec')
-rwxr-xr-x | src/occ_405/amec/amec_sensors_power.c | 6 | ||||
-rwxr-xr-x | src/occ_405/amec/amec_sys.h | 55 |
2 files changed, 56 insertions, 5 deletions
diff --git a/src/occ_405/amec/amec_sensors_power.c b/src/occ_405/amec/amec_sensors_power.c
index 2233738..3820330 100755
--- a/src/occ_405/amec/amec_sensors_power.c
+++ b/src/occ_405/amec/amec_sensors_power.c
@@ -69,6 +69,7 @@ uint32_t G_curr_num_gpus_sys = 0;
 #define ADC_CONVERTED_VALUE(i_chan) \
     ((i_chan < MAX_APSS_ADC_CHANNELS) ? G_lastValidAdcValue[i_chan] : 0)
 
+extern bool G_gpu_monitoring_allowed;
 extern uint8_t G_occ_interrupt_type;
 extern bool G_vrm_thermal_monitoring;
 extern PWR_READING_TYPE G_pwr_reading_type;
@@ -821,6 +822,11 @@ void amec_update_gpu_configuration(void)
     {
         G_gpu_config_done = TRUE;
         G_first_proc_gpu_config = l_valid_bitmask_proc;
+        if(G_first_proc_gpu_config)
+        {
+            // GPUs are present enable monitoring
+            G_gpu_monitoring_allowed = TRUE;
+        }
         G_first_sys_gpu_config = l_valid_bitmask_sys;
         G_first_num_gpus_sys = l_num_gpus_sys;
         TRAC_IMP("GPU presence detection completed. GPU configuration for this OCC: 0x%08X, total[%d]",
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index 74b2812..a45fb42 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -430,6 +430,53 @@ typedef struct
 } amec_quad_t;
 
 //-------------------------------------------------------------
+// GPU Structures
+//-------------------------------------------------------------
+
+typedef struct {
+    bool disabled; // GPU has been marked failed and no longer monitored
+    bool readOnce; // Comm has been established with GPU
+    bool overtempError; // Core OT error has been logged against GPU
+    bool memOvertempError; // Memory OT error has been logged against GPU
+    bool checkDriverLoaded; // Indicates if need to check if driver is loaded
+    bool driverLoaded; // Indicates if GPU driver is loaded
+    bool checkMemTempSupport; // Indicates if need to check if mem monitoring is supported
+    bool memTempSupported; // Indicates if memory temperature monitoring is supported
+    uint8_t memErrorCount; // count of consecutive GPU mem temp read failures
+    uint8_t errorCount; // count of consecutive GPU core temp read failures
+} gpuStatus_t;
+
+typedef struct {
+    bool check_pwr_limit; // Indicates if need to read power limits from GPU
+    bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max
+    uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU
+    uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU
+    uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set
+    uint32_t gpu_requested_pcap_mw; // Requested power cap in mW sent to GPU
+    uint32_t gpu_actual_pcap_mw; // Actual power cap in mW read back from the GPU
+} gpuPcap_t;
+
+
+typedef struct
+{
+    //-----------------------------------
+    // Sensors
+    //-----------------------------------
+    sensor_t tempgpu; // GPU core temperature
+    sensor_t tempgpumem; // GPU HBM temperature
+
+    //-----------------------------------
+    // Data
+    //-----------------------------------
+    // General Status of GPU
+    gpuStatus_t status;
+
+    // GPU Power Cap Information
+    gpuPcap_t pcap;
+
+} amec_gpu_t;
+
+//-------------------------------------------------------------
 // Proc Structure
 //-------------------------------------------------------------
 typedef struct
@@ -468,11 +515,6 @@ typedef struct
     // Nimbus DIMM Sensors
     sensor_t tempdimm[NUM_DIMM_PORTS*NUM_DIMMS_PER_I2CPORT];
 
-    // GPU Sensors
-    sensor_t tempgpu0;
-    sensor_t tempgpu1;
-    sensor_t tempgpu2;
-
     sensor_t curvdn;
     sensor_t pwrvdd;
     sensor_t pwrvdn;
@@ -607,6 +649,9 @@ typedef struct
     // in the hopes of perhaps reusing some code from previous projects.
     amec_proc_t proc[NUM_PROC_CHIPS_PER_OCC];
 
+    // GPU Data
+    amec_gpu_t gpu[MAX_NUM_GPU_PER_DOMAIN];
+
     // OCC Firmware Data
     amec_fw_t fw;
 
|