summary | refs | log | tree | commit | diff | stats
path: root/src/occ_405/amec
diff options
context:
space:
mode:
author: mbroyles <mbroyles@us.ibm.com> 2017-08-06 19:08:00 -0500
committer: William A. Bryan <wilbryan@us.ibm.com> 2017-08-14 15:18:26 -0400
commit: 8a335d83ed938f05f95ca1cfdbbb5292053ed51f (patch)
tree: bd2b38c6df596f3d3bf9f70f8a54a8a205e4e2e1 /src/occ_405/amec
parent: 71b5f68da8b725f9c5251261b41fd824e652e491 (diff)
download: talos-occ-8a335d83ed938f05f95ca1cfdbbb5292053ed51f.tar.gz
download: talos-occ-8a335d83ed938f05f95ca1cfdbbb5292053ed51f.zip
Initial 405 GPU support
Change-Id: I6e957ca1aa643d257274e99957df5b15ac8c889b
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/44254
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Diffstat (limited to 'src/occ_405/amec')
-rwxr-xr-x src/occ_405/amec/amec_sensors_power.c 6
-rwxr-xr-x src/occ_405/amec/amec_sys.h 55
2 files changed, 56 insertions, 5 deletions
diff --git a/src/occ_405/amec/amec_sensors_power.c b/src/occ_405/amec/amec_sensors_power.c
index 2233738..3820330 100755
--- a/src/occ_405/amec/amec_sensors_power.c
+++ b/src/occ_405/amec/amec_sensors_power.c
@@ -69,6 +69,7 @@ uint32_t G_curr_num_gpus_sys = 0;
#define ADC_CONVERTED_VALUE(i_chan) \
((i_chan < MAX_APSS_ADC_CHANNELS) ? G_lastValidAdcValue[i_chan] : 0)
+extern bool G_gpu_monitoring_allowed;
extern uint8_t G_occ_interrupt_type;
extern bool G_vrm_thermal_monitoring;
extern PWR_READING_TYPE G_pwr_reading_type;
@@ -821,6 +822,11 @@ void amec_update_gpu_configuration(void)
{
G_gpu_config_done = TRUE;
G_first_proc_gpu_config = l_valid_bitmask_proc;
+ if(G_first_proc_gpu_config)
+ {
+ // GPUs are present, so enable monitoring
+ G_gpu_monitoring_allowed = TRUE;
+ }
G_first_sys_gpu_config = l_valid_bitmask_sys;
G_first_num_gpus_sys = l_num_gpus_sys;
TRAC_IMP("GPU presence detection completed. GPU configuration for this OCC: 0x%08X, total[%d]",
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index 74b2812..a45fb42 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -430,6 +430,53 @@ typedef struct
} amec_quad_t;
//-------------------------------------------------------------
+// GPU Structures
+//-------------------------------------------------------------
+
+typedef struct { // Per-GPU monitoring status flags and consecutive-failure counters
+ bool disabled; // GPU has been marked failed and is no longer monitored
+ bool readOnce; // Communication has been established with the GPU
+ bool overtempError; // Core over-temperature (OT) error has been logged against this GPU
+ bool memOvertempError; // Memory over-temperature (OT) error has been logged against this GPU
+ bool checkDriverLoaded; // Indicates whether a check for the GPU driver being loaded is needed
+ bool driverLoaded; // Indicates whether the GPU driver is loaded
+ bool checkMemTempSupport; // Indicates whether a check for memory temperature monitoring support is needed
+ bool memTempSupported; // Indicates whether memory temperature monitoring is supported
+ uint8_t memErrorCount; // Count of consecutive GPU memory temperature read failures
+ uint8_t errorCount; // Count of consecutive GPU core temperature read failures
+} gpuStatus_t;
+
+typedef struct { // Per-GPU power cap (pcap) information; all power values are in milliwatts (mW)
+ bool check_pwr_limit; // Indicates whether the power limits need to be read from the GPU
+ bool pwr_limits_read; // Indicates whether the power limits were read, i.e. min/max are available
+ uint32_t gpu_min_pcap_mw; // Minimum GPU power limit in mW, as read from the GPU
+ uint32_t gpu_max_pcap_mw; // Maximum GPU power limit in mW, as read from the GPU
+ uint32_t gpu_desired_pcap_mw; // Power cap in mW that AMEC has determined should be set
+ uint32_t gpu_requested_pcap_mw; // Requested power cap in mW that was sent to the GPU
+ uint32_t gpu_actual_pcap_mw; // Actual power cap in mW, as read back from the GPU
+} gpuPcap_t;
+
+
+typedef struct // Per-GPU AMEC data: temperature sensors, general status and power cap information
+{
+ //-----------------------------------
+ // Sensors
+ //-----------------------------------
+ sensor_t tempgpu; // GPU core temperature sensor
+ sensor_t tempgpumem; // GPU memory (HBM) temperature sensor
+
+ //-----------------------------------
+ // Data
+ //-----------------------------------
+ // General status of the GPU (monitoring flags and error counters)
+ gpuStatus_t status;
+
+ // GPU power cap information (limits and desired/requested/actual caps)
+ gpuPcap_t pcap;
+
+} amec_gpu_t;
+
+//-------------------------------------------------------------
// Proc Structure
//-------------------------------------------------------------
typedef struct
@@ -468,11 +515,6 @@ typedef struct
// Nimbus DIMM Sensors
sensor_t tempdimm[NUM_DIMM_PORTS*NUM_DIMMS_PER_I2CPORT];
- // GPU Sensors
- sensor_t tempgpu0;
- sensor_t tempgpu1;
- sensor_t tempgpu2;
-
sensor_t curvdn;
sensor_t pwrvdd;
sensor_t pwrvdn;
@@ -607,6 +649,9 @@ typedef struct
// in the hopes of perhaps reusing some code from previous projects.
amec_proc_t proc[NUM_PROC_CHIPS_PER_OCC];
+ // GPU Data
+ amec_gpu_t gpu[MAX_NUM_GPU_PER_DOMAIN];
+
// OCC Firmware Data
amec_fw_t fw;
OpenPOWER on IntegriCloud