summary refs log tree commit diff stats
diff options
context:
space:
mode:
author William Bryan <wilbryan@us.ibm.com> 2017-09-01 14:56:46 -0500
committer William A. Bryan <wilbryan@us.ibm.com> 2017-09-05 16:08:22 -0400
commit 4b775f5f2da291d98a0ecdcfd40fb24c64052956 (patch)
tree 713040933ff4b0a9b12d10eaf5574407adcf4ee0
parent f6b9c4c2f61472fab97c75d719ae7224f0e9e416 (diff)
download talos-occ-4b775f5f2da291d98a0ecdcfd40fb24c64052956.tar.gz
talos-occ-4b775f5f2da291d98a0ecdcfd40fb24c64052956.zip
Fix I2C locking issue and sensor IDs
Change-Id: Ib6255d96ca18b45e69184bc5126e53a09d2f26fe
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45577
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
-rwxr-xr-x src/occ_405/cmdh/cmdh_fsp_cmds.c 4
-rwxr-xr-x src/occ_405/gpu/gpu.c 47
-rw-r--r-- src/occ_405/sensor/sensor_main_memory.c 30
3 files changed, 61 insertions, 20 deletions
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index 6cdf79e..4606f6d 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -364,7 +364,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp
l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid;
else
- l_tempSensorList[l_sensorHeader.count].id = 0x47505500 | k; // temp
+ l_tempSensorList[l_sensorHeader.count].id = 0xC6 + (9 * G_pbax_id.chip_id) + (k*3); // temp
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU;
l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF;
l_sensorHeader.count++;
@@ -373,7 +373,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp
l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid;
else
- l_tempSensorList[l_sensorHeader.count].id = 0x47505500 | k; // temp
+ l_tempSensorList[l_sensorHeader.count].id = 0xC7 + (9 * G_pbax_id.chip_id) + (k*3); // temp
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU_MEM;
l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF;
l_sensorHeader.count++;
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 052039e..8666e12 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -357,6 +357,20 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg)
(true == g_amec->gpu[gpu_id].status.readOnce))
{
G_gpu_state = GPU_STATE_IDLE;
+
+ // Something has gone wrong and it may be that OPAL has put
+ // the GPU into reset. For now, if this happens we will just
+ // continue polling the GPU until it comes back.
+ g_amec->gpu[gpu_id].status.readOnce = false;
+ g_amec->gpu[gpu_id].status.checkDriverLoaded = true;
+ g_amec->gpu[gpu_id].status.driverLoaded = false;
+ g_amec->gpu[gpu_id].status.checkMemTempSupport = true;
+ g_amec->gpu[gpu_id].status.memTempSupported = false;
+ g_amec->gpu[gpu_id].status.memErrorCount = 0;
+ g_amec->gpu[gpu_id].status.errorCount = 0;
+
+// This code can be used if an interlock with OPAL is ever introduced
+#if 0
// Disable this GPU, collect FFDC and log error
g_amec->gpu[gpu_id].status.disabled = true;
@@ -395,6 +409,7 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg)
}
commitErrl(&l_err);
+#endif
}
} while(0);
@@ -626,7 +641,7 @@ bool gpu_reset_sm()
{
L_consec_reset_failure_count++;
L_state_retry_count = 0;
- L_reset_state = GPU_RESET_STATE_RESET_MASTER;
+ L_reset_state = GPU_RESET_STATE_NEW;
}
} // else reset attempt failed
} // else GPE supports GPU
@@ -718,7 +733,7 @@ bool gpu_read_temp_sm()
uint16_t l_temp = 0;
static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled
static uint8_t L_read_failure_count = 0; // Used for I2C errors
-
+ static bool L_trace_success = FALSE;
static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp
if (async_request_is_idle(&G_gpu_op_request.request))
@@ -766,8 +781,15 @@ bool gpu_read_temp_sm()
(0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp?
{
g_amec->gpu[G_current_gpu_id].status.readOnce = true;
- TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d",
- G_current_gpu_id, CURRENT_TICK);
+
+ // Only trace this once
+ if(FALSE == L_trace_success)
+ {
+ TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d",
+ G_current_gpu_id, CURRENT_TICK);
+ L_trace_success = TRUE;
+ }
+
// comm is now established update for capability checking to take place
g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE;
g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE;
@@ -1546,8 +1568,21 @@ void task_gpu_sm(struct task *i_self)
if(G_gpu_i2c_reset_required)
{
G_gpu_i2c_reset_required = FALSE;
- G_gpu_state = GPU_STATE_RESET;
- l_start_next_state = TRUE;
+
+ // before starting the reset check if OPAL needs the lock
+ L_occ_owns_lock = check_and_update_i2c_lock(GPU_I2C_ENGINE);
+ if (L_occ_owns_lock)
+ {
+ // We still own the lock start the reset
+ G_gpu_state = GPU_STATE_RESET;
+ l_start_next_state = TRUE;
+ }
+ else
+ {
+ // We don't own the lock, the reset will happen when we get the lock back
+ G_gpu_state = GPU_STATE_NO_LOCK;
+ l_start_next_state = FALSE;
+ }
break;
}
else if(G_gpu_state == GPU_STATE_IDLE)
diff --git a/src/occ_405/sensor/sensor_main_memory.c b/src/occ_405/sensor/sensor_main_memory.c
index 7e4cb51..70fb8a5 100644
--- a/src/occ_405/sensor/sensor_main_memory.c
+++ b/src/occ_405/sensor/sensor_main_memory.c
@@ -30,9 +30,9 @@
* the OCC sensors to main memory. See the header file for more information.
*/
-//******************************************************************************
+//******************************************************************************/
// Includes
-//******************************************************************************
+//******************************************************************************/
#include <sensor_main_memory.h> // Primary header
#include <stdint.h> // For uint*_t
#include <string.h> // For memset(), memcpy()
@@ -48,9 +48,9 @@
#include <cmdh_fsp_cmds.h> // For G_apss_ch_to_function
-//******************************************************************************
+//******************************************************************************/
// Main Memory Sensors - Private Defines/Structs/Globals
-//******************************************************************************
+//******************************************************************************/
/**
* Main memory sensor struct. Represents one OCC sensor that should be copied
@@ -173,16 +173,22 @@ main_mem_sensor_t G_main_mem_sensors[] =
MAIN_MEM_SENSOR (TEMPNEST, false, false),
MAIN_MEM_CORE_SENSORS (TEMPPROCTHRMC, false, false),
MAIN_MEM_DIMM_SENSORS (TEMPDIMM, false, false),
-
+ MAIN_MEM_SENSOR (TEMPGPU0, false, false),
+ MAIN_MEM_SENSOR (TEMPGPU1, false, false),
+ MAIN_MEM_SENSOR (TEMPGPU2, false, false),
+ MAIN_MEM_SENSOR (TEMPGPU0MEM, false, false),
+ MAIN_MEM_SENSOR (TEMPGPU1MEM, false, false),
+ MAIN_MEM_SENSOR (TEMPGPU2MEM, false, false),
+
// AMEC_SENSOR_TYPE_UTIL: gsid smf_mode master_only
MAIN_MEM_CORE_SENSORS (UTILC, false, false),
MAIN_MEM_SENSOR (UTIL, false, false),
MAIN_MEM_CORE_SENSORS (NUTILC, false, false),
-
+
// AMEC_SENSOR_TYPE_FREQ: gsid smf_mode master_only
MAIN_MEM_SENSOR (FREQA, true, false),
MAIN_MEM_CORE_SENSORS (FREQAC, true, false),
-
+
// AMEC_SENSOR_TYPE_POWER: gsid smf_mode master_only
MAIN_MEM_SENSOR (PWRSYS, true, true ),
MAIN_MEM_SENSOR (PWRGPU, true, false),
@@ -191,7 +197,7 @@ main_mem_sensor_t G_main_mem_sensors[] =
MAIN_MEM_SENSOR (PWRVDD, true, false),
MAIN_MEM_SENSOR (PWRVDN, true, false),
MAIN_MEM_SENSOR (PWRMEM, true, false),
-
+
// AMEC_SENSOR_TYPE_PERF: gsid smf_mode master_only
MAIN_MEM_SENSOR (IPS, false, false),
MAIN_MEM_CORE_SENSORS (STOPDEEPACTC, true, false),
@@ -288,7 +294,7 @@ bool G_mm_sensors_bce_req_scheduled = false;
* ^
* |
* field to modify later
- *
+ *
* AA is the two byte value of field A, BBBB is the four byte value of field
* B, etc.
*
@@ -774,7 +780,7 @@ void mm_sensors_save_last_write(uint32_t i_main_mem_addr, size_t i_byte_count)
// Copy last 128 bytes into last write buffer
memcpy(G_mm_sensors_last_write_buf, &G_mm_sensors_bce_buffer[l_offset],
MM_SENSORS_MIN_WRITE_SIZE);
-
+
// Save main memory address where last 128 bytes were written
G_mm_sensors_last_write_buf_addr = i_main_mem_addr + l_offset;
}
@@ -984,7 +990,7 @@ void mm_sensors_write_sensor_names(void)
// If we previously wrote bytes to the same 128-byte aligned address, copy
// those bytes from the last write buffer. This will retain the value of
- // the bytes that precede the current names entry.
+ // the bytes that precede the current names entry.
if (G_mm_sensors_last_write_buf_addr == l_write_addr)
{
memcpy(G_mm_sensors_bce_buffer, G_mm_sensors_last_write_buf,
@@ -1181,7 +1187,7 @@ void mm_sensors_init_readings_counter(const main_mem_sensor_t * i_mm_sensor,
/**
* Stores sensor readings in the BCE buffer at the specified offset for the
* specified sensor.
- *
+ *
* @param i_mm_sensor Main memory sensor whose readings to store
* @param i_bce_buf_offset Offset in BCE buffer where readings should be stored
* @param io_readings_offset Offset to current sensor's readings within Sensor
OpenPOWER on IntegriCloud