From 4b775f5f2da291d98a0ecdcfd40fb24c64052956 Mon Sep 17 00:00:00 2001 From: William Bryan Date: Fri, 1 Sep 2017 14:56:46 -0500 Subject: Fix I2C locking issue and sensor IDs Change-Id: Ib6255d96ca18b45e69184bc5126e53a09d2f26fe Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45577 Reviewed-by: Martha Broyles Tested-by: FSP CI Jenkins Reviewed-by: Christopher J. Cain Reviewed-by: William A. Bryan --- src/occ_405/cmdh/cmdh_fsp_cmds.c | 4 +-- src/occ_405/gpu/gpu.c | 47 ++++++++++++++++++++++++++++----- src/occ_405/sensor/sensor_main_memory.c | 30 ++++++++++++--------- 3 files changed, 61 insertions(+), 20 deletions(-) diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 6cdf79e..4606f6d 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -364,7 +364,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid; else - l_tempSensorList[l_sensorHeader.count].id = 0x47505500 | k; // temp + l_tempSensorList[l_sensorHeader.count].id = 0xC6 + (9 * G_pbax_id.chip_id) + (k*3); // temp l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU; l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0 + k]->sample) & 0xFF; l_sensorHeader.count++; @@ -373,7 +373,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) if(G_amec_sensor_list[TEMPGPU0 + k]->ipmi_sid) // temp l_tempSensorList[l_sensorHeader.count].id = G_amec_sensor_list[TEMPGPU0MEM + k]->ipmi_sid; else - l_tempSensorList[l_sensorHeader.count].id = 0x47505500 | k; // temp + l_tempSensorList[l_sensorHeader.count].id = 0xC7 + (9 * G_pbax_id.chip_id) + (k*3); // temp l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_GPU_MEM; l_tempSensorList[l_sensorHeader.count].value = (G_amec_sensor_list[TEMPGPU0MEM + k]->sample) & 0xFF; l_sensorHeader.count++; diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 052039e..8666e12 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -357,6 +357,20 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg) (true == g_amec->gpu[gpu_id].status.readOnce)) { G_gpu_state = GPU_STATE_IDLE; + + // Something has gone wrong and it may be that OPAL has put + // the GPU into reset. For now, if this happens we will just + // continue polling the GPU until it comes back. + g_amec->gpu[gpu_id].status.readOnce = false; + g_amec->gpu[gpu_id].status.checkDriverLoaded = true; + g_amec->gpu[gpu_id].status.driverLoaded = false; + g_amec->gpu[gpu_id].status.checkMemTempSupport = true; + g_amec->gpu[gpu_id].status.memTempSupported = false; + g_amec->gpu[gpu_id].status.memErrorCount = 0; + g_amec->gpu[gpu_id].status.errorCount = 0; + +// This code can be used if an interlock with OPAL is ever introduced +#if 0 // Disable this GPU, collect FFDC and log error g_amec->gpu[gpu_id].status.disabled = true; @@ -395,6 +409,7 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg) } commitErrl(&l_err); +#endif } } while(0); @@ -626,7 +641,7 @@ bool gpu_reset_sm() { L_consec_reset_failure_count++; L_state_retry_count = 0; - L_reset_state = GPU_RESET_STATE_RESET_MASTER; + L_reset_state = GPU_RESET_STATE_NEW; } } // else reset attempt failed } // else GPE supports GPU @@ -718,7 +733,7 @@ bool gpu_read_temp_sm() uint16_t l_temp = 0; static bool L_scheduled = FALSE; // indicates if a GPU GPE request was scheduled static uint8_t L_read_failure_count = 0; // Used for I2C errors - + static bool L_trace_success = FALSE; static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp if (async_request_is_idle(&G_gpu_op_request.request)) @@ -766,8 +781,15 @@ bool gpu_read_temp_sm() (0 != G_gpu_op_req_args.data) ) // TODO: check for valid temp? { g_amec->gpu[G_current_gpu_id].status.readOnce = true; - TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d", - G_current_gpu_id, CURRENT_TICK); + + // Only trace this once + if(FALSE == L_trace_success) + { + TRAC_INFO("First successful attempt to read temp from GPU%d was on tick %d", + G_current_gpu_id, CURRENT_TICK); + L_trace_success = TRUE; + } + // comm is now established update for capability checking to take place g_amec->gpu[G_current_gpu_id].status.checkMemTempSupport = TRUE; g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = TRUE; @@ -1546,8 +1568,21 @@ void task_gpu_sm(struct task *i_self) if(G_gpu_i2c_reset_required) { G_gpu_i2c_reset_required = FALSE; - G_gpu_state = GPU_STATE_RESET; - l_start_next_state = TRUE; + + // before starting the reset check if OPAL needs the lock + L_occ_owns_lock = check_and_update_i2c_lock(GPU_I2C_ENGINE); + if (L_occ_owns_lock) + { + // We still own the lock start the reset + G_gpu_state = GPU_STATE_RESET; + l_start_next_state = TRUE; + } + else + { + // We don't own the lock, the reset will happen when we get the lock back + G_gpu_state = GPU_STATE_NO_LOCK; + l_start_next_state = FALSE; + } break; } else if(G_gpu_state == GPU_STATE_IDLE) diff --git a/src/occ_405/sensor/sensor_main_memory.c b/src/occ_405/sensor/sensor_main_memory.c index 7e4cb51..70fb8a5 100644 --- a/src/occ_405/sensor/sensor_main_memory.c +++ b/src/occ_405/sensor/sensor_main_memory.c @@ -30,9 +30,9 @@ * the OCC sensors to main memory. See the header file for more information. */ -//****************************************************************************** +//******************************************************************************/ // Includes -//****************************************************************************** +//******************************************************************************/ #include // Primary header #include // For uint*_t #include // For memset(), memcpy() @@ -48,9 +48,9 @@ #include // For G_apss_ch_to_function -//****************************************************************************** +//******************************************************************************/ // Main Memory Sensors - Private Defines/Structs/Globals -//****************************************************************************** +//******************************************************************************/ /** * Main memory sensor struct. Represents one OCC sensor that should be copied @@ -173,16 +173,22 @@ main_mem_sensor_t G_main_mem_sensors[] = MAIN_MEM_SENSOR (TEMPNEST, false, false), MAIN_MEM_CORE_SENSORS (TEMPPROCTHRMC, false, false), MAIN_MEM_DIMM_SENSORS (TEMPDIMM, false, false), - + MAIN_MEM_SENSOR (TEMPGPU0, false, false), + MAIN_MEM_SENSOR (TEMPGPU1, false, false), + MAIN_MEM_SENSOR (TEMPGPU2, false, false), + MAIN_MEM_SENSOR (TEMPGPU0MEM, false, false), + MAIN_MEM_SENSOR (TEMPGPU1MEM, false, false), + MAIN_MEM_SENSOR (TEMPGPU2MEM, false, false), + // AMEC_SENSOR_TYPE_UTIL: gsid smf_mode master_only MAIN_MEM_CORE_SENSORS (UTILC, false, false), MAIN_MEM_SENSOR (UTIL, false, false), MAIN_MEM_CORE_SENSORS (NUTILC, false, false), - + // AMEC_SENSOR_TYPE_FREQ: gsid smf_mode master_only MAIN_MEM_SENSOR (FREQA, true, false), MAIN_MEM_CORE_SENSORS (FREQAC, true, false), - + // AMEC_SENSOR_TYPE_POWER: gsid smf_mode master_only MAIN_MEM_SENSOR (PWRSYS, true, true ), MAIN_MEM_SENSOR (PWRGPU, true, false), @@ -191,7 +197,7 @@ main_mem_sensor_t G_main_mem_sensors[] = MAIN_MEM_SENSOR (PWRVDD, true, false), MAIN_MEM_SENSOR (PWRVDN, true, false), MAIN_MEM_SENSOR (PWRMEM, true, false), - + // AMEC_SENSOR_TYPE_PERF: gsid smf_mode master_only MAIN_MEM_SENSOR (IPS, false, false), MAIN_MEM_CORE_SENSORS (STOPDEEPACTC, true, false), @@ -288,7 +294,7 @@ bool G_mm_sensors_bce_req_scheduled = false; * ^ * | * field to modify later - * + * * AA is the two byte value of field A, BBBB is the four byte value of field * B, etc. * @@ -774,7 +780,7 @@ void mm_sensors_save_last_write(uint32_t i_main_mem_addr, size_t i_byte_count) // Copy last 128 bytes into last write buffer memcpy(G_mm_sensors_last_write_buf, &G_mm_sensors_bce_buffer[l_offset], MM_SENSORS_MIN_WRITE_SIZE); - + // Save main memory address where last 128 bytes were written G_mm_sensors_last_write_buf_addr = i_main_mem_addr + l_offset; } @@ -984,7 +990,7 @@ void mm_sensors_write_sensor_names(void) // If we previously wrote bytes to the same 128-byte aligned address, copy // those bytes from the last write buffer. This will retain the value of - // the bytes that precede the current names entry. + // the bytes that precede the current names entry. if (G_mm_sensors_last_write_buf_addr == l_write_addr) { memcpy(G_mm_sensors_bce_buffer, G_mm_sensors_last_write_buf, @@ -1181,7 +1187,7 @@ void mm_sensors_init_readings_counter(const main_mem_sensor_t * i_mm_sensor, /** * Stores sensor readings in the BCE buffer at the specified offset for the * specified sensor. - * + * * @param i_mm_sensor Main memory sensor whose readings to store * @param i_bce_buf_offset Offset in BCE buffer where readings should be stored * @param io_readings_offset Offset to current sensor's readings within Sensor -- cgit v1.2.1