From 051cc0a10cb61b410252098d13fb7dd8727a8e52 Mon Sep 17 00:00:00 2001 From: mbroyles Date: Fri, 6 Oct 2017 11:19:10 -0500 Subject: VRM Vdd Interfaces Change-Id: I8e2b597773c940ebc79972974a95fb323ea26660 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48065 Tested-by: FSP CI Jenkins Reviewed-by: William A. Bryan Reviewed-by: Andres A. Lugo-Reyes Reviewed-by: Martha Broyles --- src/occ_405/amec/amec_data.c | 31 +++++- src/occ_405/amec/amec_health.c | 197 +++++++++++++++++++++++++++++++++- src/occ_405/amec/amec_health.h | 2 + src/occ_405/amec/amec_service_codes.h | 40 +++---- src/occ_405/amec/amec_sys.h | 3 + 5 files changed, 249 insertions(+), 24 deletions(-) (limited to 'src/occ_405/amec') diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c index 4c553d6..9294c1d 100755 --- a/src/occ_405/amec/amec_data.c +++ b/src/occ_405/amec/amec_data.c @@ -261,11 +261,40 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for DIMM", l_dvfs_temp); - g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM].error_count; + g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM_OT_STATUS].error_count; TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRHOT", g_amec->vrhotproc.setpoint); + // Store the VRM Vdd thermal data + if(i_mode == OCC_MODE_NOMINAL) + { + l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].dvfs; + l_error = l_frudata[DATA_FRU_VRM_VDD].error; + } + else + { + l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].pm_dvfs; + if(i_mode == OCC_MODE_TURBO) + { + //Need to log an error if we dvfs in static turbo mode (for mfg) + l_error = l_dvfs_temp; + } + else + { + l_error = l_frudata[DATA_FRU_VRM_VDD].pm_error; + } + } + // Store the DVFS thermal setpoint in 0.1 degrees C + g_amec->thermalvdd.setpoint = l_dvfs_temp * 10; + // Store the error temperature for OT detection + g_amec->thermalvdd.ot_error = l_error; + // Store the temperature timeout value + 
g_amec->thermalvdd.temp_timeout = l_frudata[DATA_FRU_VRM_VDD].max_read_timeout; + + TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRM Vdd", + l_dvfs_temp); + } while(0); return l_err; diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 1d026d2..12c348d 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -67,6 +67,9 @@ uint8_t G_cent_temp_expired_bitmap = 0; // Array to store the update tag of each core's temperature sensor uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0}; +// Has reading the VRM Vdd temperature timed out? +bool G_vrm_vdd_temp_expired = false; + //*************************************************************************/ // Function Declarations //*************************************************************************/ @@ -398,13 +401,13 @@ void amec_health_check_dimm_timeout() * @reasoncode FRU_TEMP_TIMEOUT * @userdata1 timeout value in seconds * @userdata2 0 - * @userdata4 OCC_NO_EXTENDED_RC + * @userdata4 ERC_AMEC_DIMM_TEMP_TIMEOUT * @devdesc Failed to read a memory DIMM temperature * */ l_err = createErrl(AMEC_HEALTH_CHECK_DIMM_TIMEOUT, //modId FRU_TEMP_TIMEOUT, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code + ERC_AMEC_DIMM_TEMP_TIMEOUT, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size @@ -706,14 +709,14 @@ void amec_health_check_cent_timeout() * @reasoncode FRU_TEMP_TIMEOUT * @userdata1 timeout value in seconds * @userdata2 0 - * @userdata4 OCC_NO_EXTENDED_RC + * @userdata4 ERC_AMEC_CENT_TEMP_TIMEOUT * @devdesc Failed to read a centaur memory controller * temperature * */ l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TIMEOUT, //modId FRU_TEMP_TIMEOUT, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code + ERC_AMEC_CENT_TEMP_TIMEOUT, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size @@ -999,6 +1002,192 @@ void 
amec_health_check_proc_timeout() }while(0); } +// Function Specification +// +// Name: amec_health_check_vrm_vdd_temp +// +// Description: This function checks if the VRM Vdd temperature has +// exceeded the error temperature sent in data format 0x13. +// +// End Function Specification +void amec_health_check_vrm_vdd_temp() +{ + /*------------------------------------------------------------------------*/ + /* Local Variables */ + /*------------------------------------------------------------------------*/ + uint16_t l_ot_error; + static uint32_t L_error_count = 0; + static BOOLEAN L_ot_error_logged = FALSE; + sensor_t *l_sensor; + errlHndl_t l_err = NULL; + + /*------------------------------------------------------------------------*/ + /* Code */ + /*------------------------------------------------------------------------*/ + do + { + // Get TEMPVDD sensor + l_sensor = getSensorByGsid(TEMPVDD); + l_ot_error = g_amec->thermalvdd.ot_error; + + // Check to see if we exceeded our error temperature + if (l_sensor->sample > l_ot_error) + { + // Increment the error counter for this FRU + L_error_count++; + + // Trace and log error the first time this occurs + if (L_error_count == AMEC_HEALTH_ERROR_TIMER) + { + // Have we logged an OT error for this FRU already? + if (L_ot_error_logged == TRUE) + { + break; + } + + L_ot_error_logged = TRUE; + + TRAC_ERR("amec_health_check_vrm_vdd_temp: VRM vdd has exceeded OT error! temp[%u] ot_error[%u]", + l_sensor->sample, + l_ot_error); + + // Log an OT error + /* @ + * @errortype + * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TEMP + * @reasoncode VRM_VDD_ERROR_TEMP + * @userdata1 0 + * @userdata2 Fru peak temperature sensor + * @devdesc VRM Vdd has reached error temperature + * threshold and is called out in this error log. 
+ * + */ + l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TEMP, + VRM_VDD_ERROR_TEMP, + ERC_AMEC_PROC_ERROR_OVER_TEMPERATURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + 0, + l_sensor->sample_max); + + // Callout the Ambient procedure + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_OVER_TEMPERATURE, + ERRL_CALLOUT_PRIORITY_HIGH); + + // Callout VRM Vdd + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.vrm_vdd_huid, + ERRL_CALLOUT_PRIORITY_MED); + + // Commit Error + commitErrl(&l_err); + } + } + else + { + // Trace that we have now dropped below the error threshold + if (L_error_count >= AMEC_HEALTH_ERROR_TIMER) + { + TRAC_INFO("amec_health_check_vrm_vdd_temp: VRM Vdd temp [%u] now below error temp [%u] after error_count [%u]", + l_sensor->sample, l_ot_error, L_error_count); + } + + // Reset the error counter for this FRU + L_error_count = 0; + } + }while (0); + +} + +// Function Specification +// +// Name: amec_health_check_vrm_vdd_temp_timeout +// +// Description: This function checks if OCC has failed to read the VRM Vdd +// temperature and if it has exceeded the maximum allowed number of retries. +// +// End Function Specification +void amec_health_check_vrm_vdd_temp_timeout() +{ + /*------------------------------------------------------------------------*/ + /* Local Variables */ + /*------------------------------------------------------------------------*/ + errlHndl_t l_err = NULL; + uint32_t l_update_tag = 0; + static uint32_t L_read_fail_cnt = 0; + static BOOLEAN L_error_logged = FALSE; + static uint32_t L_vdd_temp_update_tag = 0; + + /*------------------------------------------------------------------------*/ + /* Code */ + /*------------------------------------------------------------------------*/ + + // Check if VRM Vdd temperature sensor has been updated by checking the sensor update tag + // If the update tag is not changing, then temperature sensor is not being updated. 
+ l_update_tag = AMECSENSOR_PTR(TEMPVDD)->update_tag; + if (l_update_tag != L_vdd_temp_update_tag) + { + // We were able to read VRM Vdd temperature + L_read_fail_cnt = 0; + G_vrm_vdd_temp_expired = false; + L_vdd_temp_update_tag = l_update_tag; + } + else + { + // Failed to read VRM Vdd temperature sensor + L_read_fail_cnt++; + + // Check if we have reached the maximum read time allowed + if((L_read_fail_cnt == g_amec->thermalvdd.temp_timeout) && + (g_amec->thermalvdd.temp_timeout != 0xFF)) + { + //temperature has expired. Notify control algorithms + G_vrm_vdd_temp_expired = true; + + // Log error one time + if (L_error_logged == FALSE) + { + L_error_logged = TRUE; + + TRAC_ERR("Timed out reading VRM Vdd temperature for timeout[%u]", + g_amec->thermalvdd.temp_timeout); + + /* @ + * @errortype + * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT + * @reasoncode FRU_TEMP_TIMEOUT + * @userdata1 timeout value in seconds + * @userdata2 0 + * @userdata4 ERC_AMEC_VRM_VDD_TEMP_TIMEOUT + * @devdesc Failed to read VRM Vdd temperature. 
+ * + */ + l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT, //modId + FRU_TEMP_TIMEOUT, //reasoncode + ERC_AMEC_VRM_VDD_TEMP_TIMEOUT, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + g_amec->thermalvdd.temp_timeout, //userdata1 + 0); //userdata2 + + // Callout the VRM + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.vrm_vdd_huid, + ERRL_CALLOUT_PRIORITY_MED); + + // Commit error log and request reset + REQUEST_RESET(l_err); + } + } // if reached timeout + } // else failed to read temp +} + /*----------------------------------------------------------------------------*/ /* End */ /*----------------------------------------------------------------------------*/ diff --git a/src/occ_405/amec/amec_health.h b/src/occ_405/amec/amec_health.h index 11d8fb0..7992f26 100755 --- a/src/occ_405/amec/amec_health.h +++ b/src/occ_405/amec/amec_health.h @@ -51,5 +51,7 @@ void amec_mem_mark_logged(uint8_t i_cent, uint8_t i_dimm, uint8_t* i_clog_bitmap, uint8_t* i_dlog_bitmap); +void amec_health_check_vrm_vdd_temp(void); +void amec_health_check_vrm_vdd_temp_timeout(void); #endif diff --git a/src/occ_405/amec/amec_service_codes.h b/src/occ_405/amec/amec_service_codes.h index f206daf..8c87e5f 100755 --- a/src/occ_405/amec/amec_service_codes.h +++ b/src/occ_405/amec/amec_service_codes.h @@ -48,25 +48,27 @@ /*----------------------------------------------------------------------------*/ enum occAmecModuleId { - AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00, - AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01, - AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02, - AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03, - AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04, - AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05, - AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06, - AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07, - AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08, - AMEC_HEALTH_CHECK_DIMM_TEMP = 
AMEC_COMP_ID | 0x09, - AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10, - AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11, - AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12, - AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13, - AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14, - AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, - AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17, - AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18, - AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19, + AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00, + AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01, + AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02, + AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03, + AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04, + AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05, + AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06, + AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07, + AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08, + AMEC_HEALTH_CHECK_DIMM_TEMP = AMEC_COMP_ID | 0x09, + AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10, + AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11, + AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12, + AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13, + AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14, + AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, + AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17, + AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18, + AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19, + AMEC_HEALTH_CHECK_VRM_VDD_TEMP = AMEC_COMP_ID | 0x1A, + AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT = AMEC_COMP_ID | 0x1B, }; /*----------------------------------------------------------------------------*/ diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index 3f1d333..e86a000 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -362,6 +362,7 @@ typedef struct sensor_t vrhot_mem_proc; sensor_t vrfan; + sensor_t tempvdd; // Chip Sensors sensor_t todclock0; @@ -687,6 +688,8 @@ typedef struct 
amec_controller_t thermaldimm; // Thermal Controller based on VRHOT signal from processor VRM amec_controller_t vrhotproc; + // Thermal Controller based on VRM Vdd temperatures + amec_controller_t thermalvdd; // Oversubscription Status oversub_status_t oversub_status; -- cgit v1.2.1