summaryrefslogtreecommitdiffstats
path: root/src/occ_405/amec
diff options
context:
space:
mode:
authormbroyles <mbroyles@us.ibm.com>2017-10-06 11:19:10 -0500
committerMartha Broyles <mbroyles@us.ibm.com>2017-10-16 15:51:26 -0400
commit051cc0a10cb61b410252098d13fb7dd8727a8e52 (patch)
tree76003c4722b3a64cc0565475e3bc4e6065a59638 /src/occ_405/amec
parentd4fb4c372702ee71440e9f7affc40bba01366c5a (diff)
downloadtalos-occ-051cc0a10cb61b410252098d13fb7dd8727a8e52.tar.gz
talos-occ-051cc0a10cb61b410252098d13fb7dd8727a8e52.zip
VRM Vdd Interfaces
Change-Id: I8e2b597773c940ebc79972974a95fb323ea26660 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48065 Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: William A. Bryan <wilbryan@us.ibm.com> Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com> Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Diffstat (limited to 'src/occ_405/amec')
-rwxr-xr-xsrc/occ_405/amec/amec_data.c31
-rwxr-xr-xsrc/occ_405/amec/amec_health.c197
-rwxr-xr-xsrc/occ_405/amec/amec_health.h2
-rwxr-xr-xsrc/occ_405/amec/amec_service_codes.h40
-rwxr-xr-xsrc/occ_405/amec/amec_sys.h3
5 files changed, 249 insertions, 24 deletions
diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c
index 4c553d6..9294c1d 100755
--- a/src/occ_405/amec/amec_data.c
+++ b/src/occ_405/amec/amec_data.c
@@ -261,11 +261,40 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode)
TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for DIMM",
l_dvfs_temp);
- g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM].error_count;
+ g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM_OT_STATUS].error_count;
TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRHOT",
g_amec->vrhotproc.setpoint);
+ // Store the VRM Vdd thermal data
+ if(i_mode == OCC_MODE_NOMINAL)
+ {
+ l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].dvfs;
+ l_error = l_frudata[DATA_FRU_VRM_VDD].error;
+ }
+ else
+ {
+ l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].pm_dvfs;
+ if(i_mode == OCC_MODE_TURBO)
+ {
+ //Need to log an error if we dvfs in static turbo mode (for mfg)
+ l_error = l_dvfs_temp;
+ }
+ else
+ {
+ l_error = l_frudata[DATA_FRU_VRM_VDD].pm_error;
+ }
+ }
+ // Store the DVFS thermal setpoint in 0.1 degrees C
+ g_amec->thermalvdd.setpoint = l_dvfs_temp * 10;
+ // Store the error temperature for OT detection
+ g_amec->thermalvdd.ot_error = l_error;
+ // Store the temperature timeout value
+ g_amec->thermalvdd.temp_timeout = l_frudata[DATA_FRU_VRM_VDD].max_read_timeout;
+
+ TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRM Vdd",
+ l_dvfs_temp);
+
} while(0);
return l_err;
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index 1d026d2..12c348d 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -67,6 +67,9 @@ uint8_t G_cent_temp_expired_bitmap = 0;
// Array to store the update tag of each core's temperature sensor
uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0};
+// Set to true when reading the VRM Vdd temperature has timed out (sensor not updated for the allowed number of checks); cleared when the sensor is seen to update again
+bool G_vrm_vdd_temp_expired = false;
+
//*************************************************************************/
// Function Declarations
//*************************************************************************/
@@ -398,13 +401,13 @@ void amec_health_check_dimm_timeout()
* @reasoncode FRU_TEMP_TIMEOUT
* @userdata1 timeout value in seconds
* @userdata2 0
- * @userdata4 OCC_NO_EXTENDED_RC
+ * @userdata4 ERC_AMEC_DIMM_TEMP_TIMEOUT
* @devdesc Failed to read a memory DIMM temperature
*
*/
l_err = createErrl(AMEC_HEALTH_CHECK_DIMM_TIMEOUT, //modId
FRU_TEMP_TIMEOUT, //reasoncode
- OCC_NO_EXTENDED_RC, //Extended reason code
+ ERC_AMEC_DIMM_TEMP_TIMEOUT, //Extended reason code
ERRL_SEV_PREDICTIVE, //Severity
NULL, //Trace Buf
DEFAULT_TRACE_SIZE, //Trace Size
@@ -706,14 +709,14 @@ void amec_health_check_cent_timeout()
* @reasoncode FRU_TEMP_TIMEOUT
* @userdata1 timeout value in seconds
* @userdata2 0
- * @userdata4 OCC_NO_EXTENDED_RC
+ * @userdata4 ERC_AMEC_CENT_TEMP_TIMEOUT
* @devdesc Failed to read a centaur memory controller
* temperature
*
*/
l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TIMEOUT, //modId
FRU_TEMP_TIMEOUT, //reasoncode
- OCC_NO_EXTENDED_RC, //Extended reason code
+ ERC_AMEC_CENT_TEMP_TIMEOUT, //Extended reason code
ERRL_SEV_PREDICTIVE, //Severity
NULL, //Trace Buf
DEFAULT_TRACE_SIZE, //Trace Size
@@ -999,6 +1002,192 @@ void amec_health_check_proc_timeout()
}while(0);
}
+// Function Specification
+//
+// Name: amec_health_check_vrm_vdd_temp
+//
+// Description: This function checks if the VRM Vdd temperature (TEMPVDD sensor) has
+// exceeded the error temperature sent in data format 0x13 (g_amec->thermalvdd.ot_error).
+// After AMEC_HEALTH_ERROR_TIMER consecutive over-threshold calls it traces and commits one predictive error log; a sample at or below the threshold resets the count.
+// End Function Specification
+void amec_health_check_vrm_vdd_temp()
+{
+ /*------------------------------------------------------------------------*/
+ /* Local Variables */
+ /*------------------------------------------------------------------------*/
+ uint16_t l_ot_error; // error temperature threshold copied from g_amec->thermalvdd.ot_error
+ static uint32_t L_error_count = 0; // consecutive calls with the sample above the threshold
+ static BOOLEAN L_ot_error_logged = FALSE; // set once the OT error has been logged; never cleared in this function
+ sensor_t *l_sensor; // TEMPVDD sensor
+ errlHndl_t l_err = NULL;
+
+ /*------------------------------------------------------------------------*/
+ /* Code */
+ /*------------------------------------------------------------------------*/
+ do
+ {
+ // Get TEMPVDD sensor (dereferenced below without a NULL check; assumes the lookup cannot fail for TEMPVDD -- TODO confirm)
+ l_sensor = getSensorByGsid(TEMPVDD);
+ l_ot_error = g_amec->thermalvdd.ot_error;
+
+ // Check to see if we exceeded our error temperature (strictly greater than the threshold)
+ if (l_sensor->sample > l_ot_error)
+ {
+ // Increment the error counter for this FRU
+ L_error_count++;
+
+ // Trace and log the error when the counter first reaches AMEC_HEALTH_ERROR_TIMER (equality test, so this is hit once per run of over-threshold samples)
+ if (L_error_count == AMEC_HEALTH_ERROR_TIMER)
+ {
+ // Have we logged an OT error for this FRU already?
+ if (L_ot_error_logged == TRUE)
+ {
+ break; // already logged: leave the do/while(0) without tracing or logging again
+ }
+
+ L_ot_error_logged = TRUE;
+
+ TRAC_ERR("amec_health_check_vrm_vdd_temp: VRM vdd has exceeded OT error! temp[%u] ot_error[%u]",
+ l_sensor->sample,
+ l_ot_error);
+
+ // Log an OT error
+ /* @
+ * @errortype
+ * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TEMP
+ * @reasoncode VRM_VDD_ERROR_TEMP
+ * @userdata1 0
+ * @userdata2 Fru peak temperature sensor
+ * @userdata4 ERC_AMEC_PROC_ERROR_OVER_TEMPERATURE
+ * @devdesc VRM Vdd has reached error temperature
+ * threshold and is called out in this error log.
+ *
+ */
+ l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TEMP, //modId
+ VRM_VDD_ERROR_TEMP, //reasoncode
+ ERC_AMEC_PROC_ERROR_OVER_TEMPERATURE, //Extended reason code -- NOTE(review): processor-named code used for a VRM Vdd error; confirm this is intended
+ ERRL_SEV_PREDICTIVE, //Severity
+ NULL, //Trace Buf
+ DEFAULT_TRACE_SIZE, //Trace Size
+ 0, //userdata1
+ l_sensor->sample_max); //userdata2
+
+ // Callout the over-temperature component ID at high priority
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_COMPONENT_ID,
+ ERRL_COMPONENT_ID_OVER_TEMPERATURE,
+ ERRL_CALLOUT_PRIORITY_HIGH);
+
+ // Callout the VRM Vdd by HUID at medium priority
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.vrm_vdd_huid,
+ ERRL_CALLOUT_PRIORITY_MED);
+
+ // Commit Error
+ commitErrl(&l_err);
+ }
+ }
+ else
+ {
+ // Trace that we are no longer above the error threshold, but only if the counter had reached the logging point
+ if (L_error_count >= AMEC_HEALTH_ERROR_TIMER)
+ {
+ TRAC_INFO("amec_health_check_vrm_vdd_temp: VRM Vdd temp [%u] now below error temp [%u] after error_count [%u]",
+ l_sensor->sample, l_ot_error, L_error_count);
+ }
+
+ // Reset the error counter for this FRU
+ L_error_count = 0;
+ }
+ }while (0);
+
+}
+
+// Function Specification
+//
+// Name: amec_health_check_vrm_vdd_temp_timeout
+//
+// Description: This function checks if OCC has failed to read the VRM Vdd
+// temperature and if it has exceeded the maximum allowed number of retries.
+// A read counts as failed when the TEMPVDD sensor update_tag is unchanged since the previous call. On expiry G_vrm_vdd_temp_expired is set; the first expiry also logs a predictive error and requests a reset.
+// End Function Specification
+void amec_health_check_vrm_vdd_temp_timeout()
+{
+ /*------------------------------------------------------------------------*/
+ /* Local Variables */
+ /*------------------------------------------------------------------------*/
+ errlHndl_t l_err = NULL;
+ uint32_t l_update_tag = 0; // current update tag of the TEMPVDD sensor
+ static uint32_t L_read_fail_cnt = 0; // consecutive calls in which the update tag did not change
+ static BOOLEAN L_error_logged = FALSE; // set once the timeout error has been logged; never cleared in this function
+ static uint32_t L_vdd_temp_update_tag = 0; // update tag recorded at the last detected sensor update
+
+ /*------------------------------------------------------------------------*/
+ /* Code */
+ /*------------------------------------------------------------------------*/
+
+ // Check if VRM Vdd temperature sensor has been updated by checking the sensor update tag
+ // If the update tag is not changing, then temperature sensor is not being updated.
+ l_update_tag = AMECSENSOR_PTR(TEMPVDD)->update_tag;
+ if (l_update_tag != L_vdd_temp_update_tag)
+ {
+ // We were able to read VRM Vdd temperature: clear the failure count and the expired flag, and remember the new tag
+ L_read_fail_cnt = 0;
+ G_vrm_vdd_temp_expired = false;
+ L_vdd_temp_update_tag = l_update_tag;
+ }
+ else
+ {
+ // Failed to read VRM Vdd temperature sensor
+ L_read_fail_cnt++;
+
+ // Check if we have reached the maximum read time allowed (equality test; a timeout value of 0xFF never expires)
+ if((L_read_fail_cnt == g_amec->thermalvdd.temp_timeout) &&
+ (g_amec->thermalvdd.temp_timeout != 0xFF))
+ {
+ //temperature has expired. Notify control algorithms (the flag stays set until the update tag changes again)
+ G_vrm_vdd_temp_expired = true;
+
+ // Log error one time
+ if (L_error_logged == FALSE)
+ {
+ L_error_logged = TRUE;
+
+ TRAC_ERR("Timed out reading VRM Vdd temperature for timeout[%u]",
+ g_amec->thermalvdd.temp_timeout);
+
+ /* @
+ * @errortype
+ * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT
+ * @reasoncode FRU_TEMP_TIMEOUT
+ * @userdata1 timeout value in seconds
+ * @userdata2 0
+ * @userdata4 ERC_AMEC_VRM_VDD_TEMP_TIMEOUT
+ * @devdesc Failed to read VRM Vdd temperature.
+ *
+ */
+ l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT, //modId
+ FRU_TEMP_TIMEOUT, //reasoncode
+ ERC_AMEC_VRM_VDD_TEMP_TIMEOUT, //Extended reason code
+ ERRL_SEV_PREDICTIVE, //Severity
+ NULL, //Trace Buf
+ DEFAULT_TRACE_SIZE, //Trace Size
+ g_amec->thermalvdd.temp_timeout, //userdata1
+ 0); //userdata2
+
+ // Callout the VRM Vdd by HUID at medium priority
+ addCalloutToErrl(l_err,
+ ERRL_CALLOUT_TYPE_HUID,
+ G_sysConfigData.vrm_vdd_huid,
+ ERRL_CALLOUT_PRIORITY_MED);
+
+ // Commit error log and request reset
+ REQUEST_RESET(l_err);
+ }
+ } // if reached timeout
+ } // else failed to read temp
+}
+
/*----------------------------------------------------------------------------*/
/* End */
/*----------------------------------------------------------------------------*/
diff --git a/src/occ_405/amec/amec_health.h b/src/occ_405/amec/amec_health.h
index 11d8fb0..7992f26 100755
--- a/src/occ_405/amec/amec_health.h
+++ b/src/occ_405/amec/amec_health.h
@@ -51,5 +51,7 @@ void amec_mem_mark_logged(uint8_t i_cent,
uint8_t i_dimm,
uint8_t* i_clog_bitmap,
uint8_t* i_dlog_bitmap);
+void amec_health_check_vrm_vdd_temp(void); // Logs an error when the VRM Vdd temperature stays above its error threshold
+void amec_health_check_vrm_vdd_temp_timeout(void); // Flags and logs a timeout when the VRM Vdd temperature sensor stops updating
#endif
diff --git a/src/occ_405/amec/amec_service_codes.h b/src/occ_405/amec/amec_service_codes.h
index f206daf..8c87e5f 100755
--- a/src/occ_405/amec/amec_service_codes.h
+++ b/src/occ_405/amec/amec_service_codes.h
@@ -48,25 +48,27 @@
/*----------------------------------------------------------------------------*/
enum occAmecModuleId
{
- AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00,
- AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01,
- AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02,
- AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03,
- AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04,
- AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05,
- AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06,
- AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07,
- AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08,
- AMEC_HEALTH_CHECK_DIMM_TEMP = AMEC_COMP_ID | 0x09,
- AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10,
- AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11,
- AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12,
- AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13,
- AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14,
- AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16,
- AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17,
- AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18,
- AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19,
+ AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00,
+ AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01,
+ AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02,
+ AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03,
+ AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04,
+ AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05,
+ AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06,
+ AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07,
+ AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08,
+ AMEC_HEALTH_CHECK_DIMM_TEMP = AMEC_COMP_ID | 0x09,
+ AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10, // note: 0x0A-0x0F are not assigned in this enum
+ AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11,
+ AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12,
+ AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13,
+ AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14,
+ AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, // note: 0x15 is not assigned in this enum
+ AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17,
+ AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18,
+ AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19,
+ AMEC_HEALTH_CHECK_VRM_VDD_TEMP = AMEC_COMP_ID | 0x1A, // module ID for the error logged by amec_health_check_vrm_vdd_temp()
+ AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT = AMEC_COMP_ID | 0x1B, // module ID for the error logged by amec_health_check_vrm_vdd_temp_timeout()
};
/*----------------------------------------------------------------------------*/
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index 3f1d333..e86a000 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -362,6 +362,7 @@ typedef struct
sensor_t vrhot_mem_proc;
sensor_t vrfan;
+ sensor_t tempvdd;
// Chip Sensors
sensor_t todclock0;
@@ -687,6 +688,8 @@ typedef struct
amec_controller_t thermaldimm;
// Thermal Controller based on VRHOT signal from processor VRM
amec_controller_t vrhotproc;
+ // Thermal Controller based on VRM Vdd temperatures
+ amec_controller_t thermalvdd;
// Oversubscription Status
oversub_status_t oversub_status;
OpenPOWER on IntegriCloud