diff options
author | Chris Cain <cjcain@us.ibm.com> | 2017-11-01 12:07:10 -0500 |
---|---|---|
committer | Christopher J. Cain <cjcain@us.ibm.com> | 2017-11-02 18:49:11 -0400 |
commit | d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102 (patch) | |
tree | f83c7b467354bd91ddbecfe9152b529cef5dfe96 /src/occ_405/pss | |
parent | 0b07301e60afae16acdf326e7f50367d7b41c7a1 (diff) | |
download | talos-occ-d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102.tar.gz talos-occ-d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102.zip |
AVSbus Re-sync Bus Error Recovery
Change-Id: I226c5a7e7b8b5db4a3dc60608494c8cf9cbbb212
RTC: 180106
CQ: SW396400
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/49113
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Diffstat (limited to 'src/occ_405/pss')
-rw-r--r-- | src/occ_405/pss/avsbus.c | 267 | ||||
-rwxr-xr-x | src/occ_405/pss/pss_service_codes.h | 3 |
2 files changed, 119 insertions, 151 deletions
diff --git a/src/occ_405/pss/avsbus.c b/src/occ_405/pss/avsbus.c index 75e2325..236ff68 100644 --- a/src/occ_405/pss/avsbus.c +++ b/src/occ_405/pss/avsbus.c @@ -70,9 +70,10 @@ extern bool G_vrm_vdd_temp_expired; void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor); // Number of read failures allowed before elog is created and reset requested. -// This should be no longer than 4ms (or it will impact WOF calculations) -// (readings are taken every 500us => 500us * 8 = 4ms) -const uint8_t MAX_READ_ATTEMPTS = 8; +// If readings take longer than 4ms, it will impact WOF calculations. +// Voltage/Current are read every 3 ticks (1.5ms). +// Because 3 read attemps would take 4.5ms (> 4ms), an error needs to be logged. +const uint8_t MAX_READ_ATTEMPTS = 3; const uint16_t AVSBUS_STATUS_READ_ERROR = 0xFFFF; extern data_cnfg_t * G_data_cnfg; @@ -81,26 +82,46 @@ uint32_t avs_crc_calculate(const uint32_t i_avs_cmd); // NOTE: OCC must use Bridge B, because Bridge A is reserved for PGPE +// Registers are based on bus number +const uint32_t OCB_O2SCMDxB[2] = { OCB_O2SCMD0B, OCB_O2SCMD1B }; +const uint32_t OCB_O2SWDxB[2] = { OCB_O2SWD0B, OCB_O2SWD1B }; +const uint32_t OCB_O2SSTxB[2] = { OCB_O2SST0B, OCB_O2SST1B }; +const uint32_t OCB_O2SRDxB[2] = { OCB_O2SRD0B, OCB_O2SRD1B }; -// Clear OT/OC bit in status reg for both busses -uint32_t deglitch_status_errors(const uint8_t i_bus, const uint32_t i_status_mask) +// Wait for operation to complete (clear ongoing) +uint32_t wait_for_complete(const uint8_t i_bus) { - // Determine register based on the bus number - uint32_t o2scmd_reg = OCB_O2SCMD0B; - uint32_t o2swd_reg = OCB_O2SWD0B; - uint32_t o2sst_reg = OCB_O2SST0B; - if (i_bus != 0) + uint32_t l_status = in32(OCB_O2SSTxB[i_bus]); + DEBUG_IN32(OCB_O2SSTxB[i_bus], l_status, "OCB_O2SSTxB"); + unsigned int loops = 0; + while ((l_status & AVSBUS_STATUS_ONGOING) && (loops < 500)) + { + // o2s_ongoing bit was still set (operation did not complete) + l_status = in32(OCB_O2SSTxB[i_bus]); + ++loops; + } + DEBUG_IN32(OCB_O2SSTxB[i_bus], l_status, "OCB_O2SSTxB"); + if (0 != (l_status & AVSBUS_STATUS_ERRORS)) { - o2scmd_reg = OCB_O2SCMD1B; - o2swd_reg = OCB_O2SWD1B; - o2sst_reg = OCB_O2SST1B; + TRAC_ERR("wait_for_complete(): error in status register: 0x%08X", l_status); } + else if (l_status & AVSBUS_STATUS_ONGOING) // o2s_ongoing + { + TRAC_ERR("wait_for_complete(): timeout waiting for ongoing bit to clear (%d loops) 0x%08X", loops, l_status); + } + + return l_status; +} + +// Clear OT/OC bit in status reg for both busses +uint32_t deglitch_status_errors(const uint8_t i_bus, const uint32_t i_status_mask) +{ // Write O2SCMD[a][n] // o2s_clear_sticky_bits = 1 uint32_t value = 0x40000000; - DEBUG_OUT32(o2scmd_reg, value, "OCB_O2SCMDxB"); - out32(o2scmd_reg, value); + DEBUG_OUT32(OCB_O2SCMDxB[i_bus], value, "OCB_O2SCMDxB"); + out32(OCB_O2SCMDxB[i_bus], value); // To deglitch the status bits, set the status bits you wish to clear with a 1. (in CmdData) // After the write completes, if the status is still active, then the @@ -121,28 +142,11 @@ uint32_t deglitch_status_errors(const uint8_t i_bus, const uint32_t i_status_mas value = 0x47780000 | (cmd_data << 3); // Calculate/add CRC value |= avs_crc_calculate(value); - DEBUG_OUT32(o2swd_reg, value, "OCB_O2SWDxB"); - out32(o2swd_reg, value); + DEBUG_OUT32(OCB_O2SWDxB[i_bus], value, "OCB_O2SWDxB"); + out32(OCB_O2SWDxB[i_bus], value); // Wait for operation to complete (clear ongoing) - uint32_t l_status = in32(o2sst_reg); - DEBUG_IN32(o2sst_reg, l_status, "OCB_O2SSTxB"); - unsigned int loops = 0; - while ((l_status & AVSBUS_STATUS_ONGOING) && (loops < 500)) - { - // o2s_ongoing bit was still set (operation did not complete) - l_status = in32(o2sst_reg); - ++loops; - } - DEBUG_IN32(o2sst_reg, l_status, "OCB_O2SSTxB"); - if (0 != (l_status & AVSBUS_STATUS_ERRORS)) - { - TRAC_ERR("deglitch_status_errors(): error in status register: 0x%08X", l_status); - } - else if (l_status & AVSBUS_STATUS_ONGOING) // o2s_ongoing - { - TRAC_ERR("deglitch_status_errors(): timeout waiting for ongoing bit to clear (%d loops) 0x%08X", loops, l_status); - } + const uint32_t l_status = wait_for_complete(i_bus); // return the last read status return l_status; @@ -150,6 +154,36 @@ uint32_t deglitch_status_errors(const uint8_t i_bus, const uint32_t i_status_mas } // end deglitch_status_errors() +// Re-sync AVS bus to try to recover from errors +// Reference: chips/p9/procedures/hwp/lib/p9_avsbus_lib.C +// chips/p9/procedures/ppe_closed/pgpe/pstate_gpe/avs_driver.c +uint32_t avsbus_resync(const uint8_t i_bus) +{ + // clear sticky bits in o2s_status_reg + // Write O2SCMD[a][n] + // o2s_clear_sticky_bits = 1 + uint32_t value = 0x40000000; + DEBUG_OUT32(OCB_O2SCMDxB[i_bus], value, "OCB_O2SCMDxB"); + out32(OCB_O2SCMDxB[i_bus], value); + + // Drive AVS transaction with a frame value 0xFFFFFFFF (idle frame) + // to initialize the AVS slave. + // In principle this only has to be done once. Though docs suggest + // that due to noise on the chip this init should be done periodically. + TRAC_INFO("avsbus_resync: Send idle frame (bus %d)", i_bus); + value = 0xFFFFFFFF; + DEBUG_OUT32(OCB_O2SWDxB[i_bus], value, "OCB_O2SWDxB"); + out32(OCB_O2SWDxB[i_bus], value); + + // Wait for operation to complete (clear ongoing) + const uint32_t l_status = wait_for_complete(i_bus); + + // return the last read status + return l_status; + +} // end avsbus_resync() + + // AVS Bus setup that must be done once (common between read/write operations) void avsbus_init() { @@ -256,10 +290,20 @@ void avsbus_init() out32(OCB_O2SCTRL21B, value); } - // Clear OC/OT masks in the status reg + // Re-sync AVS bus and clear OC/OT masks in status regs and + // De-glitch: clear OC/OT masks in the status reg const uint32_t error_mask = AVSBUS_STATUS_OVER_CURRENT_MASK | AVSBUS_STATUS_OVER_TEMPERATURE_MASK; - deglitch_status_errors(0, error_mask); - deglitch_status_errors(1, error_mask); + if (bus0_monitoring) + { + avsbus_resync(0); + deglitch_status_errors(0, error_mask); + } + if (bus1_monitoring) + { + avsbus_resync(1); + deglitch_status_errors(1, error_mask); + } + } // end avsbus_init() @@ -353,20 +397,11 @@ void avsbus_read_start(const avsbus_type_e i_type, } #endif - // Determine register based on the bus number - uint32_t o2scmd_reg = OCB_O2SCMD0B; - uint32_t o2swd_reg = OCB_O2SWD0B; - if (1 == l_data.bus) - { - o2scmd_reg = OCB_O2SCMD1B; - o2swd_reg = OCB_O2SWD1B; - } - // Write O2SCMD[a][n] // o2s_clear_sticky_bits = 1 uint32_t value = 0x40000000; - DEBUG_OUT32(o2scmd_reg, value, "OCB_O2SCMDxB"); - out32(o2scmd_reg, value); + DEBUG_OUT32(OCB_O2SCMDxB[l_data.bus], value, "OCB_O2SCMDxB"); + out32(OCB_O2SCMDxB[l_data.bus], value); // Write O2SWD[a][n] - write commands and initiate hardware operation // o2s_wdata with content @@ -382,8 +417,8 @@ void avsbus_read_start(const avsbus_type_e i_type, value = 0x7007FFF8 | ((uint32_t) i_cmdtype << 23) | ((uint32_t)l_data.rail << 19); // Calculate/add CRC value |= avs_crc_calculate(value); - DEBUG_OUT32(o2swd_reg, value, "OCB_O2SWDxB"); - out32(o2swd_reg, value); + DEBUG_OUT32(OCB_O2SWDxB[l_data.bus], value, "OCB_O2SWDxB"); + out32(OCB_O2SWDxB[l_data.bus], value); // Read has been started so now just wait for HW to complete @@ -449,15 +484,6 @@ uint16_t avsbus_read(const avsbus_type_e i_type, } #endif - // Determine register based on the bus number - uint32_t o2sst_reg = OCB_O2SST0B; - uint32_t o2srd_reg = OCB_O2SRD0B; - if (1 == l_data.bus) - { - o2sst_reg = OCB_O2SST1B; - o2srd_reg = OCB_O2SRD1B; - } - // HW: Wait for bus op to complete // HW: arbitration between two bridges // HW: o2s_ongoing: 0 -> 1 @@ -467,8 +493,8 @@ uint16_t avsbus_read(const avsbus_type_e i_type, // Since read was started in previous tick, it should have already completed // (no need to poll/wait on o2s_ongoing) enum occReasonCode rc = OCC_SUCCESS_REASON_CODE; - uint32_t l_status = in32(o2sst_reg); - DEBUG_IN32(o2sst_reg, l_status, "OCB_O2SSTxB"); + uint32_t l_status = in32(OCB_O2SSTxB[l_data.bus]); + DEBUG_IN32(OCB_O2SSTxB[l_data.bus], l_status, "OCB_O2SSTxB"); // OCC O2S Status Register // 0 o2s_ongoing // 1:4 reserved @@ -486,7 +512,7 @@ uint16_t avsbus_read(const avsbus_type_e i_type, if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read: Error found in Vd%c %c O2SST[0x%08X] = [0x%08X]", - l_trace_type, l_trace_cmd, o2sst_reg, l_status); + l_trace_type, l_trace_cmd, OCB_O2SSTxB[l_data.bus], l_status); /* * @errortype * @moduleid PSS_MID_AVSBUS_READ @@ -506,7 +532,7 @@ uint16_t avsbus_read(const avsbus_type_e i_type, if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read: Vd%c %c timeout waiting for o2s_ongoing change O2SST[0x%08X] = [0x%08X]", - l_trace_type, l_trace_cmd, o2sst_reg, l_status); + l_trace_type, l_trace_cmd, OCB_O2SSTxB[l_data.bus], l_status); /* * @errortype * @moduleid PSS_MID_AVSBUS_READ @@ -522,8 +548,8 @@ uint16_t avsbus_read(const avsbus_type_e i_type, if (FALSE == l_failure) { // Read the response data - uint32_t value = in32(o2srd_reg); - DEBUG_IN32(o2srd_reg, value, "OCB_O2SRDxB"); + uint32_t value = in32(OCB_O2SRDxB[l_data.bus]); + DEBUG_IN32(OCB_O2SRDxB[l_data.bus], value, "OCB_O2SRDxB"); // AVS Bus response (read voltage, current, or temperature): // 0:1 SlaveAck (0b00 from slave indicates good CRC and action was taken) // 2 0 @@ -542,7 +568,7 @@ uint16_t avsbus_read(const avsbus_type_e i_type, if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read: CRC mismatch in Vd%c %c rsp O2SRD[0x%08X] = [0x%08X] (calculated CRC 0x%08X)", - l_trace_type, l_trace_cmd, o2srd_reg, value, crc); + l_trace_type, l_trace_cmd, OCB_O2SRDxB[l_data.bus], value, crc); /* * @errortype * @moduleid PSS_MID_AVSBUS_READ @@ -666,6 +692,11 @@ uint16_t avsbus_read(const avsbus_type_e i_type, ERRL_CALLOUT_PRIORITY_MED); REQUEST_RESET(l_err); } + else + { + // Force a re-sync after any failure + avsbus_resync(l_data.bus); + } } #ifdef AVSDEBUG @@ -725,20 +756,6 @@ void initiate_avsbus_read_status() } if (bus != 0xFF) { - // Determine register based on the bus number - uint32_t o2scmd_reg = OCB_O2SCMD0B; - uint32_t o2swd_reg = OCB_O2SWD0B; - if (0 == bus) - { - o2scmd_reg = OCB_O2SCMD0B; - o2swd_reg = OCB_O2SWD0B; - } - else - { - o2scmd_reg = OCB_O2SCMD1B; - o2swd_reg = OCB_O2SWD1B; - } - #ifdef AVSDEBUG if (L_trace_count < DEBUG_TRACE_MAX) { @@ -749,8 +766,8 @@ void initiate_avsbus_read_status() // Write O2SCMD[a][n] // o2s_clear_sticky_bits = 1 uint32_t value = 0x40000000; - DEBUG_OUT32(o2scmd_reg, value, "OCB_O2SCMDxB"); - out32(o2scmd_reg, value); + DEBUG_OUT32(OCB_O2SCMDxB[bus], value, "OCB_O2SCMDxB"); + out32(OCB_O2SCMDxB[bus], value); // Write O2SWD[a][n] - write commands and initiate hardware operation // o2s_wdata with content @@ -767,8 +784,8 @@ void initiate_avsbus_read_status() value = 0x777FFFF8; // Calculate/add CRC value |= avs_crc_calculate(value); - DEBUG_OUT32(o2swd_reg, value, "OCB_O2SWDxB"); - out32(o2swd_reg, value); + DEBUG_OUT32(OCB_O2SWDxB[bus], value, "OCB_O2SWDxB"); + out32(OCB_O2SWDxB[bus], value); } } @@ -801,7 +818,6 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) uint16_t o_reading = 0; bool l_failure = FALSE; - const uint8_t max_read_attempts = G_data_cnfg->thrm_thresh.data[DATA_FRU_VRM_OT_STATUS].max_read_timeout; // Static error counters for each type (Vdd/Vdn) static uint32_t L_error_count[AVSBUS_TYPE_MAX] = {0}; @@ -824,15 +840,6 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) } #endif - // Determine register based on the bus number - uint32_t o2sst_reg = OCB_O2SST0B; - uint32_t o2srd_reg = OCB_O2SRD0B; - if (1 == l_data.bus) - { - o2sst_reg = OCB_O2SST1B; - o2srd_reg = OCB_O2SRD1B; - } - // HW: Wait for bus op to complete // HW: arbitration between two bridges // HW: o2s_ongoing: 0 -> 1 @@ -842,8 +849,8 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) // Since read was started in previous tick, it should have already completed // (no need to poll/wait on o2s_ongoing) enum occReasonCode rc = OCC_SUCCESS_REASON_CODE; - uint32_t l_status = in32(o2sst_reg); - DEBUG_IN32(o2sst_reg, l_status, "OCB_O2SSTxB"); + uint32_t l_status = in32(OCB_O2SSTxB[l_data.bus]); + DEBUG_IN32(OCB_O2SSTxB[l_data.bus], l_status, "OCB_O2SSTxB"); // OCC O2S Status Register // 0 o2s_ongoing // 1:4 reserved @@ -858,18 +865,10 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) // error bit was set l_failure = TRUE; (*l_error_count)++; - if ((*l_error_count == 1) || (*l_error_count == max_read_attempts)) + if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read_status: Error found in Vd%c O2SST[0x%08X] = [0x%08X]", - l_trace_type, o2sst_reg, l_status); - /* - * @errortype - * @moduleid PSS_MID_AVSBUS_READ_STATUS - * @reasoncode AVSBUS_ERROR - * @userdata1 AVS Bus type/bus/rail - * @userdata2 status - * @devdesc Error encountered when reading AVS Bus - */ + l_trace_type, OCB_O2SSTxB[l_data.bus], l_status); rc = AVSBUS_ERROR; } } @@ -878,18 +877,10 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) // o2s_ongoing bit was still set (operation did not complete) l_failure = TRUE; (*l_error_count)++; - if ((*l_error_count == 1) || (*l_error_count == max_read_attempts)) + if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read_status: Vd%c timeout waiting for o2s_ongoing change O2SST[0x%08X] = [0x%08X]", - l_trace_type, o2sst_reg, l_status); - /* - * @errortype - * @moduleid PSS_MID_AVSBUS_READ_STATUS - * @reasoncode AVSBUS_TIMEOUT - * @userdata1 AVS Bus type/bus/rail - * @userdata2 status - * @devdesc Timeout when reading AVS Bus - */ + l_trace_type, OCB_O2SSTxB[l_data.bus], l_status); rc = AVSBUS_TIMEOUT; } } @@ -897,8 +888,8 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) if (FALSE == l_failure) { // Read the response data - uint32_t value = in32(o2srd_reg); - DEBUG_IN32(o2srd_reg, value, "OCB_O2SRDxB"); + uint32_t value = in32(OCB_O2SRDxB[l_data.bus]); + DEBUG_IN32(OCB_O2SRDxB[l_data.bus], value, "OCB_O2SRDxB"); // AVS Bus response (read status): // 0:1 SlaveAck (0b00 from slave indicates good CRC and action was taken) // 2 0 @@ -914,18 +905,10 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) { l_failure = TRUE; (*l_error_count)++; - if ((*l_error_count == 1) || (*l_error_count == max_read_attempts)) + if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read_status: CRC mismatch in Vd%c rsp O2SRD[0x%08X] = [0x%08X] (calculated CRC 0x%08X)", - l_trace_type, o2srd_reg, value, crc); - /* - * @errortype - * @moduleid PSS_MID_AVSBUS_READ_STATUS - * @reasoncode AVSBUS_CRC_ERROR - * @userdata1 AVS Bus type/bus/rail - * @userdata2 status - * @devdesc CRC error reading AVS Bus - */ + l_trace_type, OCB_O2SRDxB[l_data.bus], value, crc); rc = AVSBUS_CRC_ERROR; } } @@ -964,7 +947,7 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) { l_failure = TRUE; (*l_error_count)++; - if ((*l_error_count == 1) || (*l_error_count == max_read_attempts)) + if ((*l_error_count == 1) || (*l_error_count == MAX_READ_ATTEMPTS)) { TRAC_ERR("avsbus_read_status: SlaveAck reported no action taken[0x%08X]", value); rc = AVSBUS_ERROR; @@ -974,29 +957,15 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) if (l_failure) { - if (*l_error_count == max_read_attempts) + if (*l_error_count == MAX_READ_ATTEMPTS) { TRAC_ERR("avsbus_read_status: Reached %d consecutive Vd%c errors reading status", *l_error_count, l_trace_type); - G_avsbus_vdd_monitoring = FALSE; - G_avsbus_vdn_monitoring = FALSE; - G_vrm_thermal_monitoring = FALSE; - errlHndl_t l_err = createErrl(PSS_MID_AVSBUS_READ_STATUS, - rc, - ERC_AVSBUS_STATUS_FAILURE, - ERRL_SEV_PREDICTIVE, - NULL, - DEFAULT_TRACE_SIZE, - (i_type << 16) | (l_data.bus << 8) | l_data.rail, - l_status); - setErrlActions(l_err, ERRL_ACTIONS_MANUFACTURING_ERROR); - // add processor callout - addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, - G_sysConfigData.proc_huid, - ERRL_CALLOUT_PRIORITY_MED); - commitErrl(&l_err); + // Reading AVS bus status is not critical, so don't stop monitoring or commit error } + + // Force a re-sync after any failure + avsbus_resync(l_data.bus); o_reading = AVSBUS_STATUS_READ_ERROR; } diff --git a/src/occ_405/pss/pss_service_codes.h b/src/occ_405/pss/pss_service_codes.h index 3293e7c..5a7fe85 100755 --- a/src/occ_405/pss/pss_service_codes.h +++ b/src/occ_405/pss/pss_service_codes.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2015 */ +/* Contributors Listed Below - COPYRIGHT 2011,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -39,7 +39,6 @@ enum pssModuleId PSS_MID_DPSS_RD_STATUS = PSS_COMP_ID | 0x06, PSS_MID_DO_APSS_RECOVERY = PSS_COMP_ID | 0x07, PSS_MID_AVSBUS_READ = PSS_COMP_ID | 0x08, - PSS_MID_AVSBUS_READ_STATUS = PSS_COMP_ID | 0x09, }; #endif /* #ifndef _PSS_SERVICE_CODES_H_ */ |