diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2017-07-19 13:49:58 -0500 |
---|---|---|
committer | William G. Hoffa <wghoffa@us.ibm.com> | 2017-07-31 12:24:33 -0400 |
commit | ad20498a1a7b857517759cbb173fe9d936107d63 (patch) | |
tree | cba05569f1cc0c6f69ef6d46cb630a08aed5627c /src/usr | |
parent | fadc1f7542d63ef55f383cf922db86d4f5e48ffe (diff) | |
download | talos-hostboot-ad20498a1a7b857517759cbb173fe9d936107d63.tar.gz talos-hostboot-ad20498a1a7b857517759cbb173fe9d936107d63.zip |
Fix race condition between INTR and SBEIO
Fixed a race condition in clearing out the PSU interrupt
register that existed between the INTR and SBEIO code.
We can sometimes lose interrupts for SBE PSU operations
which leads to a timeout.
Also added code to look for SBE errors if a PSU
operation times out
Change-Id: I8cdcdcc08956b038bcc65ad7e00a34719bf14c61
CQ: SW396057
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/43339
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Elizabeth K. Liner <eliner@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martin Gloff <mgloff@us.ibm.com>
Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Diffstat (limited to 'src/usr')
-rw-r--r-- | src/usr/intr/intrrp.C | 83 | ||||
-rw-r--r-- | src/usr/intr/intrrp.H | 1 | ||||
-rw-r--r-- | src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C | 2 | ||||
-rw-r--r-- | src/usr/isteps/istep08/makefile | 1 | ||||
-rw-r--r-- | src/usr/sbeio/makefile | 5 | ||||
-rw-r--r-- | src/usr/sbeio/sbe_psudd.C | 126 |
6 files changed, 125 insertions, 93 deletions
diff --git a/src/usr/intr/intrrp.C b/src/usr/intr/intrrp.C index 8448ae158..87e2b1374 100644 --- a/src/usr/intr/intrrp.C +++ b/src/usr/intr/intrrp.C @@ -366,6 +366,7 @@ errlHndl_t IntrRp::_init() uint64_t l_en_threads = get_enabled_threads(); TRACFCOMP(g_trac_intr, "IntrRp::_init() Threads enabled:" " %lx", l_en_threads); + } while(0); return l_err; @@ -1312,7 +1313,6 @@ void IntrRp::routeInterrupt(intr_hdlr_t* i_proc, } else if (i_type == LSI_PSU) { - TRACFCOMP(g_trac_intr, "PSU Interrupt Detected"); handlePsuInterrupt(i_type, i_proc, i_pir); } else // no queue registered for this interrupt type @@ -1623,84 +1623,44 @@ errlHndl_t IntrRp::handlePsuInterrupt(ext_intr_t i_type, // Long term will leverage mask register to avoid // polling loop below errlHndl_t l_err = NULL; - uint32_t l_addr = PSI_BRIDGE_PSU_DOORBELL_REG; - size_t scom_len = sizeof(uint64_t); - uint64_t reg = 0x0; - uint64_t l_elapsed_time_ns = 0; TARGETING::Target* procTarget = i_proc->proc; - do - { + do { + size_t scom_len = 8; + uint64_t l_reg = 0x0; l_err = deviceRead(procTarget, - ®, + &l_reg, scom_len, - DEVICE_SCOM_ADDRESS(l_addr)); - + DEVICE_SCOM_ADDRESS(PSI_BRIDGE_PSU_DOORBELL_REG)); if (l_err) { - TRACFCOMP(g_trac_intr, "Error Reading PSU SCOM address: %lx", - l_addr); break; } + TRACDCOMP( g_trac_intr, "%.8X = %.16llX", + PSI_BRIDGE_PSU_DOORBELL_REG, l_reg ); - //If the PSU Host Doorbell bit is on, wait for the - // PSU DD to handle - if (reg & PSI_BRIDGE_PSU_HOST_DOORBELL) + //If the interrupt is driven by the doorbell, yield + // to give the driver a chance to take care of it + if( l_reg & PSI_BRIDGE_PSU_HOST_DOORBELL ) { - TRACDCOMP(g_trac_intr, "Host/SBE Mailbox " - "response. Wait for Polling to handle" - " response"); nanosleep(0,10000); - l_elapsed_time_ns += 10000; - } - else - { - //Polling Complete - break; - } - if (l_elapsed_time_ns > MAX_PSU_LONG_TIMEOUT_NS) - { - TRACFCOMP(g_trac_intr, "PSU Timeout hit"); - /*@ errorlog tag - * @errortype ERRL_SEV_UNRECOVERABLE - * @moduleid INTR::MOD_INTRRP_HNDLPSUINTERRUPT - * @reasoncode INTR::RC_PSU_DOORBELL_TIMEOUT - * @userdata1 Scom Address with interrupt condition - * @userdata2 Register Value - * @devdesc PSU Doorbell Timeout hit waiting for doorbell - * interrupt condition to be cleared - */ - l_err = new ERRORLOG::ErrlEntry - ( - ERRORLOG::ERRL_SEV_UNRECOVERABLE, // severity - INTR::MOD_INTRRP_HNDLPSUINTERRUPT, // moduleid - INTR::RC_PSU_DOORBELL_TIMEOUT, // reason code - l_addr, - reg - ); - break; - } - - } while(1); - - do { - - if (l_err) - { - break; + task_yield(); } //Clear the PSU Scom Reg Interrupt Status register - uint64_t l_barValue = 0; - uint64_t size = sizeof(l_barValue); + // but ignore the bit that the PSU driver uses + // to avoid a race condition + uint64_t l_andVal = PSI_BRIDGE_PSU_HOST_DOORBELL; + uint64_t size = sizeof(l_andVal); l_err = deviceWrite(procTarget, - &l_barValue, - size, - DEVICE_SCOM_ADDRESS(l_addr)); + &l_andVal, + size, + DEVICE_SCOM_ADDRESS(PSI_BRIDGE_PSU_DOORBELL_ANDREG)); if (l_err) { - TRACFCOMP(g_trac_intr, "Error clearing scom - %x", l_addr); + TRACFCOMP(g_trac_intr, "Error clearing scom - %x", + PSI_BRIDGE_PSU_DOORBELL_ANDREG); break; } @@ -3714,3 +3674,4 @@ errlHndl_t INTR::IntrRp::enableSlaveProcInterrupts(TARGETING::Target * i_target) TRACFCOMP(g_trac_intr, INFO_MRK"Slave Proc Interrupt Routing setup complete\n"); return l_err; } + diff --git a/src/usr/intr/intrrp.H b/src/usr/intr/intrrp.H index db6669989..ecebf7e67 100644 --- a/src/usr/intr/intrrp.H +++ b/src/usr/intr/intrrp.H @@ -185,6 +185,7 @@ namespace INTR PSI_BRIDGE_ESB_OFF_OFFSET = 0xD00, PSI_BRIDGE_ESB_RESET_OFFSET = 0XC00, PSI_BRIDGE_PSU_DOORBELL_REG = 0x000D0063, + PSI_BRIDGE_PSU_DOORBELL_ANDREG = 0x000D0064, PSI_BRIDGE_PSU_HOST_DOORBELL = 0x8000000000000000, //XIVE Interrupt Controller Constants diff --git a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C index 2b88e8809..6174ddc79 100644 --- a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C +++ b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C @@ -166,7 +166,7 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs ) P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction = P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, - l_cpu_target, l_rcAction); + l_fapi2ProcTarget, l_rcAction); if(l_errl) { diff --git a/src/usr/isteps/istep08/makefile b/src/usr/isteps/istep08/makefile index a200aed91..07d7993a7 100644 --- a/src/usr/isteps/istep08/makefile +++ b/src/usr/isteps/istep08/makefile @@ -73,7 +73,6 @@ include ${PROCEDURES_PATH}/hwp/perv/p9_start_cbs.mk # proc_check_slave_sbe_seeprom_complete : Check Slave SBE Complete include ${PROCEDURES_PATH}/hwp/perv/p9_check_slave_sbe_seeprom_complete.mk -include ${PROCEDURES_PATH}/hwp/perv/p9_extract_sbe_rc.mk include ${PROCEDURES_PATH}/hwp/sbe/p9_get_sbe_msg_register.mk include ${PROCEDURES_PATH}/hwp/perv/p9_getecid.mk diff --git a/src/usr/sbeio/makefile b/src/usr/sbeio/makefile index fb74a3ea3..d1369e485 100644 --- a/src/usr/sbeio/makefile +++ b/src/usr/sbeio/makefile @@ -31,6 +31,7 @@ EXTRAINCDIR += ${ROOTPATH}/src/include/usr/fapi2 EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils/imageProcs EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/ffdc +EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv OBJS += sbe_psudd.o OBJS += sbe_coreStateControl.o @@ -45,6 +46,10 @@ OBJS += sbe_ffdc_parser.o OBJS += sbe_setFFDCAddr.o OBJS += sbe_memRegionMgr.o +VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/ +include ${ROOTPATH}/procedure.rules.mk +include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/p9_extract_sbe_rc.mk + SUBDIRS += test.d SUBDIRS += runtime.d diff --git a/src/usr/sbeio/sbe_psudd.C b/src/usr/sbeio/sbe_psudd.C index a9d56be63..c27fdf15c 100644 --- a/src/usr/sbeio/sbe_psudd.C +++ b/src/usr/sbeio/sbe_psudd.C @@ -43,6 +43,10 @@ #include <arch/ppc.H> #include <kernel/pagemgr.H> #include <sbeio/sbeioif.H> +#include <fapi2/target.H> +#include <fapi2/plat_hwp_invoker.H> +#include <p9_extract_sbe_rc.H> +#include <errl/errludlogregister.H> trace_desc_t* g_trac_sbeio; TRAC_INIT(&g_trac_sbeio, SBEIO_COMP_NAME, 6*KILOBYTE, TRACE::BUFFER_SLOW); @@ -342,6 +346,7 @@ errlHndl_t SbePsu::readResponse(TARGETING::Target * i_target, errl = writeScom(i_target,PSU_HOST_DOORBELL_REG_AND,&l_data); if (errl) break; + //If the command is not supported, then print a statement and break out if(o_pPsuResponse->primaryStatus == SBE_PRI_INVALID_COMMAND && o_pPsuResponse->secondaryStatus == SBE_SEC_COMMAND_NOT_SUPPORTED) @@ -458,7 +463,7 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, const uint64_t i_timeout, psuCommand* i_pPsuRequest) { - errlHndl_t errl = NULL; + errlHndl_t l_errl = NULL; SBE_TRACD(ENTER_MRK "pollForPsuComplete"); @@ -469,8 +474,8 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, do { // read response doorbell to see if ready - errl = readScom(i_target,PSU_HOST_DOORBELL_REG_RW,&l_data,l_trace); - if (errl) break; // return with error + l_errl = readScom(i_target,PSU_HOST_DOORBELL_REG_RW,&l_data,l_trace); + if (l_errl) break; // return with error // check if response is now ready to be read if (l_data & HOST_RESPONSE_WAITING) @@ -481,27 +486,87 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, // time out if wait too long if (l_elapsed_time_ns > i_timeout ) { + //read the response registers for FFDC + uint64_t l_respRegs[4]; + ERRORLOG::ErrlUserDetailsLogRegister l_respRegsFFDC(i_target); + uint64_t l_addr = PSU_HOST_SBE_MBOX4_REG; + for (uint8_t i=0;i<4;i++) + { + l_errl = readScom(i_target,l_addr,&l_respRegs[i]); + if (l_errl) + { + l_respRegs[i] = 0; + delete l_errl; + } + + l_respRegsFFDC.addData(DEVICE_XSCOM_ADDRESS(l_addr)); + + l_addr++; + } + psuResponse* l_resp = reinterpret_cast<psuResponse*>(l_respRegs); + + SBE_TRACF(ERR_MRK "pollForPsuComplete: " "timeout waiting for PSU request to complete" - ": doorbell=%.8X", l_data); + ": doorbell=%.8X, mbox4=%.16llX", + l_data, l_respRegs[0]); + + // Look for a hardware failure first + const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> + l_fapiTarg(i_target); + P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction = + P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION; + FAPI_INVOKE_HWP( l_errl, p9_extract_sbe_rc, + l_fapiTarg, l_rcAction ); + if( l_rcAction != P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION ) + { + // saw an error on the sbe itself, use the error we + // got back from the HWP + } + else + { + // got an error in the attempt to find a hw fail, just + // commit it as info + if( l_errl ) + { + l_errl->setSev( ERRORLOG::ERRL_SEV_INFORMATIONAL ); + ERRORLOG::errlCommit( l_errl, SBEIO_COMP_ID ); + l_errl = nullptr; + } - /*@ - * @errortype - * @moduleid SBEIO_PSU - * @reasoncode SBEIO_PSU_RESPONSE_TIMEOUT - * @userdata1[00:31] Timeout in NS - * @userdata1[32:63] Processor Target - * @userdata2 Failing Request - * @devdesc Timeout waiting for PSU command to complete - * @custdesc Firmware error communicating with boot device - */ - errl = new ErrlEntry(ERRL_SEV_UNRECOVERABLE, - SBEIO_PSU, - SBEIO_PSU_RESPONSE_TIMEOUT, - TWO_UINT32_TO_UINT64(i_timeout, - TARGETING::get_huid(i_target)), - i_pPsuRequest->mbxReg0); + // we don't know what caused the timeout, make a generic log + + /*@ + * @errortype + * @moduleid SBEIO_PSU + * @reasoncode SBEIO_PSU_RESPONSE_TIMEOUT + * @userdata1[00:15] Primary Status in mbox4 + * @userdata1[16:31] Sequence Id in mbox4 + * @userdata1[32:63] Processor Target + * @userdata2 Failing Request + * @devdesc Timeout waiting for PSU command to complete + * @custdesc Firmware error communicating with boot device + */ + l_errl = new ErrlEntry(ERRL_SEV_UNRECOVERABLE, + SBEIO_PSU, + SBEIO_PSU_RESPONSE_TIMEOUT, + TWO_UINT32_TO_UINT64( + TWO_UINT16_TO_UINT32( + l_resp->primaryStatus, + l_resp->secondaryStatus), + TARGETING::get_huid(i_target)), + i_pPsuRequest->mbxReg0); + // Code should be okay so callout hardware + l_errl->addHwCallout( i_target, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::NO_DECONFIG, + HWAS::GARD_NULL ); + } + // log the failing proc as FFDC + ErrlUserDetailsTarget(i_target).addToLog(l_errl); + + // check for any FFDC logged by the SBE itself void * l_ffdcPkg = findFFDCBufferByTarget(i_target); if(l_ffdcPkg != NULL) { @@ -511,19 +576,20 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, uint8_t i; for(i = 0; i < l_pkgs; i++) { - errl->addFFDC( SBEIO_COMP_ID, - l_ffdc_parser->getFFDCPackage(i), - l_ffdc_parser->getPackageLength(i), - 0, - SBEIO_UDT_PARAMETERS, - false ); + l_errl->addFFDC( SBEIO_COMP_ID, + l_ffdc_parser->getFFDCPackage(i), + l_ffdc_parser->getPackageLength(i), + 0, + SBEIO_UDT_PARAMETERS, + false ); } delete l_ffdc_parser; } - errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE, - HWAS::SRCI_PRIORITY_HIGH); - errl->collectTrace(SBEIO_COMP_NAME); + // save the mbox status regs as FFDC + l_respRegsFFDC.addToLog(l_errl); + + l_errl->collectTrace(SBEIO_COMP_NAME); MAGIC_INST_GET_SBE_TRACES( i_target->getAttr<TARGETING::ATTR_POSITION>(), SBEIO_PSU_RESPONSE_TIMEOUT); @@ -548,7 +614,7 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target, SBE_TRACD(EXIT_MRK "pollForPsuComplete"); - return errl; + return l_errl; } /** |