summaryrefslogtreecommitdiffstats
path: root/src/usr
diff options
context:
space:
mode:
authorDan Crowell <dcrowell@us.ibm.com>2017-07-19 13:49:58 -0500
committerWilliam G. Hoffa <wghoffa@us.ibm.com>2017-07-31 12:24:33 -0400
commitad20498a1a7b857517759cbb173fe9d936107d63 (patch)
treecba05569f1cc0c6f69ef6d46cb630a08aed5627c /src/usr
parentfadc1f7542d63ef55f383cf922db86d4f5e48ffe (diff)
downloadtalos-hostboot-ad20498a1a7b857517759cbb173fe9d936107d63.tar.gz
talos-hostboot-ad20498a1a7b857517759cbb173fe9d936107d63.zip
Fix race condition between INTR and SBEIO
Fixed a race condition in clearing out the PSU interrupt register that existed between the INTR and SBEIO code. We can sometimes lose interrupts for SBE PSU operations which leads to a timeout. Also added code to look for SBE errors if a PSU operation times out Change-Id: I8cdcdcc08956b038bcc65ad7e00a34719bf14c61 CQ: SW396057 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/43339 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: Elizabeth K. Liner <eliner@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: Martin Gloff <mgloff@us.ibm.com> Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Diffstat (limited to 'src/usr')
-rw-r--r--src/usr/intr/intrrp.C83
-rw-r--r--src/usr/intr/intrrp.H1
-rw-r--r--src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C2
-rw-r--r--src/usr/isteps/istep08/makefile1
-rw-r--r--src/usr/sbeio/makefile5
-rw-r--r--src/usr/sbeio/sbe_psudd.C126
6 files changed, 125 insertions, 93 deletions
diff --git a/src/usr/intr/intrrp.C b/src/usr/intr/intrrp.C
index 8448ae158..87e2b1374 100644
--- a/src/usr/intr/intrrp.C
+++ b/src/usr/intr/intrrp.C
@@ -366,6 +366,7 @@ errlHndl_t IntrRp::_init()
uint64_t l_en_threads = get_enabled_threads();
TRACFCOMP(g_trac_intr, "IntrRp::_init() Threads enabled:"
" %lx", l_en_threads);
+
} while(0);
return l_err;
@@ -1312,7 +1313,6 @@ void IntrRp::routeInterrupt(intr_hdlr_t* i_proc,
}
else if (i_type == LSI_PSU)
{
- TRACFCOMP(g_trac_intr, "PSU Interrupt Detected");
handlePsuInterrupt(i_type, i_proc, i_pir);
}
else // no queue registered for this interrupt type
@@ -1623,84 +1623,44 @@ errlHndl_t IntrRp::handlePsuInterrupt(ext_intr_t i_type,
// Long term will leverage mask register to avoid
// polling loop below
errlHndl_t l_err = NULL;
- uint32_t l_addr = PSI_BRIDGE_PSU_DOORBELL_REG;
- size_t scom_len = sizeof(uint64_t);
- uint64_t reg = 0x0;
- uint64_t l_elapsed_time_ns = 0;
TARGETING::Target* procTarget = i_proc->proc;
- do
- {
+ do {
+ size_t scom_len = 8;
+ uint64_t l_reg = 0x0;
l_err = deviceRead(procTarget,
- &reg,
+ &l_reg,
scom_len,
- DEVICE_SCOM_ADDRESS(l_addr));
-
+ DEVICE_SCOM_ADDRESS(PSI_BRIDGE_PSU_DOORBELL_REG));
if (l_err)
{
- TRACFCOMP(g_trac_intr, "Error Reading PSU SCOM address: %lx",
- l_addr);
break;
}
+ TRACDCOMP( g_trac_intr, "%.8X = %.16llX",
+ PSI_BRIDGE_PSU_DOORBELL_REG, l_reg );
- //If the PSU Host Doorbell bit is on, wait for the
- // PSU DD to handle
- if (reg & PSI_BRIDGE_PSU_HOST_DOORBELL)
+ //If the interrupt is driven by the doorbell, yield
+ // to give the driver a chance to take care of it
+ if( l_reg & PSI_BRIDGE_PSU_HOST_DOORBELL )
{
- TRACDCOMP(g_trac_intr, "Host/SBE Mailbox "
- "response. Wait for Polling to handle"
- " response");
nanosleep(0,10000);
- l_elapsed_time_ns += 10000;
- }
- else
- {
- //Polling Complete
- break;
- }
- if (l_elapsed_time_ns > MAX_PSU_LONG_TIMEOUT_NS)
- {
- TRACFCOMP(g_trac_intr, "PSU Timeout hit");
- /*@ errorlog tag
- * @errortype ERRL_SEV_UNRECOVERABLE
- * @moduleid INTR::MOD_INTRRP_HNDLPSUINTERRUPT
- * @reasoncode INTR::RC_PSU_DOORBELL_TIMEOUT
- * @userdata1 Scom Address with interrupt condition
- * @userdata2 Register Value
- * @devdesc PSU Doorbell Timeout hit waiting for doorbell
- * interrupt condition to be cleared
- */
- l_err = new ERRORLOG::ErrlEntry
- (
- ERRORLOG::ERRL_SEV_UNRECOVERABLE, // severity
- INTR::MOD_INTRRP_HNDLPSUINTERRUPT, // moduleid
- INTR::RC_PSU_DOORBELL_TIMEOUT, // reason code
- l_addr,
- reg
- );
- break;
- }
-
- } while(1);
-
- do {
-
- if (l_err)
- {
- break;
+ task_yield();
}
//Clear the PSU Scom Reg Interrupt Status register
- uint64_t l_barValue = 0;
- uint64_t size = sizeof(l_barValue);
+ // but ignore the bit that the PSU driver uses
+ // to avoid a race condition
+ uint64_t l_andVal = PSI_BRIDGE_PSU_HOST_DOORBELL;
+ uint64_t size = sizeof(l_andVal);
l_err = deviceWrite(procTarget,
- &l_barValue,
- size,
- DEVICE_SCOM_ADDRESS(l_addr));
+ &l_andVal,
+ size,
+ DEVICE_SCOM_ADDRESS(PSI_BRIDGE_PSU_DOORBELL_ANDREG));
if (l_err)
{
- TRACFCOMP(g_trac_intr, "Error clearing scom - %x", l_addr);
+ TRACFCOMP(g_trac_intr, "Error clearing scom - %x",
+ PSI_BRIDGE_PSU_DOORBELL_ANDREG);
break;
}
@@ -3714,3 +3674,4 @@ errlHndl_t INTR::IntrRp::enableSlaveProcInterrupts(TARGETING::Target * i_target)
TRACFCOMP(g_trac_intr, INFO_MRK"Slave Proc Interrupt Routing setup complete\n");
return l_err;
}
+
diff --git a/src/usr/intr/intrrp.H b/src/usr/intr/intrrp.H
index db6669989..ecebf7e67 100644
--- a/src/usr/intr/intrrp.H
+++ b/src/usr/intr/intrrp.H
@@ -185,6 +185,7 @@ namespace INTR
PSI_BRIDGE_ESB_OFF_OFFSET = 0xD00,
PSI_BRIDGE_ESB_RESET_OFFSET = 0XC00,
PSI_BRIDGE_PSU_DOORBELL_REG = 0x000D0063,
+ PSI_BRIDGE_PSU_DOORBELL_ANDREG = 0x000D0064,
PSI_BRIDGE_PSU_HOST_DOORBELL = 0x8000000000000000,
//XIVE Interrupt Controller Constants
diff --git a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C
index 2b88e8809..6174ddc79 100644
--- a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C
+++ b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C
@@ -166,7 +166,7 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs )
P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction =
P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM;
FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc,
- l_cpu_target, l_rcAction);
+ l_fapi2ProcTarget, l_rcAction);
if(l_errl)
{
diff --git a/src/usr/isteps/istep08/makefile b/src/usr/isteps/istep08/makefile
index a200aed91..07d7993a7 100644
--- a/src/usr/isteps/istep08/makefile
+++ b/src/usr/isteps/istep08/makefile
@@ -73,7 +73,6 @@ include ${PROCEDURES_PATH}/hwp/perv/p9_start_cbs.mk
# proc_check_slave_sbe_seeprom_complete : Check Slave SBE Complete
include ${PROCEDURES_PATH}/hwp/perv/p9_check_slave_sbe_seeprom_complete.mk
-include ${PROCEDURES_PATH}/hwp/perv/p9_extract_sbe_rc.mk
include ${PROCEDURES_PATH}/hwp/sbe/p9_get_sbe_msg_register.mk
include ${PROCEDURES_PATH}/hwp/perv/p9_getecid.mk
diff --git a/src/usr/sbeio/makefile b/src/usr/sbeio/makefile
index fb74a3ea3..d1369e485 100644
--- a/src/usr/sbeio/makefile
+++ b/src/usr/sbeio/makefile
@@ -31,6 +31,7 @@ EXTRAINCDIR += ${ROOTPATH}/src/include/usr/fapi2
EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils
EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/utils/imageProcs
EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/ffdc
+EXTRAINCDIR += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv
OBJS += sbe_psudd.o
OBJS += sbe_coreStateControl.o
@@ -45,6 +46,10 @@ OBJS += sbe_ffdc_parser.o
OBJS += sbe_setFFDCAddr.o
OBJS += sbe_memRegionMgr.o
+VPATH += ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/
+include ${ROOTPATH}/procedure.rules.mk
+include ${ROOTPATH}/src/import/chips/p9/procedures/hwp/perv/p9_extract_sbe_rc.mk
+
SUBDIRS += test.d
SUBDIRS += runtime.d
diff --git a/src/usr/sbeio/sbe_psudd.C b/src/usr/sbeio/sbe_psudd.C
index a9d56be63..c27fdf15c 100644
--- a/src/usr/sbeio/sbe_psudd.C
+++ b/src/usr/sbeio/sbe_psudd.C
@@ -43,6 +43,10 @@
#include <arch/ppc.H>
#include <kernel/pagemgr.H>
#include <sbeio/sbeioif.H>
+#include <fapi2/target.H>
+#include <fapi2/plat_hwp_invoker.H>
+#include <p9_extract_sbe_rc.H>
+#include <errl/errludlogregister.H>
trace_desc_t* g_trac_sbeio;
TRAC_INIT(&g_trac_sbeio, SBEIO_COMP_NAME, 6*KILOBYTE, TRACE::BUFFER_SLOW);
@@ -342,6 +346,7 @@ errlHndl_t SbePsu::readResponse(TARGETING::Target * i_target,
errl = writeScom(i_target,PSU_HOST_DOORBELL_REG_AND,&l_data);
if (errl) break;
+
//If the command is not supported, then print a statement and break out
if(o_pPsuResponse->primaryStatus == SBE_PRI_INVALID_COMMAND &&
o_pPsuResponse->secondaryStatus == SBE_SEC_COMMAND_NOT_SUPPORTED)
@@ -458,7 +463,7 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target,
const uint64_t i_timeout,
psuCommand* i_pPsuRequest)
{
- errlHndl_t errl = NULL;
+ errlHndl_t l_errl = NULL;
SBE_TRACD(ENTER_MRK "pollForPsuComplete");
@@ -469,8 +474,8 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target,
do
{
// read response doorbell to see if ready
- errl = readScom(i_target,PSU_HOST_DOORBELL_REG_RW,&l_data,l_trace);
- if (errl) break; // return with error
+ l_errl = readScom(i_target,PSU_HOST_DOORBELL_REG_RW,&l_data,l_trace);
+ if (l_errl) break; // return with error
// check if response is now ready to be read
if (l_data & HOST_RESPONSE_WAITING)
@@ -481,27 +486,87 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target,
// time out if wait too long
if (l_elapsed_time_ns > i_timeout )
{
+ //read the response registers for FFDC
+ uint64_t l_respRegs[4];
+ ERRORLOG::ErrlUserDetailsLogRegister l_respRegsFFDC(i_target);
+ uint64_t l_addr = PSU_HOST_SBE_MBOX4_REG;
+ for (uint8_t i=0;i<4;i++)
+ {
+ l_errl = readScom(i_target,l_addr,&l_respRegs[i]);
+ if (l_errl)
+ {
+ l_respRegs[i] = 0;
+ delete l_errl;
+ }
+
+ l_respRegsFFDC.addData(DEVICE_XSCOM_ADDRESS(l_addr));
+
+ l_addr++;
+ }
+ psuResponse* l_resp = reinterpret_cast<psuResponse*>(l_respRegs);
+
+
SBE_TRACF(ERR_MRK "pollForPsuComplete: "
"timeout waiting for PSU request to complete"
- ": doorbell=%.8X", l_data);
+ ": doorbell=%.8X, mbox4=%.16llX",
+ l_data, l_respRegs[0]);
+
+ // Look for a hardware failure first
+ const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP>
+ l_fapiTarg(i_target);
+ P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction =
+ P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
+ FAPI_INVOKE_HWP( l_errl, p9_extract_sbe_rc,
+ l_fapiTarg, l_rcAction );
+ if( l_rcAction != P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION )
+ {
+ // saw an error on the sbe itself, use the error we
+ // got back from the HWP
+ }
+ else
+ {
+ // got an error in the attempt to find a hw fail, just
+ // commit it as info
+ if( l_errl )
+ {
+ l_errl->setSev( ERRORLOG::ERRL_SEV_INFORMATIONAL );
+ ERRORLOG::errlCommit( l_errl, SBEIO_COMP_ID );
+ l_errl = nullptr;
+ }
- /*@
- * @errortype
- * @moduleid SBEIO_PSU
- * @reasoncode SBEIO_PSU_RESPONSE_TIMEOUT
- * @userdata1[00:31] Timeout in NS
- * @userdata1[32:63] Processor Target
- * @userdata2 Failing Request
- * @devdesc Timeout waiting for PSU command to complete
- * @custdesc Firmware error communicating with boot device
- */
- errl = new ErrlEntry(ERRL_SEV_UNRECOVERABLE,
- SBEIO_PSU,
- SBEIO_PSU_RESPONSE_TIMEOUT,
- TWO_UINT32_TO_UINT64(i_timeout,
- TARGETING::get_huid(i_target)),
- i_pPsuRequest->mbxReg0);
+ // we don't know what caused the timeout, make a generic log
+
+ /*@
+ * @errortype
+ * @moduleid SBEIO_PSU
+ * @reasoncode SBEIO_PSU_RESPONSE_TIMEOUT
+ * @userdata1[00:15] Primary Status in mbox4
+ * @userdata1[16:31] Sequence Id in mbox4
+ * @userdata1[32:63] Processor Target
+ * @userdata2 Failing Request
+ * @devdesc Timeout waiting for PSU command to complete
+ * @custdesc Firmware error communicating with boot device
+ */
+ l_errl = new ErrlEntry(ERRL_SEV_UNRECOVERABLE,
+ SBEIO_PSU,
+ SBEIO_PSU_RESPONSE_TIMEOUT,
+ TWO_UINT32_TO_UINT64(
+ TWO_UINT16_TO_UINT32(
+ l_resp->primaryStatus,
+ l_resp->secondaryStatus),
+ TARGETING::get_huid(i_target)),
+ i_pPsuRequest->mbxReg0);
+ // Code should be okay so callout hardware
+ l_errl->addHwCallout( i_target,
+ HWAS::SRCI_PRIORITY_HIGH,
+ HWAS::NO_DECONFIG,
+ HWAS::GARD_NULL );
+ }
+ // log the failing proc as FFDC
+ ErrlUserDetailsTarget(i_target).addToLog(l_errl);
+
+ // check for any FFDC logged by the SBE itself
void * l_ffdcPkg = findFFDCBufferByTarget(i_target);
if(l_ffdcPkg != NULL)
{
@@ -511,19 +576,20 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target,
uint8_t i;
for(i = 0; i < l_pkgs; i++)
{
- errl->addFFDC( SBEIO_COMP_ID,
- l_ffdc_parser->getFFDCPackage(i),
- l_ffdc_parser->getPackageLength(i),
- 0,
- SBEIO_UDT_PARAMETERS,
- false );
+ l_errl->addFFDC( SBEIO_COMP_ID,
+ l_ffdc_parser->getFFDCPackage(i),
+ l_ffdc_parser->getPackageLength(i),
+ 0,
+ SBEIO_UDT_PARAMETERS,
+ false );
}
delete l_ffdc_parser;
}
- errl->addProcedureCallout(HWAS::EPUB_PRC_HB_CODE,
- HWAS::SRCI_PRIORITY_HIGH);
- errl->collectTrace(SBEIO_COMP_NAME);
+ // save the mbox status regs as FFDC
+ l_respRegsFFDC.addToLog(l_errl);
+
+ l_errl->collectTrace(SBEIO_COMP_NAME);
MAGIC_INST_GET_SBE_TRACES(
i_target->getAttr<TARGETING::ATTR_POSITION>(),
SBEIO_PSU_RESPONSE_TIMEOUT);
@@ -548,7 +614,7 @@ errlHndl_t SbePsu::pollForPsuComplete(TARGETING::Target * i_target,
SBE_TRACD(EXIT_MRK "pollForPsuComplete");
- return errl;
+ return l_errl;
}
/**
OpenPOWER on IntegriCloud