summaryrefslogtreecommitdiffstats
path: root/src/usr/mbox
diff options
context:
space:
mode:
authorChristian Geddes <crgeddes@us.ibm.com>2018-06-20 11:06:19 -0500
committerWilliam G. Hoffa <wghoffa@us.ibm.com>2018-07-05 09:50:43 -0400
commit50e72792adbdea613e4a2aeea25b60ba1043a2b8 (patch)
tree678dc780c974563cb60035eb4bc187b1df333aeb /src/usr/mbox
parent1759af757bd8f9a13386c4fb4624bd93394af67b (diff)
downloadtalos-hostboot-50e72792adbdea613e4a2aeea25b60ba1043a2b8.tar.gz
talos-hostboot-50e72792adbdea613e4a2aeea25b60ba1043a2b8.zip
Print out MBOX/INTR state info on DMA request hang
We have been stuck on a hang that occurs during memdiags on our multi-node p9 systems. It appears that Hostboot is never receiving the response to the request to reclaim DMA buffers from the FSP. From debugging we know the FSP thinks it has sent the message over the FSI mbox, but Hostboot isn't seeing it. With this change in the code, the next time this happens we should be able to get a better idea of what is happening. Change-Id: I6b702e4094da3576ba454b5cdf0660841961baff Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/60977 Reviewed-by: Richard Ward <rward15@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Roland Veloz <rveloz@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Diffstat (limited to 'src/usr/mbox')
-rw-r--r--src/usr/mbox/mailboxsp.C96
-rw-r--r--src/usr/mbox/mailboxsp.H18
-rw-r--r--src/usr/mbox/mboxdd.C84
-rw-r--r--src/usr/mbox/mboxdd.H16
4 files changed, 207 insertions, 7 deletions
diff --git a/src/usr/mbox/mailboxsp.C b/src/usr/mbox/mailboxsp.C
index 7bd1ccde5..926dcdc2d 100644
--- a/src/usr/mbox/mailboxsp.C
+++ b/src/usr/mbox/mailboxsp.C
@@ -44,6 +44,7 @@
#include <arch/ppc.H>
#include <errl/errlmanager.H>
#include <sys/misc.h>
+#include <util/misc.H>
#include <errl/errludprintk.H>
#include <errno.h>
#include <kernel/console.H>
@@ -82,6 +83,7 @@ MailboxSp::MailboxSp()
iv_sendq(),
iv_respondq(),
iv_dmaBuffer(),
+ iv_dmaRequestWatchdog(0),
iv_trgt(NULL),
iv_shutdown_msg(NULL),
iv_rts(true),
@@ -832,6 +834,17 @@ void MailboxSp::send_msg(mbox_msg_t * i_msg)
&iv_msg_to_send,
mbox_msg_len,
DeviceFW::MAILBOX);
+
+ // Create a watchdog task that will run for 60 seconds
+ // if there is no response in 60 seconds then dbg info will
+ // be printed in the slow trace buffer
+ if(iv_msg_to_send.msg_payload.type == MSG_REQUEST_DMA_BUFFERS
+ && !Util::isSimicsRunning()
+ && !iv_dmaRequestWatchdog)
+ {
+ iv_dmaRequestWatchdog = task_create(&watchdogTimeoutTask, this);
+ assert (iv_dmaRequestWatchdog > 0 );
+ }
}
if(err)
@@ -1448,6 +1461,89 @@ void MailboxSp::sendReclaimDmaBfrsMsg( mbox_msg_t & i_mbox_msg )
return;
}
+// Entry point of the DMA-request watchdog task.
+//
+// Detaches itself, runs watchdogTimeoutTaskWorker in a child task and waits
+// for that child so a crash of the worker is traced rather than lost.
+//
+// i_mailboxSp is forwarded unchanged to the worker, which treats it as a
+// MailboxSp*. Always returns nullptr.
+void * MailboxSp::watchdogTimeoutTask(void * i_mailboxSp)
+{
+    // We don't want this to be a zombie because parent keeps going
+    task_detach();
+
+    // create a task which we can wait on, this way we can print
+    // an error message if the taskWorker crashes
+    tid_t l_tid = task_create( &watchdogTimeoutTaskWorker, i_mailboxSp);
+    assert (l_tid > 0 );
+
+    int l_status = 0;
+    void* l_rc = nullptr;
+
+    tid_t l_tidRc = task_wait_tid(l_tid, &l_status, &l_rc);
+
+    if(l_status == TASK_STATUS_CRASHED)
+    {
+        // Cast so the argument width matches the %lx specifier
+        TRACFCOMP(g_trac_mbox,
+                  ERR_MRK
+                  "MailboxSp::watchdogTimeoutTask - "
+                  "Watchdog timeout crashed!! %lx",
+                  static_cast<uint64_t>(l_tidRc));
+
+        // The worker clears iv_dmaRequestWatchdog as its last step; a
+        // crashed worker never gets there. send_msg only starts a watchdog
+        // while that field is zero, so clear it here or no further
+        // watchdog could ever be created.
+        if(i_mailboxSp != nullptr)
+        {
+            static_cast<MailboxSp *>(i_mailboxSp)->iv_dmaRequestWatchdog = 0;
+        }
+    }
+
+    return nullptr;
+}
+
+// Watchdog body: polls the mailbox's iv_dma_pend flag once per millisecond.
+// If the flag is still set when the time budget is used up, the MBOX
+// registers and the interrupt state are dumped to trace; any error log
+// produced while dumping is committed. In every case the stored watchdog
+// TID is cleared before returning so a new watchdog can be started.
+//
+// i_mailboxSp must be a non-null MailboxSp*. Always returns nullptr.
+void * MailboxSp::watchdogTimeoutTaskWorker(void * i_mailboxSp)
+{
+    assert(i_mailboxSp != nullptr, "nullptr was passed to watchdogTimeoutTaskWorker");
+
+    MailboxSp * const l_mbox = static_cast<MailboxSp *>(i_mailboxSp);
+
+    // NOTE(review): this budget is 200 seconds, while the comment where the
+    // watchdog is created in send_msg says 60 seconds - confirm which one
+    // is intended.
+    const uint64_t l_budgetNs = 200000000000; // nanoseconds
+    const uint64_t l_pollNs   = 1000000;      // nanoseconds (1 ms)
+
+    bool l_fspResponded = false;
+
+    // Only time spent sleeping is counted against the budget
+    for(uint64_t l_waitedNs = 0; l_waitedNs < l_budgetNs; l_waitedNs += l_pollNs)
+    {
+        if( !l_mbox->iv_dma_pend )
+        {
+            TRACFCOMP(g_trac_mbox,
+                      INFO_MRK
+                      "Breaking out of watchdog because FSP responded to DMA request");
+            l_fspResponded = true;
+            break;
+        }
+
+        nanosleep(0, l_pollNs);
+    }
+
+    if( !l_fspResponded )
+    {
+        TRACFCOMP(g_trac_mbox,
+                  INFO_MRK
+                  "Hang during DMA request detected, dumping state information");
+
+        errlHndl_t l_errl = dumpMboxRegs();
+        if(l_errl)
+        {
+            TRACFCOMP(g_trac_mbox,
+                      INFO_MRK
+                      "Error occured while dumping MBOX information");
+            l_errl->collectTrace(MBOX_COMP_NAME);
+            errlCommit(l_errl,MBOX_COMP_ID);
+        }
+
+        l_errl = INTR::printInterruptInfo();
+        if(l_errl)
+        {
+            TRACFCOMP(g_trac_mbox,
+                      INFO_MRK
+                      "Error occured while dumping INTR information");
+            l_errl->collectTrace(INTR_COMP_NAME);
+            errlCommit(l_errl,MBOX_COMP_ID);
+        }
+    }
+
+    //Zero out the TID so another watchdog task can be created if needed
+    l_mbox->iv_dmaRequestWatchdog = 0;
+    return nullptr;
+}
+
errlHndl_t MailboxSp::msgq_register(queue_id_t i_queue_id, msg_q_t i_msgQ)
{
diff --git a/src/usr/mbox/mailboxsp.H b/src/usr/mbox/mailboxsp.H
index 22fdf45e8..d1db0a88a 100644
--- a/src/usr/mbox/mailboxsp.H
+++ b/src/usr/mbox/mailboxsp.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2012,2017 */
+/* Contributors Listed Below - COPYRIGHT 2012,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -310,6 +310,21 @@ namespace MBOX
void sendReclaimDmaBfrsMsg( void );
/**
+ * Start the watchdogTimeoutTaskWorker and print
+ * out an error if it crashes
+ *
+ * @param[in] i_mailboxSp  Passed through unchanged to
+ *                         watchdogTimeoutTaskWorker, which casts it
+ *                         to a MailboxSp*
+ * @return nullptr in all cases
+ */
+ static void * watchdogTimeoutTask(void * i_mailboxSp);
+
+ /**
+ * Used to start a timer; if the timer expires then
+ * Hostboot will print out a bunch of MBOX and INTR
+ * error information to the SLOW buffer. This is used
+ * to collect debug information in the case where we
+ * are hanging, waiting for a response to a mailbox msg.
+ * The timer stops early once iv_dma_pend is no longer set.
+ *
+ * @param[in] i_mailboxSp  Pointer to the MailboxSp instance to
+ *                         watch; asserted to be non-null
+ * @return nullptr in all cases
+ */
+ static void * watchdogTimeoutTaskWorker(void * i_mailboxSp);
+
+ /**
* Determine if a Reclaim Bfr message is outstanding
* @return [true - Msg active | false - no msg active]
*/
@@ -367,6 +382,7 @@ namespace MBOX
registry_t iv_registry; //!< Registered queue
DmaBuffer iv_dmaBuffer; //!< DMA buffer manager
send_q_t iv_pendingq; //!< Pending for queue registration
+ tid_t iv_dmaRequestWatchdog; //!< TID of dma buffer request watchdog
TARGETING::Target * iv_trgt;//!< mailbox device driver target
msg_t * iv_shutdown_msg;//!< Message to shutdown mbox
diff --git a/src/usr/mbox/mboxdd.C b/src/usr/mbox/mboxdd.C
index c5c19df99..f0e74f328 100644
--- a/src/usr/mbox/mboxdd.C
+++ b/src/usr/mbox/mboxdd.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2012,2015 */
+/* Contributors Listed Below - COPYRIGHT 2012,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -29,11 +29,12 @@
#include <trace/interface.H>
#include <errl/errlentry.H>
#include <targeting/common/targetservice.H>
+#include <targeting/common/utilFilter.H>
#include <intr/interrupt.H>
trace_desc_t* g_trac_mbox = NULL;
-TRAC_INIT(&g_trac_mbox, "MBOX", KILOBYTE, TRACE::BUFFER_SLOW); //4K
+TRAC_INIT(&g_trac_mbox, "MBOX", 16*KILOBYTE, TRACE::BUFFER_SLOW); //16K
namespace MBOX
@@ -697,6 +698,85 @@ errlHndl_t mboxddShutDown(TARGETING::Target* i_target)
return err;
}
+// Trace the mailbox register state of every processor chip returned by
+// TARGETING::getAllChips: the PIB interrupt register, doorbell
+// status/control register, doorbell error/status register and the
+// MBOX_DATA_LBUS_START..MBOX_DATA_LBUS_END data registers.
+//
+// Asserts if no processor chip is found. Stops at the first failed register
+// read and returns that error log to the caller; returns nullptr when every
+// read succeeded.
+errlHndl_t dumpMboxRegs()
+{
+    errlHndl_t l_err = nullptr;
+    TARGETING::TargetHandleList l_procList;
+    TARGETING::getAllChips( l_procList, TARGETING::TYPE_PROC);
+    assert(l_procList.size(), "No functional processors found");
+
+    TRACFCOMP(g_trac_mbox, "---Dumping Mbox registers---");
+
+    for( const auto l_procChip : l_procList)
+    {
+        // 8-byte read buffer viewed as two 32-bit words; only word 0 is
+        // traced below, so every value is printed with a 32-bit specifier
+        uint32_t l_64bitBuf[2] = {0};
+        size_t l_64bitSize = sizeof(uint64_t);
+        uint32_t l_huid = TARGETING::get_huid(l_procChip);
+        TRACFCOMP(g_trac_mbox, "Processor 0x%08X",l_huid);
+
+        // Read the MBOX_DB_INT_REG_PIB
+        l_err = deviceOp(DeviceFW::READ,l_procChip,
+                         l_64bitBuf,l_64bitSize,
+                         DEVICE_XSCOM_ADDRESS(MBOX_DB_INT_REG_PIB));
+        if (l_err)
+        {
+            TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read PIB Interrupt Register");
+            break;
+        }
+        TRACFCOMP(g_trac_mbox, " PIB Interrupt Register (0x%08X) = 0x%08X",
+                  MBOX_DB_INT_REG_PIB, l_64bitBuf[0]);
+
+        // Read the MBOX_DB_STAT_CNTRL_1
+        l_err = deviceOp(DeviceFW::READ,l_procChip,
+                         l_64bitBuf,l_64bitSize,
+                         DEVICE_XSCOM_ADDRESS(MBOX_DB_STAT_CNTRL_1));
+        if (l_err)
+        {
+            TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read Doorbell Status/Control Register");
+            break;
+        }
+        TRACFCOMP(g_trac_mbox, " Doorbell Status/Control Register (0x%08X) = 0x%08X",
+                  MBOX_DB_STAT_CNTRL_1, l_64bitBuf[0]);
+
+        // Read the MBOX_DB_ERR_STAT_LBUS
+        l_err = deviceOp(DeviceFW::READ,l_procChip,
+                         l_64bitBuf,l_64bitSize,
+                         DEVICE_XSCOM_ADDRESS( MBOX_DB_ERR_STAT_LBUS));
+        if (l_err)
+        {
+            TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read Doorbell Error/Status Register");
+            break;
+        }
+        TRACFCOMP(g_trac_mbox, " Doorbell Error/Status Register (0x%08X) = 0x%08X",
+                  MBOX_DB_ERR_STAT_LBUS, l_64bitBuf[0]);
+
+        for(uint8_t i = 0x0; i <= (MBOX_DATA_LBUS_END - MBOX_DATA_LBUS_START) ; i++)
+        {
+            // Read the MBOX_DATA_LBUS_START + i
+            l_err = deviceOp(DeviceFW::READ,l_procChip,
+                             l_64bitBuf,l_64bitSize,
+                             DEVICE_XSCOM_ADDRESS(MBOX_DATA_LBUS_START + i));
+            if (l_err)
+            {
+                TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read MBOX_DATA_LBUS_START + %d Register", i);
+                break;
+            }
+            TRACFCOMP(g_trac_mbox, " MBOX_DATA_LBUS_START + %02d (0x%08X) = 0x%08X",
+                      i, MBOX_DATA_LBUS_START + i , l_64bitBuf[0]);
+        }
+
+        // The break above only leaves the data-register loop. Stop the
+        // processor loop as well, otherwise the next deviceOp would
+        // overwrite (and leak) the pending error log.
+        if (l_err)
+        {
+            break;
+        }
+    }
+    return l_err;
+}
+
#if defined(__DESTRUCTIVE_MBOX_TEST__)
void forceErrorOnNextOperation()
{
diff --git a/src/usr/mbox/mboxdd.H b/src/usr/mbox/mboxdd.H
index 86fb1ee80..f79459615 100644
--- a/src/usr/mbox/mboxdd.H
+++ b/src/usr/mbox/mboxdd.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2012,2014 */
+/* Contributors Listed Below - COPYRIGHT 2012,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -37,7 +37,7 @@ namespace MBOX
/*
* Mbox device driver public constants
*/
- enum
+ enum
{
MBOX_MAX_DATA_BYTES = 64, //16 32-bit Data Registers
};
@@ -45,7 +45,7 @@ namespace MBOX
/*
* Mbox device driver status values
*/
- enum MboxReadStatus
+ enum MboxReadStatus
{
MBOX_DOORBELL_ERROR = 0x00000004, /* Error Set In Error Register */
MBOX_HW_ACK = 0x00000002, /* LBUS Data Acknowledgment */
@@ -58,7 +58,7 @@ namespace MBOX
/**
* @brief Initialize device driver hardware
- *
+ *
* @param[in] i_target, Chip target of the MBOX operation
* @return errlHndl_t If scom error | NULL (success)
*/
@@ -112,6 +112,14 @@ namespace MBOX
void* i_buffer,
size_t& i_buflen);
+ /**
+ * @brief Print all the mailbox state information to slow trace
+ * buffer to aid in debug. For each processor chip this traces the
+ * PIB interrupt, doorbell status/control and doorbell error/status
+ * registers and the mailbox data registers.
+ *
+ * @return errlHndl_t nullptr on success, otherwise the error log
+ * from a failed register read
+ */
+ errlHndl_t dumpMboxRegs();
+
/**
* @brief Reads the mailbox PIB error status register
*
OpenPOWER on IntegriCloud