From 50e72792adbdea613e4a2aeea25b60ba1043a2b8 Mon Sep 17 00:00:00 2001 From: Christian Geddes Date: Wed, 20 Jun 2018 11:06:19 -0500 Subject: Print out MBOX/INTR state info on DMA request hang We have been stuck on a hang that occurs during memdiags on our multi-node p9 systems. It appears that Hostboot is never receiving the response to the request to reclaim DMA buffers from the FSP. From debugging we know the FSP thinks it has sent the message over the FSI mbox but hostboot isnt seeing it. Next time this happens if this is in the code we should be able to get a better idea of what is happening. Change-Id: I6b702e4094da3576ba454b5cdf0660841961baff Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/60977 Reviewed-by: Richard Ward Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Reviewed-by: Roland Veloz Tested-by: FSP CI Jenkins Reviewed-by: William G. Hoffa --- src/include/usr/intr/interrupt.H | 28 ++++++ src/include/usr/intr/intr_reasoncodes.H | 1 + src/usr/intr/intrrp.C | 160 ++++++++++++++++++++++++++++++++ src/usr/intr/intrrp.H | 28 +++++- src/usr/mbox/mailboxsp.C | 96 +++++++++++++++++++ src/usr/mbox/mailboxsp.H | 18 +++- src/usr/mbox/mboxdd.C | 84 ++++++++++++++++- src/usr/mbox/mboxdd.H | 16 +++- 8 files changed, 420 insertions(+), 11 deletions(-) diff --git a/src/include/usr/intr/interrupt.H b/src/include/usr/intr/interrupt.H index b03f08f82..5456eefbe 100644 --- a/src/include/usr/intr/interrupt.H +++ b/src/include/usr/intr/interrupt.H @@ -115,6 +115,7 @@ namespace INTR MSG_INTR_DRAIN_QUEUE, //!< Allow intrp to drain Q of EOI MSG_INTR_COALESCE, //!< Pending interrupt to be handled MSG_INTR_ENABLE_PSI_INTR, //!< Enable PSIHB Interrupts + MSG_INTR_DUMP, //!< Print out interrupt info to slow buffer }; @@ -144,6 +145,14 @@ namespace INTR uint64_t phbsecure; //PSI Host Bridge Secure Control reg - 0x90 }; + enum esbStates + { + ESB_STATE_RESET = 0, + ESB_STATE_OFF = 1, + ESB_STATE_PENDING = 2, + ESB_STATE_QUEUED = 3, + }; + /** * Register a message queue for an interrupt type @@ -191,6 +200,18 @@ namespace INTR */ void sendEOI(msg_q_t i_q, msg_t* i_msg); + /** + * Sends a message to the intrrp Q to tell it to print out current state + * of interrupts from hostboot perspective to the slow trace buffer + * + * This will call the private intrrp functions: + * printEsbStates + * printPSIHBInfo + * printLSIInfo + * @return errlHndl_t on error. + */ + errlHndl_t printInterruptInfo(); + /** * Un register a message queue from the interrupt handler * @param[in] i_type the type of interrupt (ISN value) @@ -241,6 +262,13 @@ namespace INTR * drain */ void drainQueue(void); + /** + * Convert the esb state to a human readable string + * @param[in] i_esbState ESB state as hex + * @return human readable string, "INVALID" if cannot translate + */ + void esbStateToString(uint64_t i_esbState, const char** o_esbStateString); + }; #endif diff --git a/src/include/usr/intr/intr_reasoncodes.H b/src/include/usr/intr/intr_reasoncodes.H index 4dc3cc496..511008cd9 100644 --- a/src/include/usr/intr/intr_reasoncodes.H +++ b/src/include/usr/intr/intr_reasoncodes.H @@ -53,6 +53,7 @@ namespace INTR MOD_INTRRP_RESETINTUNIT = 0x11, /**< intrrp.C : IntrRp::resetIntUnit */ MOD_INTRRP_XIVE_SENDEOI = 0x12, MOD_INTRRP_IPC = 0x13, + MOD_INTR_DUMP = 0x14, /**< intrrp.C : INTR::printInterruptInfo */ }; enum IntrReasonCode diff --git a/src/usr/intr/intrrp.C b/src/usr/intr/intrrp.C index 9e3f5930b..bb6aabff2 100644 --- a/src/usr/intr/intrrp.C +++ b/src/usr/intr/intrrp.C @@ -30,6 +30,7 @@ #include "intrrp.H" #include #include +#include #include #include #include @@ -1331,6 +1332,16 @@ void IntrRp::msgHandler() msg_respond(iv_msgQ,msg); } break; + case MSG_INTR_DUMP: + { + // Run the functions that dump out + // interrupt info to slow buffer + printEsbStates(); + printLSIInfo(); + printPSIHBInfo(); + msg_free(msg); // async message + } + break; default: msg->data[1] = -EINVAL; @@ -3392,3 +3403,152 @@ errlHndl_t INTR::IntrRp::enableSlaveProcInterrupts(TARGETING::Target * i_target) return l_err; } +void INTR::esbStateToString(uint64_t i_esbState, const char** o_esbStateString) +{ + switch(i_esbState) + { + case ESB_STATE_RESET: + *o_esbStateString = "RESET"; + break; + case ESB_STATE_OFF: + *o_esbStateString = "OFF"; + break; + case ESB_STATE_PENDING: + *o_esbStateString = "PENDING"; + break; + case ESB_STATE_QUEUED: + *o_esbStateString = "QUEUED"; + break; + default: + *o_esbStateString = "INVALID"; + break; + } +} + +errlHndl_t INTR::printInterruptInfo() +{ + errlHndl_t err = NULL; + msg_q_t intr_msgQ = msg_q_resolve(VFS_ROOT_MSG_INTR); + if(intr_msgQ) + { + msg_t * msg = msg_allocate(); + msg->type = MSG_INTR_DUMP; + int send_rc = msg_send(intr_msgQ, msg); + if (send_rc != 0) + { + TRACFCOMP(g_trac_intr, ERR_MRK"IntrRp::printInterruptInfo error " + "sending print intr info message"); + /*@ errorlog tag + * @errortype ERRL_SEV_UNRECOVERABLE + * @moduleid INTR::MOD_INTR_DUMP + * @reasoncode INTR::RC_MESSAGE_SEND_ERROR + * @userdata1 RC from msg_send command + * @devdesc Error encountered sending print intr info + * message to INTRP + * @custdesc Error encountered gathering diagnostic info + */ + err = new ERRORLOG::ErrlEntry + ( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, // severity + INTR::MOD_INTR_DUMP, // moduleid + INTR::RC_MESSAGE_SEND_ERROR, // reason code + send_rc, + 0 + ); + } + } + else + { + /*@ errorlog tag + * @errortype ERRL_SEV_INFORMATIONAL + * @moduleid INTR::MOD_INTR_DUMP + * @reasoncode INTR::RC_RP_NOT_INITIALIZED + * @userdata1 MSG_INTR_DUMP + * @userdata2 0 + * @devdesc Interrupt resource provider not initialized yet. + * @custdesc Error encountered gathering diagnostic info + */ + err = new ERRORLOG::ErrlEntry + ( + ERRORLOG::ERRL_SEV_INFORMATIONAL, // severity + INTR::MOD_INTR_DUMP, // moduleid + INTR::RC_RP_NOT_INITIALIZED, // reason code + static_cast(MSG_INTR_DUMP), + 0 + ); + } + return err; +} + +void INTR::IntrRp::printLSIInfo() const +{ + TRACFCOMP(g_trac_intr, "---LSI Sources---"); + + //Read LSI Interrupt Status register from each enabled + // proc chip to see which caused the interrupt + for(auto targ_itr = iv_chipList.begin(); + targ_itr != iv_chipList.end(); ++targ_itr) + { + uint64_t l_mmioRead = (*targ_itr)->psiHbBaseAddr->lsiintstatus; + uint32_t l_huid = get_huid((*targ_itr)->proc); + TRACFCOMP(g_trac_intr, "Processor 0x%lx", l_huid); + TRACFCOMP(g_trac_intr, " lsiIntStatus : vAddr=0x%016lx Value=0x%016lx", &(*targ_itr)->psiHbBaseAddr->lsiintstatus , l_mmioRead); + l_mmioRead = (*targ_itr)->psiHbBaseAddr->lsiintlevel; + TRACFCOMP(g_trac_intr, " lsiIntLevel : vAddr=0x%016lx Value=0x%016lx", &(*targ_itr)->psiHbBaseAddr->lsiintlevel, l_mmioRead); + } +} + +void INTR::IntrRp::printPSIHBInfo() const +{ + TRACFCOMP(g_trac_intr, "---PSIHB Info---"); + //Read LSI Interrupt Status register from each enabled + // proc chip to see which caused the interrupt + for(auto targ_itr = iv_chipList.begin(); + targ_itr != iv_chipList.end(); ++targ_itr) + { + uint32_t l_huid = get_huid((*targ_itr)->proc); + uint64_t l_mmioRead = (*targ_itr)->psiHbBaseAddr->psihbcr; + + TRACFCOMP(g_trac_intr, "Processor 0x%lx", l_huid); + + TRACFCOMP(g_trac_intr, " PSIHB Ctrl/Status Reg : vAddr=0x%016lx Value=0x%016lx", + &(*targ_itr)->psiHbBaseAddr->psihbcr, l_mmioRead); + + l_mmioRead = (*targ_itr)->psiHbBaseAddr->psisemr; + TRACFCOMP(g_trac_intr, " PSIHB Error/Status Reg : vAddr=0x%016lx Value=0x%016lx", + &(*targ_itr)->psiHbBaseAddr->psisemr, l_mmioRead); + + l_mmioRead = (*targ_itr)->psiHbBaseAddr->phbdsr; + TRACFCOMP(g_trac_intr, " PSIHB Dbg Setting Reg : vAddr=0x%016lx Value=0x%016lx", + &(*targ_itr)->psiHbBaseAddr->phbdsr, l_mmioRead); + + l_mmioRead = (*targ_itr)->psiHbBaseAddr->icr; + TRACFCOMP(g_trac_intr, " PSIHB Interrupt Control Reg : vAddr=0x%016lx Value=0x%016lx", + &(*targ_itr)->psiHbBaseAddr->icr, l_mmioRead); + } +} + +void INTR::IntrRp::printEsbStates() const +{ + TRACFCOMP(g_trac_intr, "---ESB States---"); + for(auto targ_itr = iv_chipList.begin(); + targ_itr != iv_chipList.end(); ++targ_itr) + { + TRACFCOMP(g_trac_intr, "Processor 0x%lx", get_huid((*targ_itr)->proc)); + for (uint8_t i = 0; i < LSI_LAST_SOURCE; i++) + { + // Ready from the ESB_QUERY_OFFSET to ensure the read doesn't + // affect the state + uint64_t * l_psiHbEsbptr = (*targ_itr)->psiHbEsbBaseAddr + + (((i*PAGE_SIZE)+ESB_QUERY_OFFSET) /sizeof(uint64_t)); + + volatile uint64_t l_esbState = *l_psiHbEsbptr; + const char* l_esbStateString = nullptr; + + // Use toString method to look up human readable string + esbStateToString(l_esbState, &l_esbStateString); + + TRACFCOMP(g_trac_intr, " SRC: %02d State: %s", i , l_esbStateString ); + } + } +} diff --git a/src/usr/intr/intrrp.H b/src/usr/intr/intrrp.H index b0245a868..6b4b57197 100644 --- a/src/usr/intr/intrrp.H +++ b/src/usr/intr/intrrp.H @@ -231,10 +231,6 @@ namespace INTR ESB_QUERY_OFFSET = 0x800, ESB_OFF_OFFSET = 0xD00, ESB_RESET_OFFSET = 0XC00, - ESB_STATE_RESET = 0, - ESB_STATE_OFF = 1, - ESB_STATE_PENDING = 2, - ESB_STATE_QUEUED = 3, }; enum INTR_ROUTING_t @@ -889,6 +885,30 @@ namespace INTR */ errlHndl_t resetIntpForMpipl(void); + /** + * Print out the ESB state for every source on all processors + * that the Interrp is aware of + * + * @return void + */ + void printEsbStates() const; + + /** + * Print out the PSIHB info for all processors + * that the Interrp is aware of + * + * @return void + */ + void printPSIHBInfo() const; + + /** + * Print out the LSI info for all processors + * that the Interrp is aware of + * + * @return void + */ + void printLSIInfo() const; + }; }; // INTR namespace diff --git a/src/usr/mbox/mailboxsp.C b/src/usr/mbox/mailboxsp.C index 7bd1ccde5..926dcdc2d 100644 --- a/src/usr/mbox/mailboxsp.C +++ b/src/usr/mbox/mailboxsp.C @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -82,6 +83,7 @@ MailboxSp::MailboxSp() iv_sendq(), iv_respondq(), iv_dmaBuffer(), + iv_dmaRequestWatchdog(0), iv_trgt(NULL), iv_shutdown_msg(NULL), iv_rts(true), @@ -832,6 +834,17 @@ void MailboxSp::send_msg(mbox_msg_t * i_msg) &iv_msg_to_send, mbox_msg_len, DeviceFW::MAILBOX); + + // Create a watchdog task that will run for 60 seconds + // if there is no response in 60 seconds then dbg info will + // be printed in the slow trace buffer + if(iv_msg_to_send.msg_payload.type == MSG_REQUEST_DMA_BUFFERS + && !Util::isSimicsRunning() + && !iv_dmaRequestWatchdog) + { + iv_dmaRequestWatchdog = task_create(&watchdogTimeoutTask, this); + assert (iv_dmaRequestWatchdog > 0 ); + } } if(err) @@ -1448,6 +1461,89 @@ void MailboxSp::sendReclaimDmaBfrsMsg( mbox_msg_t & i_mbox_msg ) return; } +void * MailboxSp::watchdogTimeoutTask(void * i_mailboxSp) +{ + // We don't want this to be a zombie because parent keeps going + task_detach(); + + // create a task which we can wait, this way we can print + // an error message if the taskWorker crashes + tid_t l_tid = task_create( &watchdogTimeoutTaskWorker, i_mailboxSp); + assert (l_tid > 0 ); + + int l_status = 0; + void* l_rc = nullptr; + + tid_t l_tidRc = task_wait_tid(l_tid, &l_status, &l_rc); + + if(l_status == TASK_STATUS_CRASHED) + { + TRACFCOMP(g_trac_mbox, + ERR_MRK + "MailboxSp::watchdogTimeoutTask - " + "Watchdog timeout crashed!! %lx", l_tidRc); + } + + return nullptr; +} + +void * MailboxSp::watchdogTimeoutTaskWorker(void * i_mailboxSp) +{ + + uint64_t MAX_TIMEOUT = 200000000000; // nanoseconds + uint64_t POLL_RATE = 1000000; // nanoseconds + uint64_t cur_timeout = 0; // nanoseconds + errlHndl_t err = nullptr; + + assert(i_mailboxSp != nullptr, "nullptr was passed to watchdogTimeoutTaskWorker"); + + MailboxSp & mboxSp = *static_cast(i_mailboxSp); + + while(cur_timeout < MAX_TIMEOUT) + { + if( !mboxSp.iv_dma_pend ) + { + TRACFCOMP(g_trac_mbox, + INFO_MRK + "Breaking out of watchdog because FSP responded to DMA request"); + break; + } + // sleep for 1 ms + nanosleep(0, POLL_RATE); + cur_timeout += POLL_RATE; + } + + if(cur_timeout >= MAX_TIMEOUT) + { + TRACFCOMP(g_trac_mbox, + INFO_MRK + "Hang during DMA request detected, dumping state information"); + err = dumpMboxRegs(); + if(err) + { + TRACFCOMP(g_trac_mbox, + INFO_MRK + "Error occured while dumping MBOX information"); + err->collectTrace(MBOX_COMP_NAME); + errlCommit(err,MBOX_COMP_ID); + } + err = INTR::printInterruptInfo(); + if(err) + { + TRACFCOMP(g_trac_mbox, + INFO_MRK + "Error occured while dumping INTR information"); + err->collectTrace(INTR_COMP_NAME); + errlCommit(err,MBOX_COMP_ID); + } + } + + //Zero out the TID so another watchdog task can be created if needed + mboxSp.iv_dmaRequestWatchdog = 0; + return nullptr; + +} + errlHndl_t MailboxSp::msgq_register(queue_id_t i_queue_id, msg_q_t i_msgQ) { diff --git a/src/usr/mbox/mailboxsp.H b/src/usr/mbox/mailboxsp.H index 22fdf45e8..d1db0a88a 100644 --- a/src/usr/mbox/mailboxsp.H +++ b/src/usr/mbox/mailboxsp.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2017 */ +/* Contributors Listed Below - COPYRIGHT 2012,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -309,6 +309,21 @@ namespace MBOX */ void sendReclaimDmaBfrsMsg( void ); + /** + * Start the watchdogTimeoutTaskWorker and print + * out an error if it crashes + */ + static void * watchdogTimeoutTask(void * i_mailboxSp); + + /** + * Used to start a timer , if the timer expires then + * Hostboot will print out a bunch of MBOX and INTR + * error information to the SLOW buffer. This is used + * to collect debug information in the case where we + * are hanging, waiting for a response to an mailbox msg + */ + static void * watchdogTimeoutTaskWorker(void * i_mailboxSp); + /** * Determine if a Reclaim Bfr message is outstanding * @return [true - Msg active | false - no msg active] @@ -367,6 +382,7 @@ namespace MBOX registry_t iv_registry; //!< Registered queue DmaBuffer iv_dmaBuffer; //!< DMA buffer manager send_q_t iv_pendingq; //!< Pending for queue registration + tid_t iv_dmaRequestWatchdog; //!< TID of dma buffer request watchdog TARGETING::Target * iv_trgt;//!< mailbox device driver target msg_t * iv_shutdown_msg;//!< Message to shutdown mbox diff --git a/src/usr/mbox/mboxdd.C b/src/usr/mbox/mboxdd.C index c5c19df99..f0e74f328 100644 --- a/src/usr/mbox/mboxdd.C +++ b/src/usr/mbox/mboxdd.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2015 */ +/* Contributors Listed Below - COPYRIGHT 2012,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -29,11 +29,12 @@ #include #include #include +#include #include trace_desc_t* g_trac_mbox = NULL; -TRAC_INIT(&g_trac_mbox, "MBOX", KILOBYTE, TRACE::BUFFER_SLOW); //4K +TRAC_INIT(&g_trac_mbox, "MBOX", 16*KILOBYTE, TRACE::BUFFER_SLOW); //16K namespace MBOX @@ -697,6 +698,85 @@ errlHndl_t mboxddShutDown(TARGETING::Target* i_target) return err; } +errlHndl_t dumpMboxRegs() +{ + errlHndl_t l_err = nullptr; + TARGETING::TargetHandleList l_procList; + TARGETING::getAllChips( l_procList, TARGETING::TYPE_PROC); + assert(l_procList.size(), "No functional processors found"); + + TRACFCOMP(g_trac_mbox, "---Dumping Mbox registers---"); + + for( const auto l_procChip : l_procList) + { + uint32_t l_64bitBuf[2] = {0}; + size_t l_64bitSize = sizeof(uint64_t); + uint32_t l_huid = TARGETING::get_huid(l_procChip); + TRACFCOMP(g_trac_mbox, "Processor 0x%lx",l_huid); + + // Read the MBOX_DB_INT_REG_PIB + l_err = deviceOp(DeviceFW::READ,l_procChip, + l_64bitBuf,l_64bitSize, + DEVICE_XSCOM_ADDRESS(MBOX_DB_INT_REG_PIB)); + if (l_err) + { + TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read PIB Interrupt Register"); + break; + } + else + { + TRACFCOMP(g_trac_mbox, " PIB Interrupt Register (0x%08X) = 0x%08X", + MBOX_DB_INT_REG_PIB, l_64bitBuf[0]); + } + + // Read the MBOX_DB_STAT_CNTRL_1 + l_err = deviceOp(DeviceFW::READ,l_procChip, + l_64bitBuf,l_64bitSize, + DEVICE_XSCOM_ADDRESS(MBOX_DB_STAT_CNTRL_1)); + if (l_err) + { + TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read Doorbell Status/Control Register"); + break; + } + else + { + TRACFCOMP(g_trac_mbox, " Doorbell Status/Control Register (0x%08X) = 0x%08X", + MBOX_DB_STAT_CNTRL_1, l_64bitBuf[0]); + } + + // Read the MBOX_DB_ERR_STAT_PIB + l_err = deviceOp(DeviceFW::READ,l_procChip, + l_64bitBuf,l_64bitSize, + DEVICE_XSCOM_ADDRESS( MBOX_DB_ERR_STAT_LBUS)); + if (l_err) + { + TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read Doorbell Error/Status Register"); + break; + } + else + { + TRACFCOMP(g_trac_mbox, " Doorbell Error/Status Register (0x%08X) = 0x%08lx", + MBOX_DB_ERR_STAT_LBUS, l_64bitBuf[0]); + } + + for(uint8_t i = 0x0; i <= (MBOX_DATA_LBUS_END - MBOX_DATA_LBUS_START) ; i++) + { + // Read the MBOX_DATA_LBUS_START + i + l_err = deviceOp(DeviceFW::READ,l_procChip, + l_64bitBuf,l_64bitSize, + DEVICE_XSCOM_ADDRESS(MBOX_DATA_LBUS_START + i)); + if (l_err) + { + TRACFCOMP(g_trac_mbox, ERR_MRK "dumpMboxRegs> Unable to read MBOX_DATA_LBUS_START + %d Register", i); + break; + } + TRACFCOMP(g_trac_mbox, " MBOX_DATA_LBUS_START + %02d (0x%08X) = 0x%08lx", + i, MBOX_DATA_LBUS_START + i , l_64bitBuf[0]); + } + } + return l_err; +} + #if defined(__DESTRUCTIVE_MBOX_TEST__) void forceErrorOnNextOperation() { diff --git a/src/usr/mbox/mboxdd.H b/src/usr/mbox/mboxdd.H index 86fb1ee80..f79459615 100644 --- a/src/usr/mbox/mboxdd.H +++ b/src/usr/mbox/mboxdd.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -37,7 +37,7 @@ namespace MBOX /* * Mbox device driver public constants */ - enum + enum { MBOX_MAX_DATA_BYTES = 64, //16 32-bit Data Registers }; @@ -45,7 +45,7 @@ namespace MBOX /* * Mbox device driver status values */ - enum MboxReadStatus + enum MboxReadStatus { MBOX_DOORBELL_ERROR = 0x00000004, /* Error Set In Error Register */ MBOX_HW_ACK = 0x00000002, /* LBUS Data Acknowledgment */ @@ -58,7 +58,7 @@ namespace MBOX /** * @brief Initialize device driver hardware - * + * * @param[in] i_target, Chip target of the MBOX operation * @return errlHndl_t If scom error | NULL (success) */ @@ -112,6 +112,14 @@ namespace MBOX void* i_buffer, size_t& i_buflen); + /** + * @brief Print all the mailbox state information to slow trace + * buffer to aid in debug. + * + * @return errlHndl_t nullptr on success + */ + errlHndl_t dumpMboxRegs(); + /** * @brief Reads the mailbox PIB error status register * -- cgit v1.2.1