diff options
| author | Ilya Smirnov <ismirno@us.ibm.com> | 2019-04-11 09:42:19 -0500 |
|---|---|---|
| committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2019-04-24 13:52:59 -0500 |
| commit | a1f8b3160946b44d37ac947de70430f4a49b28cd (patch) | |
| tree | 5f09163e170d4d56e4b7b33857d4f00a03018729 /src | |
| parent | 2007c4f940856589a087452beac9a20359098b0b (diff) | |
| download | blackbird-hostboot-a1f8b3160946b44d37ac947de70430f4a49b28cd.tar.gz blackbird-hostboot-a1f8b3160946b44d37ac947de70430f4a49b28cd.zip | |
Add Child RC Checking to Thread Pool
Add logic for Thread Pool to return error log when
shutdown() is called if a child thread (or multiple
child threads) had crashed. This is a default behavior,
but a constructor option can switch this functionality
off.
The new logic will aid in debugging of the failures of
child processes, as the failures could be catched when
they occurred and not at some point after.
Change-Id: I9736d659a086701b8e4f18f41504df4864924d88
RTC: 208517
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/75897
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Nicholas E. Bofferding <bofferdn@us.ibm.com>
Reviewed-by: Michael Baiocchi <mbaiocch@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src')
| -rw-r--r-- | src/include/usr/util/impl/threadpool.H | 14 | ||||
| -rw-r--r-- | src/include/usr/util/threadpool.H | 20 | ||||
| -rw-r--r-- | src/include/usr/util/util_reasoncodes.H | 2 | ||||
| -rw-r--r-- | src/usr/diag/mdia/mdia.C | 12 | ||||
| -rw-r--r-- | src/usr/diag/mdia/mdiafwd.H | 6 | ||||
| -rw-r--r-- | src/usr/diag/mdia/mdiasm.C | 13 | ||||
| -rw-r--r-- | src/usr/diag/mdia/mdiasm.H | 4 | ||||
| -rw-r--r-- | src/usr/isteps/istep13/call_mss_draminit_trainadv.C | 13 | ||||
| -rw-r--r-- | src/usr/util/test/threadpool.H | 131 | ||||
| -rw-r--r-- | src/usr/util/threadpool.C | 54 |
10 files changed, 241 insertions, 28 deletions
diff --git a/src/include/usr/util/impl/threadpool.H b/src/include/usr/util/impl/threadpool.H index 84ee9afd7..5ad530057 100644 --- a/src/include/usr/util/impl/threadpool.H +++ b/src/include/usr/util/impl/threadpool.H @@ -5,7 +5,9 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ +/* [+] International Business Machines Corp. */ +/* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ @@ -38,6 +40,7 @@ #include <algorithm> #include <sys/sync.h> #include <sys/task.h> +#include <errl/errlentry.H> namespace Util { @@ -82,7 +85,11 @@ namespace __Util_ThreadPool_Impl /** Simple constructor, call __init to avoid the in-charge and * not-in-charge construction costs. */ - ThreadPoolImpl() { __init(); }; + ThreadPoolImpl(bool i_checkChildRc = true) : + iv_checkChildRc(i_checkChildRc) + { + __init(); + }; protected: /** Initialize the object. */ @@ -105,7 +112,7 @@ namespace __Util_ThreadPool_Impl */ void __start(start_fn_t fn, void* instance); /** Stop the thread-pool. */ - void __shutdown(); + errlHndl_t __shutdown(); /** Outstanding work-list. */ worklist_t iv_worklist; @@ -119,6 +126,7 @@ namespace __Util_ThreadPool_Impl std::list<tid_t> iv_children; /** State of object. */ bool iv_shutdown; + bool iv_checkChildRc; }; diff --git a/src/include/usr/util/threadpool.H b/src/include/usr/util/threadpool.H index f96e0b916..43fdeba7a 100644 --- a/src/include/usr/util/threadpool.H +++ b/src/include/usr/util/threadpool.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2015 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -58,6 +58,7 @@ #include <stdint.h> #include <util/traits/has_lessthan.H> #include <util/impl/threadpool.H> +#include <errl/errlentry.H> namespace Util { @@ -87,7 +88,10 @@ class ThreadPool : public Util::__Util_ThreadPool_Impl::ThreadPoolImpl { public: /** Basic Constructor. Initialize ThreadPool. */ - ThreadPool() : Util::__Util_ThreadPool_Impl::ThreadPoolImpl() { }; + ThreadPool(bool i_checkChildRc = true) : + Util::__Util_ThreadPool_Impl::ThreadPoolImpl(i_checkChildRc) + { + }; /** Basic Destructor. Ensures pool is properly shut down. */ ~ThreadPool() { shutdown(); }; @@ -98,12 +102,20 @@ class ThreadPool : public Util::__Util_ThreadPool_Impl::ThreadPoolImpl __start(reinterpret_cast<start_fn_t>(&run), this); }; /** @brief Completes outstanding work and destroys worker threads. + * Returns an error log when any child task crashes when + * iv_checkChildRc is set. * * @note This function will block until all work is completed and * worker threads are destroyed. + * + * @return nullptr if all child tasks finished (child status + * checking disabled); + * nullptr if all child tasks finished successfully, or a + * pointer to error log otherwise (child status checking + * enabled). */ - void shutdown() - { __shutdown(); }; + errlHndl_t shutdown() + { return __shutdown(); }; /** @brief Insert a work item onto the thread-pool's queue. * diff --git a/src/include/usr/util/util_reasoncodes.H b/src/include/usr/util/util_reasoncodes.H index 17ea0edfc..1f8146baf 100644 --- a/src/include/usr/util/util_reasoncodes.H +++ b/src/include/usr/util/util_reasoncodes.H @@ -53,6 +53,7 @@ namespace Util UTIL_MOD_GET_OBUS_PLL_BUCKET = 0x14, // UtilCommonAttr::getObusPllBucket UTIL_LIDMGR_CSTOR = 0x15, // UtilLidMgr::UtilLidMgr UTIL_MCL_PROCESS_SINGLE_COMP = 0x16, // UtilLidMgr::processSingleComponent + UTIL_MOD_TP_SHUTDOWN = 0x17, // Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown }; enum ReasonCode @@ -85,6 +86,7 @@ namespace Util UTIL_ERC_NO_MATCHING_FREQ = UTIL_COMP_ID | 0x1B, UTIL_LIDMGR_INVAL_LID_REQUEST = UTIL_COMP_ID | 0x1C, UTIL_LIDMGR_INVAL_COMP = UTIL_COMP_ID | 0x1D, + UTIL_RC_CHILD_TASK_FAILED = UTIL_COMP_ID | 0x1E, }; }; diff --git a/src/usr/diag/mdia/mdia.C b/src/usr/diag/mdia/mdia.C index a13f28e59..c4b2f4993 100644 --- a/src/usr/diag/mdia/mdia.C +++ b/src/usr/diag/mdia/mdia.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2018 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -115,7 +115,10 @@ errlHndl_t runStep(const TargetHandleList & i_targetList) // ensure threads and pools are shutdown when finished - doStepCleanup(globals); + if(nullptr == err) + { + err = doStepCleanup(globals); + } // If this step completes without the need for a reconfig due to an RCD // parity error, clear all RCD parity error counters. @@ -140,13 +143,14 @@ errlHndl_t runStep(const TargetHandleList & i_targetList) } -void doStepCleanup(const Globals & i_globals) +errlHndl_t doStepCleanup(const Globals & i_globals) { // stop the state machine - Singleton<StateMachine>::instance().shutdown(); + errlHndl_t l_errl = Singleton<StateMachine>::instance().shutdown(); // TODO ... stop the command monitor + return l_errl; } errlHndl_t processEvent(MaintCommandEvent & i_event) diff --git a/src/usr/diag/mdia/mdiafwd.H b/src/usr/diag/mdia/mdiafwd.H index e3395781e..eae069588 100644 --- a/src/usr/diag/mdia/mdiafwd.H +++ b/src/usr/diag/mdia/mdiafwd.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2018 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -211,8 +211,10 @@ errlHndl_t getWorkFlow( * @brief doStepCleanup shut down threads and pools on step exit * * @param[in] i_globals contains objects to be cleaned up + * + * @return nullptr on success; non-nullptr on error */ -void doStepCleanup(const Globals & i_globals); +errlHndl_t doStepCleanup(const Globals & i_globals); /** * @brief check if hw state has been changed for an mba diff --git a/src/usr/diag/mdia/mdiasm.C b/src/usr/diag/mdia/mdiasm.C index 486861e66..8ffa7e4be 100644 --- a/src/usr/diag/mdia/mdiasm.C +++ b/src/usr/diag/mdia/mdiasm.C @@ -1665,10 +1665,12 @@ void StateMachine::reset() mutex_unlock(&iv_mutex); } -void StateMachine::shutdown() +errlHndl_t StateMachine::shutdown() { mutex_lock(&iv_mutex); + errlHndl_t l_errl = nullptr; + Util::ThreadPool<WorkItem> * tp = iv_tp; CommandMonitor * monitor = iv_monitor; @@ -1684,7 +1686,7 @@ void StateMachine::shutdown() if(tp) { MDIA_FAST("Stopping threadPool..."); - tp->shutdown(); + l_errl = tp->shutdown(); delete tp; } @@ -1696,11 +1698,16 @@ void StateMachine::shutdown() } MDIA_FAST("sm: ...shutdown complete"); + return l_errl; } StateMachine::~StateMachine() { - shutdown(); + errlHndl_t l_errl = shutdown(); + if(l_errl) + { + errlCommit(l_errl, MDIA_COMP_ID); + } sync_cond_destroy(&iv_cond); mutex_destroy(&iv_mutex); diff --git a/src/usr/diag/mdia/mdiasm.H b/src/usr/diag/mdia/mdiasm.H index 924af567a..55cc8ed74 100644 --- a/src/usr/diag/mdia/mdiasm.H +++ b/src/usr/diag/mdia/mdiasm.H @@ -106,8 +106,10 @@ class StateMachine /** * @brief shutdown state machine + * + * @retval nullptr on success; non-nullptr on error */ - void shutdown(); + errlHndl_t shutdown(); /** * @brief processMaintCommandEvent process maint command event from prd diff --git a/src/usr/isteps/istep13/call_mss_draminit_trainadv.C b/src/usr/isteps/istep13/call_mss_draminit_trainadv.C index fc42f7264..f50916be7 100644 --- a/src/usr/isteps/istep13/call_mss_draminit_trainadv.C +++ b/src/usr/isteps/istep13/call_mss_draminit_trainadv.C @@ -154,7 +154,7 @@ void MembufWorkItem::operator()() mutex_unlock(&g_stepErrorMutex); // Commit Error - errlCommit( l_err, HWPF_COMP_ID ); + errlCommit( l_err, ISTEP_COMP_ID ); break; } @@ -210,7 +210,7 @@ void* call_mss_draminit_trainadv (void *io_pArgs) l_stepError.addErrorDetails( l_err ); // Commit Error - errlCommit( l_err, HWPF_COMP_ID ); + errlCommit( l_err, ISTEP_COMP_ID ); } TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, @@ -252,7 +252,14 @@ void* call_mss_draminit_trainadv (void *io_pArgs) tp.start(); //wait for all workitems to complete, then clean up all threads. - tp.shutdown(); + l_err = tp.shutdown(); + if(l_err) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + ERR_MRK"call_mss_draminit_trainadv: thread pool returned an error"); + l_stepError.addErrorDetails(l_err); + errlCommit(l_err, ISTEP_COMP_ID); + } } TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, diff --git a/src/usr/util/test/threadpool.H b/src/usr/util/test/threadpool.H index 9784540a9..e8f41e9a3 100644 --- a/src/usr/util/test/threadpool.H +++ b/src/usr/util/test/threadpool.H @@ -5,7 +5,9 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ +/* [+] International Business Machines Corp. */ +/* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ @@ -25,6 +27,14 @@ #include <cxxtest/TestSuite.H> #include <util/threadpool.H> +#include <errl/errlentry.H> +#include <errl/errlmanager.H> +#include <util/util_reasoncodes.H> +#include <hbotcompid.H> + +// Thread pool constructor flags +#define DISABLE_CHILD_RC_CHECKING false +#define CHECK_CHILD_RC true namespace __ThreadPoolTest { @@ -86,6 +96,17 @@ namespace __ThreadPoolTest uint64_t* iv_value; }; + /** A simple work item that causes the task to crash */ + struct CrashedTask + { + CrashedTask() {}; + void operator()() + { + uint8_t* l_ptr = nullptr; + *l_ptr = 1; + } + }; + }; @@ -96,7 +117,8 @@ class ThreadPoolTest: public CxxTest::TestSuite * ordered pools. */ void testSimpleWorkItem() { - Util::ThreadPool<__ThreadPoolTest::Simple> instance; + Util::ThreadPool<__ThreadPoolTest::Simple> + instance(DISABLE_CHILD_RC_CHECKING); uint64_t value = 0; instance.insert(new __ThreadPoolTest::Simple(&value)); @@ -108,7 +130,8 @@ class ThreadPoolTest: public CxxTest::TestSuite TS_FAIL("Value was not changed by Simple worker thread."); } - Util::ThreadPool<__ThreadPoolTest::SimpleOrdered> instance2; + Util::ThreadPool<__ThreadPoolTest::SimpleOrdered> + instance2(DISABLE_CHILD_RC_CHECKING); value = 0; instance2.insert(new __ThreadPoolTest::SimpleOrdered(&value)); @@ -127,7 +150,8 @@ class ThreadPoolTest: public CxxTest::TestSuite * threads as directed. */ void testThreadManager() { - Util::ThreadPool<__ThreadPoolTest::EnsureThreads> instance; + Util::ThreadPool<__ThreadPoolTest::EnsureThreads> + instance(DISABLE_CHILD_RC_CHECKING); barrier_t barrier; for (size_t count = 1; count < 5; count++) @@ -151,7 +175,8 @@ class ThreadPoolTest: public CxxTest::TestSuite /** Test that the order functions work on the thread pool. */ void testThreadOrder() { - Util::ThreadPool<__ThreadPoolTest::EnsureOrder> instance; + Util::ThreadPool<__ThreadPoolTest::EnsureOrder> + instance(DISABLE_CHILD_RC_CHECKING); uint64_t value = 0; // Ensure that adding work items in order works. @@ -191,6 +216,102 @@ class ThreadPoolTest: public CxxTest::TestSuite instance.start(); instance.shutdown(); } + + /** Test a good child task that doesn't return an error. */ + void testChildRCGoodTask() + { + TS_INFO(ENTER_MRK"testChildRCGoodTask"); + Util::ThreadPool<__ThreadPoolTest::Simple> + l_threadPool(CHECK_CHILD_RC); + uint64_t l_value = 0; + + do { + l_threadPool.insert(new __ThreadPoolTest::Simple(&l_value)); + l_threadPool.start(); + errlHndl_t l_errl = l_threadPool.shutdown(); + if(l_errl) + { + TS_FAIL("testChildRCGoodTask: an unexpected error log (EID 0x%.8%x) returned from the thread pool", l_errl->eid()); + errlCommit(l_errl, CXXTEST_COMP_ID); + break; + } + + if(l_value == 0) + { + TS_FAIL("testChildRCGoodTask: the test value was not changed by the child task"); + break; + } + + }while(0); + TS_INFO(EXIT_MRK"testChildRCGoodTask"); + } + + /** Test that the crashed task's error log is returned to thread pool + correctly */ + void testChildRCCrashedTask() + { + TS_INFO(ENTER_MRK"testChildRCCrashedTask"); + Util::ThreadPool<__ThreadPoolTest::CrashedTask> + l_threadPool(CHECK_CHILD_RC); + errlHndl_t l_errl = nullptr; + + do { + l_threadPool.insert(new __ThreadPoolTest::CrashedTask()); + l_threadPool.start(); + l_errl = l_threadPool.shutdown(); + if(!l_errl) + { + TS_FAIL("testChildRCCrashedTask: the thread pool did not return an error log as was expected"); + break; + } + + if(l_errl->moduleId() != Util::UTIL_MOD_TP_SHUTDOWN) + { + TS_FAIL("testChildRCCrashedTask: unexpected moduleId returned from EID 0x%.8x; expected: 0x%x; actual: 0x%x", + l_errl->eid(), + Util::UTIL_MOD_TP_SHUTDOWN, + l_errl->moduleId()); + break; + } + + if(l_errl->reasonCode() != Util::UTIL_RC_CHILD_TASK_FAILED) + { + TS_FAIL("testChildRCCrashedTask: unexpected return code from EID 0x%.8x; expected: 0x%x; actual 0x%x", + l_errl->eid(), + Util::UTIL_RC_CHILD_TASK_FAILED, + l_errl->reasonCode()); + break; + } + }while(0); + + if(l_errl) + { + ERRORLOG::errlCommit(l_errl, CXXTEST_COMP_ID); + } + TS_INFO(EXIT_MRK"testChildRCCrashedTask"); + } + + /* Test that error is not returned by crashed task when explicitly + not requested by thread pool */ + void testChildNoRCCrashedTask() + { + TS_INFO(ENTER_MRK"testChildNoRCCrashedTask"); + Util::ThreadPool<__ThreadPoolTest::CrashedTask> + l_threadPool(DISABLE_CHILD_RC_CHECKING); + errlHndl_t l_errl = nullptr; + + l_threadPool.insert(new __ThreadPoolTest::CrashedTask()); + l_threadPool.start(); + l_errl = l_threadPool.shutdown(); + + if(l_errl) + { + TS_FAIL("testChildNoRCCrashedTask: unexpected error returned from the thread pool (EID 0x%.8x)", l_errl->eid()); + ERRORLOG::errlCommit(l_errl, CXXTEST_COMP_ID); + } + + TS_INFO(EXIT_MRK"testChildNoRCCrashedTask"); + } }; #endif diff --git a/src/usr/util/threadpool.C b/src/usr/util/threadpool.C index d2b156839..b51cca642 100644 --- a/src/usr/util/threadpool.C +++ b/src/usr/util/threadpool.C @@ -5,7 +5,9 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2019 */ +/* [+] International Business Machines Corp. */ +/* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ @@ -23,6 +25,9 @@ #include <util/threadpool.H> #include <sys/task.h> #include <sys/misc.h> +#include <util/util_reasoncodes.H> +#include <errl/errlmanager.H> +#include <hbotcompid.H> void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__init() { @@ -96,10 +101,15 @@ void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__start( mutex_unlock(&iv_mutex); } -void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown() +errlHndl_t Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown() { mutex_lock(&iv_mutex); + int l_childRc = 0; + void* l_childRetval = nullptr; + errlHndl_t l_origError = nullptr; + errlHndl_t l_errl = nullptr; + // Set shutdown status and signal all children to release from their // condition variable. iv_shutdown = true; @@ -109,14 +119,52 @@ void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown() while(!iv_children.empty()) { tid_t child = iv_children.front(); + tid_t l_returnedTid = 0; iv_children.pop_front(); mutex_unlock(&iv_mutex); - task_wait_tid(child, NULL, NULL); // Don't need status. + l_returnedTid = task_wait_tid(child, &l_childRc, &l_childRetval); + if(iv_checkChildRc && + ((l_returnedTid != child) || + (l_childRc != TASK_STATUS_EXITED_CLEAN))) + { + /** + * @errortype + * @moduleid UTIL_MOD_TP_SHUTDOWN + * @reasoncode UTIL_RC_CHILD_TASK_FAILED + * @userdata1 The return code of the child thread + * @userdata2[0:31] The returned task ID of the child thread + * @userdata2[32:63] The original task ID of the child thread + * @devdesc The child thread of a thread pool returned an + * error + * @custdesc A failure occurred during the IPL of the system + */ + l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_UNRECOVERABLE, + UTIL_MOD_TP_SHUTDOWN, + UTIL_RC_CHILD_TASK_FAILED, + l_childRc, + TWO_UINT32_TO_UINT64(l_returnedTid, + child), + ERRORLOG::ErrlEntry::ADD_SW_CALLOUT + ); + l_errl->collectTrace(UTIL_COMP_NAME); + + if(!l_origError) + { + l_origError = l_errl; + l_errl = nullptr; + } + else + { + l_errl->plid(l_origError->plid()); + errlCommit(l_errl, UTIL_COMP_ID); + } + } mutex_lock(&iv_mutex); } mutex_unlock(&iv_mutex); + return l_origError; } // Default thread count of one per HW thread. |

