summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorIlya Smirnov <ismirno@us.ibm.com>2019-04-11 09:42:19 -0500
committerDaniel M. Crowell <dcrowell@us.ibm.com>2019-04-24 13:52:59 -0500
commita1f8b3160946b44d37ac947de70430f4a49b28cd (patch)
tree5f09163e170d4d56e4b7b33857d4f00a03018729 /src
parent2007c4f940856589a087452beac9a20359098b0b (diff)
downloadblackbird-hostboot-a1f8b3160946b44d37ac947de70430f4a49b28cd.tar.gz
blackbird-hostboot-a1f8b3160946b44d37ac947de70430f4a49b28cd.zip
Add Child RC Checking to Thread Pool
Add logic for Thread Pool to return error log when shutdown() is called if a child thread (or multiple child threads) had crashed. This is a default behavior, but a constructor option can switch this functionality off. The new logic will aid in debugging of the failures of child processes, as the failures could be catched when they occurred and not at some point after. Change-Id: I9736d659a086701b8e4f18f41504df4864924d88 RTC: 208517 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/75897 Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Reviewed-by: Nicholas E. Bofferding <bofferdn@us.ibm.com> Reviewed-by: Michael Baiocchi <mbaiocch@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src')
-rw-r--r--src/include/usr/util/impl/threadpool.H14
-rw-r--r--src/include/usr/util/threadpool.H20
-rw-r--r--src/include/usr/util/util_reasoncodes.H2
-rw-r--r--src/usr/diag/mdia/mdia.C12
-rw-r--r--src/usr/diag/mdia/mdiafwd.H6
-rw-r--r--src/usr/diag/mdia/mdiasm.C13
-rw-r--r--src/usr/diag/mdia/mdiasm.H4
-rw-r--r--src/usr/isteps/istep13/call_mss_draminit_trainadv.C13
-rw-r--r--src/usr/util/test/threadpool.H131
-rw-r--r--src/usr/util/threadpool.C54
10 files changed, 241 insertions, 28 deletions
diff --git a/src/include/usr/util/impl/threadpool.H b/src/include/usr/util/impl/threadpool.H
index 84ee9afd7..5ad530057 100644
--- a/src/include/usr/util/impl/threadpool.H
+++ b/src/include/usr/util/impl/threadpool.H
@@ -5,7 +5,9 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* COPYRIGHT International Business Machines Corp. 2012,2014 */
+/* Contributors Listed Below - COPYRIGHT 2012,2019 */
+/* [+] International Business Machines Corp. */
+/* */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
@@ -38,6 +40,7 @@
#include <algorithm>
#include <sys/sync.h>
#include <sys/task.h>
+#include <errl/errlentry.H>
namespace Util
{
@@ -82,7 +85,11 @@ namespace __Util_ThreadPool_Impl
/** Simple constructor, call __init to avoid the in-charge and
* not-in-charge construction costs. */
- ThreadPoolImpl() { __init(); };
+ ThreadPoolImpl(bool i_checkChildRc = true) :
+ iv_checkChildRc(i_checkChildRc)
+ {
+ __init();
+ };
protected:
/** Initialize the object. */
@@ -105,7 +112,7 @@ namespace __Util_ThreadPool_Impl
*/
void __start(start_fn_t fn, void* instance);
/** Stop the thread-pool. */
- void __shutdown();
+ errlHndl_t __shutdown();
/** Outstanding work-list. */
worklist_t iv_worklist;
@@ -119,6 +126,7 @@ namespace __Util_ThreadPool_Impl
std::list<tid_t> iv_children;
/** State of object. */
bool iv_shutdown;
+ bool iv_checkChildRc;
};
diff --git a/src/include/usr/util/threadpool.H b/src/include/usr/util/threadpool.H
index f96e0b916..43fdeba7a 100644
--- a/src/include/usr/util/threadpool.H
+++ b/src/include/usr/util/threadpool.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2012,2015 */
+/* Contributors Listed Below - COPYRIGHT 2012,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -58,6 +58,7 @@
#include <stdint.h>
#include <util/traits/has_lessthan.H>
#include <util/impl/threadpool.H>
+#include <errl/errlentry.H>
namespace Util
{
@@ -87,7 +88,10 @@ class ThreadPool : public Util::__Util_ThreadPool_Impl::ThreadPoolImpl
{
public:
/** Basic Constructor. Initialize ThreadPool. */
- ThreadPool() : Util::__Util_ThreadPool_Impl::ThreadPoolImpl() { };
+ ThreadPool(bool i_checkChildRc = true) :
+ Util::__Util_ThreadPool_Impl::ThreadPoolImpl(i_checkChildRc)
+ {
+ };
/** Basic Destructor. Ensures pool is properly shut down. */
~ThreadPool() { shutdown(); };
@@ -98,12 +102,20 @@ class ThreadPool : public Util::__Util_ThreadPool_Impl::ThreadPoolImpl
__start(reinterpret_cast<start_fn_t>(&run), this);
};
/** @brief Completes outstanding work and destroys worker threads.
+ * Returns an error log when any child task crashes when
+ * iv_checkChildRc is set.
*
* @note This function will block until all work is completed and
* worker threads are destroyed.
+ *
+ * @return nullptr if all child tasks finished (child status
+ * checking disabled);
+ * nullptr if all child tasks finished successfully, or a
+ * pointer to error log otherwise (child status checking
+ * enabled).
*/
- void shutdown()
- { __shutdown(); };
+ errlHndl_t shutdown()
+ { return __shutdown(); };
/** @brief Insert a work item onto the thread-pool's queue.
*
diff --git a/src/include/usr/util/util_reasoncodes.H b/src/include/usr/util/util_reasoncodes.H
index 17ea0edfc..1f8146baf 100644
--- a/src/include/usr/util/util_reasoncodes.H
+++ b/src/include/usr/util/util_reasoncodes.H
@@ -53,6 +53,7 @@ namespace Util
UTIL_MOD_GET_OBUS_PLL_BUCKET = 0x14, // UtilCommonAttr::getObusPllBucket
UTIL_LIDMGR_CSTOR = 0x15, // UtilLidMgr::UtilLidMgr
UTIL_MCL_PROCESS_SINGLE_COMP = 0x16, // UtilLidMgr::processSingleComponent
+ UTIL_MOD_TP_SHUTDOWN = 0x17, // Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown
};
enum ReasonCode
@@ -85,6 +86,7 @@ namespace Util
UTIL_ERC_NO_MATCHING_FREQ = UTIL_COMP_ID | 0x1B,
UTIL_LIDMGR_INVAL_LID_REQUEST = UTIL_COMP_ID | 0x1C,
UTIL_LIDMGR_INVAL_COMP = UTIL_COMP_ID | 0x1D,
+ UTIL_RC_CHILD_TASK_FAILED = UTIL_COMP_ID | 0x1E,
};
};
diff --git a/src/usr/diag/mdia/mdia.C b/src/usr/diag/mdia/mdia.C
index a13f28e59..c4b2f4993 100644
--- a/src/usr/diag/mdia/mdia.C
+++ b/src/usr/diag/mdia/mdia.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2012,2018 */
+/* Contributors Listed Below - COPYRIGHT 2012,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -115,7 +115,10 @@ errlHndl_t runStep(const TargetHandleList & i_targetList)
// ensure threads and pools are shutdown when finished
- doStepCleanup(globals);
+ if(nullptr == err)
+ {
+ err = doStepCleanup(globals);
+ }
// If this step completes without the need for a reconfig due to an RCD
// parity error, clear all RCD parity error counters.
@@ -140,13 +143,14 @@ errlHndl_t runStep(const TargetHandleList & i_targetList)
}
-void doStepCleanup(const Globals & i_globals)
+errlHndl_t doStepCleanup(const Globals & i_globals)
{
// stop the state machine
- Singleton<StateMachine>::instance().shutdown();
+ errlHndl_t l_errl = Singleton<StateMachine>::instance().shutdown();
// TODO ... stop the command monitor
+ return l_errl;
}
errlHndl_t processEvent(MaintCommandEvent & i_event)
diff --git a/src/usr/diag/mdia/mdiafwd.H b/src/usr/diag/mdia/mdiafwd.H
index e3395781e..eae069588 100644
--- a/src/usr/diag/mdia/mdiafwd.H
+++ b/src/usr/diag/mdia/mdiafwd.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2012,2018 */
+/* Contributors Listed Below - COPYRIGHT 2012,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -211,8 +211,10 @@ errlHndl_t getWorkFlow(
* @brief doStepCleanup shut down threads and pools on step exit
*
* @param[in] i_globals contains objects to be cleaned up
+ *
+ * @return nullptr on success; non-nullptr on error
*/
-void doStepCleanup(const Globals & i_globals);
+errlHndl_t doStepCleanup(const Globals & i_globals);
/**
* @brief check if hw state has been changed for an mba
diff --git a/src/usr/diag/mdia/mdiasm.C b/src/usr/diag/mdia/mdiasm.C
index 486861e66..8ffa7e4be 100644
--- a/src/usr/diag/mdia/mdiasm.C
+++ b/src/usr/diag/mdia/mdiasm.C
@@ -1665,10 +1665,12 @@ void StateMachine::reset()
mutex_unlock(&iv_mutex);
}
-void StateMachine::shutdown()
+errlHndl_t StateMachine::shutdown()
{
mutex_lock(&iv_mutex);
+ errlHndl_t l_errl = nullptr;
+
Util::ThreadPool<WorkItem> * tp = iv_tp;
CommandMonitor * monitor = iv_monitor;
@@ -1684,7 +1686,7 @@ void StateMachine::shutdown()
if(tp)
{
MDIA_FAST("Stopping threadPool...");
- tp->shutdown();
+ l_errl = tp->shutdown();
delete tp;
}
@@ -1696,11 +1698,16 @@ void StateMachine::shutdown()
}
MDIA_FAST("sm: ...shutdown complete");
+ return l_errl;
}
StateMachine::~StateMachine()
{
- shutdown();
+ errlHndl_t l_errl = shutdown();
+ if(l_errl)
+ {
+ errlCommit(l_errl, MDIA_COMP_ID);
+ }
sync_cond_destroy(&iv_cond);
mutex_destroy(&iv_mutex);
diff --git a/src/usr/diag/mdia/mdiasm.H b/src/usr/diag/mdia/mdiasm.H
index 924af567a..55cc8ed74 100644
--- a/src/usr/diag/mdia/mdiasm.H
+++ b/src/usr/diag/mdia/mdiasm.H
@@ -106,8 +106,10 @@ class StateMachine
/**
* @brief shutdown state machine
+ *
+ * @retval nullptr on success; non-nullptr on error
*/
- void shutdown();
+ errlHndl_t shutdown();
/**
* @brief processMaintCommandEvent process maint command event from prd
diff --git a/src/usr/isteps/istep13/call_mss_draminit_trainadv.C b/src/usr/isteps/istep13/call_mss_draminit_trainadv.C
index fc42f7264..f50916be7 100644
--- a/src/usr/isteps/istep13/call_mss_draminit_trainadv.C
+++ b/src/usr/isteps/istep13/call_mss_draminit_trainadv.C
@@ -154,7 +154,7 @@ void MembufWorkItem::operator()()
mutex_unlock(&g_stepErrorMutex);
// Commit Error
- errlCommit( l_err, HWPF_COMP_ID );
+ errlCommit( l_err, ISTEP_COMP_ID );
break;
}
@@ -210,7 +210,7 @@ void* call_mss_draminit_trainadv (void *io_pArgs)
l_stepError.addErrorDetails( l_err );
// Commit Error
- errlCommit( l_err, HWPF_COMP_ID );
+ errlCommit( l_err, ISTEP_COMP_ID );
}
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
@@ -252,7 +252,14 @@ void* call_mss_draminit_trainadv (void *io_pArgs)
tp.start();
//wait for all workitems to complete, then clean up all threads.
- tp.shutdown();
+ l_err = tp.shutdown();
+ if(l_err)
+ {
+ TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
+ ERR_MRK"call_mss_draminit_trainadv: thread pool returned an error");
+ l_stepError.addErrorDetails(l_err);
+ errlCommit(l_err, ISTEP_COMP_ID);
+ }
}
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
diff --git a/src/usr/util/test/threadpool.H b/src/usr/util/test/threadpool.H
index 9784540a9..e8f41e9a3 100644
--- a/src/usr/util/test/threadpool.H
+++ b/src/usr/util/test/threadpool.H
@@ -5,7 +5,9 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* COPYRIGHT International Business Machines Corp. 2012,2014 */
+/* Contributors Listed Below - COPYRIGHT 2012,2019 */
+/* [+] International Business Machines Corp. */
+/* */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
@@ -25,6 +27,14 @@
#include <cxxtest/TestSuite.H>
#include <util/threadpool.H>
+#include <errl/errlentry.H>
+#include <errl/errlmanager.H>
+#include <util/util_reasoncodes.H>
+#include <hbotcompid.H>
+
+// Thread pool constructor flags
+#define DISABLE_CHILD_RC_CHECKING false
+#define CHECK_CHILD_RC true
namespace __ThreadPoolTest
{
@@ -86,6 +96,17 @@ namespace __ThreadPoolTest
uint64_t* iv_value;
};
+ /** A simple work item that causes the task to crash */
+ struct CrashedTask
+ {
+ CrashedTask() {};
+ void operator()()
+ {
+ uint8_t* l_ptr = nullptr;
+ *l_ptr = 1;
+ }
+ };
+
};
@@ -96,7 +117,8 @@ class ThreadPoolTest: public CxxTest::TestSuite
* ordered pools. */
void testSimpleWorkItem()
{
- Util::ThreadPool<__ThreadPoolTest::Simple> instance;
+ Util::ThreadPool<__ThreadPoolTest::Simple>
+ instance(DISABLE_CHILD_RC_CHECKING);
uint64_t value = 0;
instance.insert(new __ThreadPoolTest::Simple(&value));
@@ -108,7 +130,8 @@ class ThreadPoolTest: public CxxTest::TestSuite
TS_FAIL("Value was not changed by Simple worker thread.");
}
- Util::ThreadPool<__ThreadPoolTest::SimpleOrdered> instance2;
+ Util::ThreadPool<__ThreadPoolTest::SimpleOrdered>
+ instance2(DISABLE_CHILD_RC_CHECKING);
value = 0;
instance2.insert(new __ThreadPoolTest::SimpleOrdered(&value));
@@ -127,7 +150,8 @@ class ThreadPoolTest: public CxxTest::TestSuite
* threads as directed. */
void testThreadManager()
{
- Util::ThreadPool<__ThreadPoolTest::EnsureThreads> instance;
+ Util::ThreadPool<__ThreadPoolTest::EnsureThreads>
+ instance(DISABLE_CHILD_RC_CHECKING);
barrier_t barrier;
for (size_t count = 1; count < 5; count++)
@@ -151,7 +175,8 @@ class ThreadPoolTest: public CxxTest::TestSuite
/** Test that the order functions work on the thread pool. */
void testThreadOrder()
{
- Util::ThreadPool<__ThreadPoolTest::EnsureOrder> instance;
+ Util::ThreadPool<__ThreadPoolTest::EnsureOrder>
+ instance(DISABLE_CHILD_RC_CHECKING);
uint64_t value = 0;
// Ensure that adding work items in order works.
@@ -191,6 +216,102 @@ class ThreadPoolTest: public CxxTest::TestSuite
instance.start();
instance.shutdown();
}
+
+ /** Test a good child task that doesn't return an error. */
+ void testChildRCGoodTask()
+ {
+ TS_INFO(ENTER_MRK"testChildRCGoodTask");
+ Util::ThreadPool<__ThreadPoolTest::Simple>
+ l_threadPool(CHECK_CHILD_RC);
+ uint64_t l_value = 0;
+
+ do {
+ l_threadPool.insert(new __ThreadPoolTest::Simple(&l_value));
+ l_threadPool.start();
+ errlHndl_t l_errl = l_threadPool.shutdown();
+ if(l_errl)
+ {
+ TS_FAIL("testChildRCGoodTask: an unexpected error log (EID 0x%.8%x) returned from the thread pool", l_errl->eid());
+ errlCommit(l_errl, CXXTEST_COMP_ID);
+ break;
+ }
+
+ if(l_value == 0)
+ {
+ TS_FAIL("testChildRCGoodTask: the test value was not changed by the child task");
+ break;
+ }
+
+ }while(0);
+ TS_INFO(EXIT_MRK"testChildRCGoodTask");
+ }
+
+ /** Test that the crashed task's error log is returned to thread pool
+ correctly */
+ void testChildRCCrashedTask()
+ {
+ TS_INFO(ENTER_MRK"testChildRCCrashedTask");
+ Util::ThreadPool<__ThreadPoolTest::CrashedTask>
+ l_threadPool(CHECK_CHILD_RC);
+ errlHndl_t l_errl = nullptr;
+
+ do {
+ l_threadPool.insert(new __ThreadPoolTest::CrashedTask());
+ l_threadPool.start();
+ l_errl = l_threadPool.shutdown();
+ if(!l_errl)
+ {
+ TS_FAIL("testChildRCCrashedTask: the thread pool did not return an error log as was expected");
+ break;
+ }
+
+ if(l_errl->moduleId() != Util::UTIL_MOD_TP_SHUTDOWN)
+ {
+ TS_FAIL("testChildRCCrashedTask: unexpected moduleId returned from EID 0x%.8x; expected: 0x%x; actual: 0x%x",
+ l_errl->eid(),
+ Util::UTIL_MOD_TP_SHUTDOWN,
+ l_errl->moduleId());
+ break;
+ }
+
+ if(l_errl->reasonCode() != Util::UTIL_RC_CHILD_TASK_FAILED)
+ {
+ TS_FAIL("testChildRCCrashedTask: unexpected return code from EID 0x%.8x; expected: 0x%x; actual 0x%x",
+ l_errl->eid(),
+ Util::UTIL_RC_CHILD_TASK_FAILED,
+ l_errl->reasonCode());
+ break;
+ }
+ }while(0);
+
+ if(l_errl)
+ {
+ ERRORLOG::errlCommit(l_errl, CXXTEST_COMP_ID);
+ }
+ TS_INFO(EXIT_MRK"testChildRCCrashedTask");
+ }
+
+ /* Test that error is not returned by crashed task when explicitly
+ not requested by thread pool */
+ void testChildNoRCCrashedTask()
+ {
+ TS_INFO(ENTER_MRK"testChildNoRCCrashedTask");
+ Util::ThreadPool<__ThreadPoolTest::CrashedTask>
+ l_threadPool(DISABLE_CHILD_RC_CHECKING);
+ errlHndl_t l_errl = nullptr;
+
+ l_threadPool.insert(new __ThreadPoolTest::CrashedTask());
+ l_threadPool.start();
+ l_errl = l_threadPool.shutdown();
+
+ if(l_errl)
+ {
+ TS_FAIL("testChildNoRCCrashedTask: unexpected error returned from the thread pool (EID 0x%.8x)", l_errl->eid());
+ ERRORLOG::errlCommit(l_errl, CXXTEST_COMP_ID);
+ }
+
+ TS_INFO(EXIT_MRK"testChildNoRCCrashedTask");
+ }
};
#endif
diff --git a/src/usr/util/threadpool.C b/src/usr/util/threadpool.C
index d2b156839..b51cca642 100644
--- a/src/usr/util/threadpool.C
+++ b/src/usr/util/threadpool.C
@@ -5,7 +5,9 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* COPYRIGHT International Business Machines Corp. 2012,2014 */
+/* Contributors Listed Below - COPYRIGHT 2012,2019 */
+/* [+] International Business Machines Corp. */
+/* */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
@@ -23,6 +25,9 @@
#include <util/threadpool.H>
#include <sys/task.h>
#include <sys/misc.h>
+#include <util/util_reasoncodes.H>
+#include <errl/errlmanager.H>
+#include <hbotcompid.H>
void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__init()
{
@@ -96,10 +101,15 @@ void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__start(
mutex_unlock(&iv_mutex);
}
-void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown()
+errlHndl_t Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown()
{
mutex_lock(&iv_mutex);
+ int l_childRc = 0;
+ void* l_childRetval = nullptr;
+ errlHndl_t l_origError = nullptr;
+ errlHndl_t l_errl = nullptr;
+
// Set shutdown status and signal all children to release from their
// condition variable.
iv_shutdown = true;
@@ -109,14 +119,52 @@ void Util::__Util_ThreadPool_Impl::ThreadPoolImpl::__shutdown()
while(!iv_children.empty())
{
tid_t child = iv_children.front();
+ tid_t l_returnedTid = 0;
iv_children.pop_front();
mutex_unlock(&iv_mutex);
- task_wait_tid(child, NULL, NULL); // Don't need status.
+ l_returnedTid = task_wait_tid(child, &l_childRc, &l_childRetval);
+ if(iv_checkChildRc &&
+ ((l_returnedTid != child) ||
+ (l_childRc != TASK_STATUS_EXITED_CLEAN)))
+ {
+ /**
+ * @errortype
+ * @moduleid UTIL_MOD_TP_SHUTDOWN
+ * @reasoncode UTIL_RC_CHILD_TASK_FAILED
+ * @userdata1 The return code of the child thread
+ * @userdata2[0:31] The returned task ID of the child thread
+ * @userdata2[32:63] The original task ID of the child thread
+ * @devdesc The child thread of a thread pool returned an
+ * error
+ * @custdesc A failure occurred during the IPL of the system
+ */
+ l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_UNRECOVERABLE,
+ UTIL_MOD_TP_SHUTDOWN,
+ UTIL_RC_CHILD_TASK_FAILED,
+ l_childRc,
+ TWO_UINT32_TO_UINT64(l_returnedTid,
+ child),
+ ERRORLOG::ErrlEntry::ADD_SW_CALLOUT
+ );
+ l_errl->collectTrace(UTIL_COMP_NAME);
+
+ if(!l_origError)
+ {
+ l_origError = l_errl;
+ l_errl = nullptr;
+ }
+ else
+ {
+ l_errl->plid(l_origError->plid());
+ errlCommit(l_errl, UTIL_COMP_ID);
+ }
+ }
mutex_lock(&iv_mutex);
}
mutex_unlock(&iv_mutex);
+ return l_origError;
}
// Default thread count of one per HW thread.
OpenPOWER on IntegriCloud