diff options
author | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2016-05-05 16:15:57 +0000 |
---|---|---|
committer | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2016-05-05 16:15:57 +0000 |
commit | 11dc82fa83b9e058dee690584d7ba4c0380fbf64 (patch) | |
tree | a7eda7b548953e1949e86cbe62a0cd5f63366023 | |
parent | 3e0b7837bfcabb877e5f1a6747335409d367be76 (diff) | |
download | bcm5719-llvm-11dc82fa83b9e058dee690584d7ba4c0380fbf64.tar.gz bcm5719-llvm-11dc82fa83b9e058dee690584d7ba4c0380fbf64.zip |
[STATS] Use partitioned timer scheme
This change removes the current timers with ones that partition time properly.
The current timers are nested, so that if a new timer, B, starts when the
current timer, A, is already timing, A's time will include B's. To eliminate
this problem, the partitioned timers are designed to stop the current timer (A),
let the new timer run (B), and when the new timer is finished, restart the
previously running timer (A). With this partitioning of time, a threads' timers
all sum up to the OMP_worker_thread_life time and can now easily show the
percentage of time a thread is spending in different parts of the runtime or
user code.
There is also a new state variable associated with each thread which tells where
it is executing a task. This corresponds with the timers: OMP_task_*, e.g., if
time is spent in OMP_task_taskwait, then that thread executed tasks inside a
#pragma omp taskwait construct.
The changes are mostly changing the MACROs to use the new PARITIONED_* macros,
the new partitionedTimers class and its methods, and new state logic.
Differential Revision: http://reviews.llvm.org/D19229
llvm-svn: 268640
-rw-r--r-- | openmp/runtime/src/kmp_barrier.cpp | 18 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_csupport.c | 19 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_dispatch.cpp | 2 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_runtime.c | 23 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_sched.cpp | 2 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_stats.cpp | 79 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_stats.h | 205 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_stats_timing.h | 24 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_tasking.c | 26 | ||||
-rw-r--r-- | openmp/runtime/src/kmp_wait_release.h | 30 | ||||
-rw-r--r-- | openmp/runtime/src/z_Linux_util.c | 9 |
11 files changed, 369 insertions, 68 deletions
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index 6fef085f6b6..b51fdeea6b3 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -1048,6 +1048,8 @@ __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *)) { KMP_TIME_DEVELOPER_BLOCK(KMP_barrier); + KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); + KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); register int tid = __kmp_tid_from_gtid(gtid); register kmp_info_t *this_thr = __kmp_threads[gtid]; register kmp_team_t *team = this_thr->th.th_team; @@ -1348,6 +1350,8 @@ __kmp_end_split_barrier(enum barrier_type bt, int gtid) void __kmp_join_barrier(int gtid) { + KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier); + KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier); register kmp_info_t *this_thr = __kmp_threads[gtid]; register kmp_team_t *team; @@ -1463,6 +1467,18 @@ __kmp_join_barrier(int gtid) __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj) ); } +#if KMP_STATS_ENABLED + // Have master thread flag the workers to indicate they are now waiting for + // next parallel region, Also wake them up so they switch their timers to idle. + for (int i=0; i<team->t.t_nproc; ++i) { + kmp_info_t* team_thread = team->t.t_threads[i]; + if (team_thread == this_thr) + continue; + team_thread->th.th_stats->setIdleFlag(); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL) + __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc); + } +#endif #if USE_ITT_BUILD if (__itt_sync_create_ptr || KMP_ITT_DEBUG) __kmp_itt_barrier_middle(gtid, itt_sync_obj); @@ -1546,6 +1562,8 @@ __kmp_join_barrier(int gtid) void __kmp_fork_barrier(int gtid, int tid) { + KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier); + KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier); kmp_info_t *this_thr = __kmp_threads[gtid]; kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c index aa934ae26b6..2c2a08f8307 100644 --- a/openmp/runtime/src/kmp_csupport.c +++ b/openmp/runtime/src/kmp_csupport.c @@ -290,7 +290,6 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) } else { - KMP_STOP_EXPLICIT_TIMER(OMP_serial); KMP_COUNT_BLOCK(OMP_PARALLEL); } #endif @@ -345,10 +344,6 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) } #endif } -#if (KMP_STATS_ENABLED) - if (!inParallel) - KMP_START_EXPLICIT_TIMER(OMP_serial); -#endif } #if OMP_40_ENABLED @@ -669,7 +664,6 @@ void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { KMP_COUNT_BLOCK(OMP_BARRIER); - KMP_TIME_BLOCK(OMP_barrier); KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) ); if (! TCR_4(__kmp_init_parallel)) @@ -713,7 +707,7 @@ __kmpc_master(ident_t *loc, kmp_int32 global_tid) if( KMP_MASTER_GTID( global_tid )) { KMP_COUNT_BLOCK(OMP_MASTER); - KMP_START_EXPLICIT_TIMER(OMP_master); + KMP_PUSH_PARTITIONED_TIMER(OMP_master); status = 1; } @@ -763,7 +757,7 @@ __kmpc_end_master(ident_t *loc, kmp_int32 global_tid) KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) ); KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid )); - KMP_STOP_EXPLICIT_TIMER(OMP_master); + KMP_POP_PARTITIONED_TIMER(); #if OMPT_SUPPORT && OMPT_TRACE kmp_info_t *this_thr = __kmp_threads[ global_tid ]; @@ -1088,7 +1082,7 @@ __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none); #else KMP_COUNT_BLOCK(OMP_CRITICAL); - KMP_TIME_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */ + KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */ kmp_user_lock_p lck; KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) ); @@ -1250,6 +1244,7 @@ __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_nam __kmp_itt_critical_acquired( lck ); #endif /* USE_ITT_BUILD */ + KMP_PUSH_PARTITIONED_TIMER(OMP_critical); KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid )); } // __kmpc_critical_with_hint @@ -1342,7 +1337,7 @@ __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit) #endif #endif // KMP_USE_DYNAMIC_LOCK - KMP_STOP_EXPLICIT_TIMER(OMP_critical); + KMP_POP_PARTITIONED_TIMER(); KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid )); } @@ -1464,7 +1459,7 @@ __kmpc_single(ident_t *loc, kmp_int32 global_tid) if (rc) { // We are going to execute the single statement, so we should count it. KMP_COUNT_BLOCK(OMP_SINGLE); - KMP_START_EXPLICIT_TIMER(OMP_single); + KMP_PUSH_PARTITIONED_TIMER(OMP_single); } #if OMPT_SUPPORT && OMPT_TRACE @@ -1507,7 +1502,7 @@ void __kmpc_end_single(ident_t *loc, kmp_int32 global_tid) { __kmp_exit_single( global_tid ); - KMP_STOP_EXPLICIT_TIMER(OMP_single); + KMP_POP_PARTITIONED_TIMER(); #if OMPT_SUPPORT && OMPT_TRACE kmp_info_t *this_thr = __kmp_threads[ global_tid ]; diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index 9c3974a2f4b..6b8fbeea0b1 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -1424,7 +1424,7 @@ __kmp_dispatch_next( // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs // more than a compile time choice to use static scheduling would.) - KMP_TIME_BLOCK(FOR_dynamic_scheduling); + KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling); int status; dispatch_private_info_template< T > * pr; diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c index 092d913bbbb..c11cb625026 100644 --- a/openmp/runtime/src/kmp_runtime.c +++ b/openmp/runtime/src/kmp_runtime.c @@ -1543,7 +1543,8 @@ __kmp_fork_call( #endif { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv #if OMPT_SUPPORT , exit_runtime_p @@ -1618,7 +1619,8 @@ __kmp_fork_call( gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) ); { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); if (! parent_team->t.t_invoke( gtid )) { KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" ); } @@ -1738,7 +1740,8 @@ __kmp_fork_call( #endif { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv #if OMPT_SUPPORT , exit_runtime_p @@ -1795,7 +1798,8 @@ __kmp_fork_call( team->t.t_level--; // AC: call special invoker for outer "parallel" of the teams construct { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); invoker(gtid); } } else { @@ -1842,7 +1846,8 @@ __kmp_fork_call( #endif { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); __kmp_invoke_microtask( microtask, gtid, 0, argc, args #if OMPT_SUPPORT , exit_runtime_p @@ -2178,7 +2183,8 @@ __kmp_fork_call( } // END of timer KMP_fork_call block { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); // KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke); if (! team->t.t_invoke( gtid )) { KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" ); @@ -5448,6 +5454,8 @@ __kmp_launch_thread( kmp_info_t *this_thr ) KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop); { KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); rc = (*pteam)->t.t_invoke( gtid ); } KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop); @@ -6783,7 +6791,8 @@ __kmp_invoke_task_func( int gtid ) #endif { - KMP_TIME_BLOCK(OMP_work); + KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); + KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn), gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv #if OMPT_SUPPORT diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp index a09aa868173..eb2ee6792d0 100644 --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -84,7 +84,7 @@ __kmp_for_static_init( typename traits_t< T >::signed_t chunk ) { KMP_COUNT_BLOCK(OMP_FOR_static); - KMP_TIME_BLOCK (FOR_static_scheduling); + KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling); typedef typename traits_t< T >::unsigned_t UT; typedef typename traits_t< T >::signed_t ST; diff --git a/openmp/runtime/src/kmp_stats.cpp b/openmp/runtime/src/kmp_stats.cpp index a4c626b988d..8100f3bf4ae 100644 --- a/openmp/runtime/src/kmp_stats.cpp +++ b/openmp/runtime/src/kmp_stats.cpp @@ -157,6 +157,7 @@ std::string statistic::format(char unit, bool total) const void explicitTimer::start(timer_e timerEnumValue) { startTime = tsc_tick_count::now(); + totalPauseTime = 0; if(timeStat::logEvent(timerEnumValue)) { __kmp_stats_thread_ptr->incrementNestValue(); } @@ -170,7 +171,7 @@ void explicitTimer::stop(timer_e timerEnumValue) { tsc_tick_count finishTime = tsc_tick_count::now(); //stat->addSample ((tsc_tick_count::now() - startTime).ticks()); - stat->addSample ((finishTime - startTime).ticks()); + stat->addSample(((finishTime - startTime) - totalPauseTime).ticks()); if(timeStat::logEvent(timerEnumValue)) { __kmp_stats_thread_ptr->push_event(startTime.getValue() - __kmp_stats_start_time.getValue(), finishTime.getValue() - __kmp_stats_start_time.getValue(), __kmp_stats_thread_ptr->getNestValue(), timerEnumValue); @@ -182,6 +183,74 @@ void explicitTimer::stop(timer_e timerEnumValue) { return; } +/* ************************************************************** */ +/* ************* partitionedTimers member functions ************* */ +partitionedTimers::partitionedTimers() { + timer_stack.reserve(8); +} + +// add a timer to this collection of partitioned timers. +void partitionedTimers::add_timer(explicit_timer_e timer_index, explicitTimer* timer_pointer) { + KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST+1); + timers[timer_index] = timer_pointer; +} + +// initialize the paritioned timers to an initial timer +void partitionedTimers::init(timerPair init_timer_pair) { + KMP_DEBUG_ASSERT(this->timer_stack.size() == 0); + timer_stack.push_back(init_timer_pair); + timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer()); +} + +// stop/save the current timer, and start the new timer (timer_pair) +// There is a special condition where if the current timer is equal to +// the one you are trying to push, then it only manipulates the stack, +// and it won't stop/start the currently running timer. +void partitionedTimers::push(timerPair timer_pair) { + // get the current timer + // stop current timer + // push new timer + // start the new timer + KMP_DEBUG_ASSERT(this->timer_stack.size() > 0); + timerPair current_timer = timer_stack.back(); + timer_stack.push_back(timer_pair); + if(current_timer != timer_pair) { + timers[current_timer.get_index()]->pause(); + timers[timer_pair.get_index()]->start(timer_pair.get_timer()); + } +} + +// stop/discard the current timer, and start the previously saved timer +void partitionedTimers::pop() { + // get the current timer + // stop current timer + // pop current timer + // get the new current timer and start it back up + KMP_DEBUG_ASSERT(this->timer_stack.size() > 1); + timerPair current_timer = timer_stack.back(); + timer_stack.pop_back(); + timerPair new_timer = timer_stack.back(); + if(current_timer != new_timer) { + timers[current_timer.get_index()]->stop(current_timer.get_timer()); + timers[new_timer.get_index()]->resume(); + } +} + +// Wind up all the currently running timers. +// This pops off all the timers from the stack and clears the stack +// After this is called, init() must be run again to initialize the +// stack of timers +void partitionedTimers::windup() { + while(timer_stack.size() > 1) { + this->pop(); + } + if(timer_stack.size() > 0) { + timerPair last_timer = timer_stack.back(); + timer_stack.pop_back(); + timers[last_timer.get_index()]->stop(last_timer.get_timer()); + } +} + /* ******************************************************************* */ /* ************* kmp_stats_event_vector member functions ************* */ @@ -397,8 +466,10 @@ void kmp_stats_output_module::windupExplicitTimers() // If the timer wasn't running, this won't record anything anyway. kmp_stats_list::iterator it; for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) { + kmp_stats_list* ptr = *it; + ptr->getPartitionedTimers()->windup(); for (int timer=0; timer<EXPLICIT_TIMER_LAST; timer++) { - (*it)->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer); + ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer); } } } @@ -595,11 +666,7 @@ void __kmp_reset_stats() // reset the event vector so all previous events are "erased" (*it)->resetEventVector(); - - // May need to restart the explicit timers in thread zero? } - KMP_START_EXPLICIT_TIMER(OMP_serial); - KMP_START_EXPLICIT_TIMER(OMP_start_end); } // This function will reset all stats and stop all threads' explicit timers if they haven't been stopped already. diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h index 0377fcf9ed9..e317e7cbf80 100644 --- a/openmp/runtime/src/kmp_stats.h +++ b/openmp/runtime/src/kmp_stats.h @@ -27,6 +27,7 @@ #include <limits> #include <math.h> +#include <vector> #include <string> #include <stdint.h> #include <new> // placement new @@ -52,6 +53,23 @@ enum stats_flags_e { }; /*! + * @ingroup STATS_GATHERING + * \brief the states which a thread can be in + * + */ +enum stats_state_e { + IDLE, + SERIAL_REGION, + FORK_JOIN_BARRIER, + PLAIN_BARRIER, + TASKWAIT, + TASKYIELD, + TASKGROUP, + IMPLICIT_TASK, + EXPLICIT_TASK +}; + +/*! * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h * * @param macro a user defined macro that takes three arguments - macro(COUNTER_NAME, flags, arg) @@ -103,18 +121,25 @@ enum stats_flags_e { * * @ingroup STATS_GATHERING2 */ -#define KMP_FOREACH_TIMER(macro, arg) \ - macro (OMP_start_end, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \ - macro (OMP_serial, stats_flags_e::onlyInMaster | stats_flags_e::noTotal, arg) \ - macro (OMP_work, 0, arg) \ - macro (OMP_barrier, 0, arg) \ - macro (FOR_static_scheduling, 0, arg) \ - macro (FOR_dynamic_scheduling, 0, arg) \ - macro (OMP_task, 0, arg) \ - macro (OMP_critical, 0, arg) \ - macro (OMP_critical_wait, 0, arg) \ - macro (OMP_single, 0, arg) \ - macro (OMP_master, 0, arg) \ +#define KMP_FOREACH_TIMER(macro, arg) \ + macro (OMP_worker_thread_life, 0, arg) \ + macro (FOR_static_scheduling, 0, arg) \ + macro (FOR_dynamic_scheduling, 0, arg) \ + macro (OMP_critical, 0, arg) \ + macro (OMP_critical_wait, 0, arg) \ + macro (OMP_single, 0, arg) \ + macro (OMP_master, 0, arg) \ + macro (OMP_idle, 0, arg) \ + macro (OMP_plain_barrier, 0, arg) \ + macro (OMP_fork_join_barrier, 0, arg) \ + macro (OMP_parallel, 0, arg) \ + macro (OMP_task_immediate, 0, arg) \ + macro (OMP_task_taskwait, 0, arg) \ + macro (OMP_task_taskyield, 0, arg) \ + macro (OMP_task_taskgroup, 0, arg) \ + macro (OMP_task_join_bar, 0, arg) \ + macro (OMP_task_plain_bar, 0, arg) \ + macro (OMP_serial, 0, arg) \ macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ macro (FOR_static_iterations, stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ @@ -129,7 +154,16 @@ enum stats_flags_e { // OMP_barrier -- Time at "real" barriers (includes task time) // FOR_static_scheduling -- Time spent doing scheduling for a static "for" // FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for" -// OMP_task -- Time spent executing tasks +// OMP_idle -- Worker threads time spent waiting for inclusion in a parallel region +// OMP_plain_barrier -- Time spent in a barrier construct +// OMP_fork_join_barrier -- Time spent in a the fork-join barrier surrounding a parallel region +// OMP_parallel -- Time spent inside a parallel construct +// OMP_task_immediate -- Time spent executing non-deferred tasks +// OMP_task_taskwait -- Time spent executing tasks inside a taskwait construct +// OMP_task_taskyield -- Time spent executing tasks inside a taskyield construct +// OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup construct +// OMP_task_join_bar -- Time spent executing tasks inside a join barrier +// OMP_task_plain_bar -- Time spent executing tasks inside a barrier construct // OMP_single -- Time spent executing a "single" region // OMP_master -- Time spent executing a "master" region // OMP_set_numthreads -- Values passed to omp_set_num_threads @@ -197,12 +231,25 @@ enum stats_flags_e { * * @ingroup STATS_GATHERING */ -#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \ - macro(OMP_serial, 0, arg) \ - macro(OMP_start_end, 0, arg) \ - macro(OMP_critical, 0, arg) \ - macro(OMP_single, 0, arg) \ - macro(OMP_master, 0, arg) \ +#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \ + macro(OMP_worker_thread_life, 0, arg) \ + macro(FOR_static_scheduling, 0, arg) \ + macro(FOR_dynamic_scheduling, 0, arg) \ + macro(OMP_critical, 0, arg) \ + macro(OMP_critical_wait, 0, arg) \ + macro(OMP_single, 0, arg) \ + macro(OMP_master, 0, arg) \ + macro(OMP_idle, 0, arg) \ + macro(OMP_plain_barrier, 0, arg) \ + macro(OMP_fork_join_barrier, 0, arg) \ + macro(OMP_parallel, 0, arg) \ + macro(OMP_task_immediate, 0, arg) \ + macro(OMP_task_taskwait, 0, arg) \ + macro(OMP_task_taskyield, 0, arg) \ + macro(OMP_task_taskgroup, 0, arg) \ + macro(OMP_task_join_bar, 0, arg) \ + macro(OMP_task_plain_bar, 0, arg) \ + macro(OMP_serial, 0, arg) \ KMP_FOREACH_EXPLICIT_DEVELOPER_TIMER(macro,arg) \ macro(LAST, 0, arg) @@ -227,6 +274,21 @@ enum counter_e { }; #undef ENUMERATE +class timerPair { + explicit_timer_e timer_index; + timer_e timer; + public: + timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {} + inline explicit_timer_e get_index() const { return timer_index; } + inline timer_e get_timer() const { return timer; } + bool operator==(const timerPair & rhs) { + return this->get_index() == rhs.get_index(); + } + bool operator!=(const timerPair & rhs) { + return !(*this == rhs); + } +}; + class statistic { double minVal; @@ -294,15 +356,19 @@ class explicitTimer { timeStat * stat; tsc_tick_count startTime; + tsc_tick_count pauseStartTime; + tsc_tick_count::tsc_interval_t totalPauseTime; public: - explicitTimer () : stat(0), startTime(0) { } - explicitTimer (timeStat * s) : stat(s), startTime() { } + explicitTimer () : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() { } + explicitTimer (timeStat * s) : stat(s), startTime(), pauseStartTime(0), totalPauseTime() { } void setStat (timeStat *s) { stat = s; } void start(timer_e timerEnumValue); + void pause() { pauseStartTime = tsc_tick_count::now(); } + void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); } void stop(timer_e timerEnumValue); - void reset() { startTime = 0; } + void reset() { startTime = 0; pauseStartTime = 0; totalPauseTime = 0; } }; // Where all you need is to time a block, this is enough. @@ -315,6 +381,49 @@ class blockTimer : public explicitTimer ~blockTimer() { stop(timerEnumValue); } }; +// Where you need to partition a threads clock ticks into separate states +// e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and +// DOING_NOTHING would render these conditions: +// time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive +// No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice versa +class partitionedTimers +{ + private: + explicitTimer* timers[EXPLICIT_TIMER_LAST+1]; + std::vector<timerPair> timer_stack; + public: + partitionedTimers(); + void add_timer(explicit_timer_e timer_index, explicitTimer* timer_pointer); + void init(timerPair timer_index); + void push(timerPair timer_index); + void pop(); + void windup(); +}; + +// Special wrapper around the partioned timers to aid timing code blocks +// It avoids the need to have an explicit end, leaving the scope suffices. +class blockPartitionedTimer +{ + partitionedTimers* part_timers; + timerPair timer_pair; + public: + blockPartitionedTimer(partitionedTimers* pt, timerPair tp) : part_timers(pt), timer_pair(tp) { part_timers->push(timer_pair); } + ~blockPartitionedTimer() { part_timers->pop(); } +}; + +// Special wrapper around the thread state to aid in keeping state in code blocks +// It avoids the need to have an explicit end, leaving the scope suffices. +class blockThreadState +{ + stats_state_e* state_pointer; + stats_state_e old_state; + public: + blockThreadState(stats_state_e* thread_state_pointer, stats_state_e new_state) : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { + *state_pointer = new_state; + } + ~blockThreadState() { *state_pointer = old_state; } +}; + // If all you want is a count, then you can use this... // The individual per-thread counts will be aggregated into a statistic at program exit. class counter @@ -473,14 +582,19 @@ class kmp_stats_list { timeStat _timers[TIMER_LAST+1]; counter _counters[COUNTER_LAST+1]; explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST+1]; + partitionedTimers _partitionedTimers; int _nestLevel; // one per thread kmp_stats_event_vector _event_vector; kmp_stats_list* next; kmp_stats_list* prev; + stats_state_e state; + int thread_is_idle_flag; public: - kmp_stats_list() : next(this) , prev(this) , _event_vector(), _nestLevel(0) { + kmp_stats_list() : _nestLevel(0), _event_vector(), next(this), prev(this), + state(IDLE), thread_is_idle_flag(0) { #define doInit(name,ignore1,ignore2) \ - getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); + getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); \ + _partitionedTimers.add_timer(EXPLICIT_TIMER_##name, getExplicitTimer(EXPLICIT_TIMER_##name)); KMP_FOREACH_EXPLICIT_TIMER(doInit,0); #undef doInit } @@ -488,6 +602,7 @@ class kmp_stats_list { inline timeStat * getTimer(timer_e idx) { return &_timers[idx]; } inline counter * getCounter(counter_e idx) { return &_counters[idx]; } inline explicitTimer * getExplicitTimer(explicit_timer_e idx) { return &_explicitTimers[idx]; } + inline partitionedTimers * getPartitionedTimers() { return &_partitionedTimers; } inline timeStat * getTimers() { return _timers; } inline counter * getCounters() { return _counters; } inline explicitTimer * getExplicitTimers() { return _explicitTimers; } @@ -498,6 +613,12 @@ class kmp_stats_list { inline void decrementNestValue() { _nestLevel--; } inline int getGtid() const { return gtid; } inline void setGtid(int newgtid) { gtid = newgtid; } + inline void setState(stats_state_e newstate) { state = newstate; } + inline stats_state_e getState() const { return state; } + inline stats_state_e * getStatePointer() { return &state; } + inline bool isIdle() { return thread_is_idle_flag==1; } + inline void setIdleFlag() { thread_is_idle_flag = 1; } + inline void resetIdleFlag() { thread_is_idle_flag = 0; } kmp_stats_list* push_back(int gtid); // returns newly created list node inline void push_event(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) { _event_vector.push_back(start_time, stop_time, nest_level, name); @@ -699,6 +820,35 @@ extern kmp_stats_output_module __kmp_stats_output; __kmp_output_stats(heading_string) /*! + * \brief Initializes the paritioned timers to begin with name. + * + * @param name timer which you want this thread to begin with + * + * @ingroup STATS_GATHERING +*/ +#define KMP_INIT_PARTITIONED_TIMERS(name) \ + __kmp_stats_thread_ptr->getPartitionedTimers()->init(timerPair(EXPLICIT_TIMER_##name, TIMER_##name)) + +#define KMP_TIME_PARTITIONED_BLOCK(name) \ + blockPartitionedTimer __PBLOCKTIME__(__kmp_stats_thread_ptr->getPartitionedTimers(), \ + timerPair(EXPLICIT_TIMER_##name, TIMER_##name)) + +#define KMP_PUSH_PARTITIONED_TIMER(name) \ + __kmp_stats_thread_ptr->getPartitionedTimers()->push(timerPair(EXPLICIT_TIMER_##name, TIMER_##name)) + +#define KMP_POP_PARTITIONED_TIMER() \ + __kmp_stats_thread_ptr->getPartitionedTimers()->pop() + +#define KMP_SET_THREAD_STATE(state_name) \ + __kmp_stats_thread_ptr->setState(state_name) + +#define KMP_GET_THREAD_STATE() \ + __kmp_stats_thread_ptr->getState() + +#define KMP_SET_THREAD_STATE_BLOCK(state_name) \ + blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), state_name) + +/*! * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) * * \details Reset all stats for all threads. @@ -739,6 +889,13 @@ extern kmp_stats_output_module __kmp_stats_output; #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) #define KMP_START_DEVELOPER_EXPLICIT_TIMER(n) ((void)0) #define KMP_STOP_DEVELOPER_EXPLICIT_TIMER(n) ((void)0) +#define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) +#define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) +#define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) +#define KMP_POP_PARTITIONED_TIMER() ((void)0) +#define KMP_SET_THREAD_STATE(state_name) ((void)0) +#define KMP_GET_THREAD_STATE() ((void)0) +#define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) #endif // KMP_STATS_ENABLED #endif // KMP_STATS_H diff --git a/openmp/runtime/src/kmp_stats_timing.h b/openmp/runtime/src/kmp_stats_timing.h index 0ce0da99a5e..0605d9456da 100644 --- a/openmp/runtime/src/kmp_stats_timing.h +++ b/openmp/runtime/src/kmp_stats_timing.h @@ -40,11 +40,16 @@ class tsc_tick_count { #endif double ticks() const { return double(value); } int64_t getValue() const { return value; } + tsc_interval_t& operator=(int64_t nvalue) { value = nvalue; return *this; } friend class tsc_tick_count; - friend tsc_interval_t operator-( - const tsc_tick_count t1, const tsc_tick_count t0); + friend tsc_interval_t operator-(const tsc_tick_count& t1, + const tsc_tick_count& t0); + friend tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t& i1, + const tsc_tick_count::tsc_interval_t& i0); + friend tsc_interval_t& operator+=(tsc_tick_count::tsc_interval_t& i1, + const tsc_tick_count::tsc_interval_t& i0); }; #if KMP_HAVE___BUILTIN_READCYCLECOUNTER @@ -66,14 +71,25 @@ class tsc_tick_count { static double tick_time(); // returns seconds per cycle (period) of clock #endif static tsc_tick_count now() { return tsc_tick_count(); } // returns the rdtsc register value - friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0); + friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count& t1, const tsc_tick_count& t0); }; -inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0) +inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count& t1, const tsc_tick_count& t0) { return tsc_tick_count::tsc_interval_t( t1.my_count-t0.my_count ); } +inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count::tsc_interval_t& i1, const tsc_tick_count::tsc_interval_t& i0) +{ + return tsc_tick_count::tsc_interval_t( i1.value-i0.value ); +} + +inline tsc_tick_count::tsc_interval_t& operator+=(tsc_tick_count::tsc_interval_t& i1, const tsc_tick_count::tsc_interval_t& i0) +{ + i1.value += i0.value; + return i1; +} + #if KMP_HAVE_TICK_TIME inline double tsc_tick_count::tsc_interval_t::seconds() const { diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c index 2758027f572..1b89f0213d8 100644 --- a/openmp/runtime/src/kmp_tasking.c +++ b/openmp/runtime/src/kmp_tasking.c @@ -36,16 +36,6 @@ static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t static void __kmp_bottom_half_finish_proxy( kmp_int32 gtid, kmp_task_t * ptask ); #endif -static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { - if (!flag) return; - // Attempt to wake up a thread: examine its type and call appropriate template - switch (((kmp_flag_64 *)flag)->get_type()) { - case flag32: __kmp_resume_32(gtid, NULL); break; - case flag64: __kmp_resume_64(gtid, NULL); break; - case flag_oncore: __kmp_resume_oncore(gtid, NULL); break; - } -} - #ifdef BUILD_TIED_TASK_STACK //--------------------------------------------------------------------------- @@ -1207,8 +1197,17 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta // Thunks generated by gcc take a different argument list. // if (!discard) { +#if KMP_STATS_ENABLED KMP_COUNT_BLOCK(TASK_executed); - KMP_TIME_BLOCK (OMP_task); + switch(KMP_GET_THREAD_STATE()) { + case FORK_JOIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar); break; + case PLAIN_BARRIER: KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar); break; + case TASKYIELD: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield); break; + case TASKWAIT: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait); break; + case TASKGROUP: KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup); break; + default: KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate); break; + } +#endif // KMP_STATS_ENABLED #endif // OMP_40_ENABLED #if OMPT_SUPPORT && OMPT_TRACE @@ -1231,6 +1230,7 @@ __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_ta { (*(task->routine))(gtid, task); } + KMP_POP_PARTITIONED_TIMER(); #if OMPT_SUPPORT && OMPT_TRACE /* let OMPT know that we're returning to the callee task */ @@ -1369,6 +1369,7 @@ kmp_int32 __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task) { kmp_int32 res; + KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK); #if KMP_DEBUG kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task); @@ -1392,6 +1393,7 @@ __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid ) kmp_taskdata_t * taskdata; kmp_info_t * thread; int thread_finished = FALSE; + KMP_SET_THREAD_STATE_BLOCK(TASKWAIT); KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref) ); @@ -1481,6 +1483,7 @@ __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part ) int thread_finished = FALSE; KMP_COUNT_BLOCK(OMP_TASKYIELD); + KMP_SET_THREAD_STATE_BLOCK(TASKYIELD); KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", gtid, loc_ref, end_part) ); @@ -1561,6 +1564,7 @@ __kmpc_end_taskgroup( ident_t* loc, int gtid ) KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc) ); KMP_DEBUG_ASSERT( taskgroup != NULL ); + KMP_SET_THREAD_STATE_BLOCK(TASKGROUP); if ( __kmp_tasking_mode != tskm_immediate_exec ) { #if USE_ITT_BUILD diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h index bed40170e03..a277fbf2725 100644 --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -18,6 +18,7 @@ #include "kmp.h" #include "kmp_itt.h" +#include "kmp_stats.h" /*! @defgroup WAIT_RELEASE Wait/Release operations @@ -104,6 +105,9 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin } th_gtid = this_thr->th.th_info.ds.ds_gtid; KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag)); +#if KMP_STATS_ENABLED + stats_state_e thread_state = KMP_GET_THREAD_STATE(); +#endif #if OMPT_SUPPORT && OMPT_BLAME ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state; @@ -223,6 +227,15 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin } } +#if KMP_STATS_ENABLED + // Check if thread has been signalled to idle state + // This indicates that the logical "join-barrier" has finished + if (this_thr->th.th_stats->isIdle() && KMP_GET_THREAD_STATE() == FORK_JOIN_BARRIER) { + KMP_SET_THREAD_STATE(IDLE); + KMP_PUSH_PARTITIONED_TIMER(OMP_idle); + } +#endif + // Don't suspend if KMP_BLOCKTIME is set to "infinite" if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) continue; @@ -273,6 +286,14 @@ __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin } } #endif +#if KMP_STATS_ENABLED + // If we were put into idle state, pop that off the state stack + if (KMP_GET_THREAD_STATE() == IDLE) { + KMP_POP_PARTITIONED_TIMER(); + KMP_SET_THREAD_STATE(thread_state); + this_thr->th.th_stats->resetIdleFlag(); + } +#endif KMP_FSYNC_SPIN_ACQUIRED(spin); } @@ -556,6 +577,15 @@ public: flag_type get_ptr_type() { return flag_oncore; } }; +// Used to wake up threads, volatile void* flag is usually the th_sleep_loc associated +// with int gtid. +static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { + switch (((kmp_flag_64 *)flag)->get_type()) { + case flag32: __kmp_resume_32(gtid, NULL); break; + case flag64: __kmp_resume_64(gtid, NULL); break; + case flag_oncore: __kmp_resume_oncore(gtid, NULL); break; + } +} /*! @} diff --git a/openmp/runtime/src/z_Linux_util.c b/openmp/runtime/src/z_Linux_util.c index 1e7683d2402..5d61cd177fe 100644 --- a/openmp/runtime/src/z_Linux_util.c +++ b/openmp/runtime/src/z_Linux_util.c @@ -697,6 +697,9 @@ __kmp_launch_worker( void *thr ) #if KMP_STATS_ENABLED // set __thread local index to point to thread-specific stats __kmp_stats_thread_ptr = ((kmp_info_t*)thr)->th.th_stats; + KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life); + KMP_SET_THREAD_STATE(IDLE); + KMP_INIT_PARTITIONED_TIMERS(OMP_idle); #endif #if USE_ITT_BUILD @@ -972,8 +975,9 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) __kmp_stats_start_time = tsc_tick_count::now(); __kmp_stats_thread_ptr = th->th.th_stats; __kmp_stats_init(); - KMP_START_EXPLICIT_TIMER(OMP_serial); - KMP_START_EXPLICIT_TIMER(OMP_start_end); + KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life); + KMP_SET_THREAD_STATE(SERIAL_REGION); + KMP_INIT_PARTITIONED_TIMERS(OMP_serial); } __kmp_release_tas_lock(&__kmp_stats_lock, gtid); @@ -1856,6 +1860,7 @@ void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { void __kmp_resume_monitor() { + KMP_TIME_DEVELOPER_BLOCK(USER_resume); int status; #ifdef KMP_DEBUG int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; |