Diffstat (limited to 'openmp/runtime/src/kmp_barrier.cpp')
-rw-r--r-- | openmp/runtime/src/kmp_barrier.cpp | 3185
1 file changed, 1683 insertions, 1502 deletions
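Before the diff itself, one orientation aid: the tree gather/release routines reformatted below derive parents and children purely from shift arithmetic on the thread id. The standalone C++ sketch that follows is not part of the runtime; branch_bits and nproc are made-up example values standing in for __kmp_barrier_gather_branch_bits[bt] and the team size. It only reproduces the index expressions visible in __kmp_tree_barrier_gather and __kmp_tree_barrier_release (first child at (tid << branch_bits) + 1, parent at (tid - 1) >> branch_bits) so the reformatted loops in the diff are easier to follow.

// Standalone sketch (not part of kmp_barrier.cpp): prints which thread each
// tid reports to and which children it waits for / releases, using the same
// shift-based expressions as the tree barrier code in the diff below.
#include <cstdio>

int main() {
  const unsigned branch_bits = 2;               // example; branch_factor = 1 << branch_bits
  const unsigned branch_factor = 1u << branch_bits;
  const unsigned nproc = 10;                    // example team size

  for (unsigned tid = 0; tid < nproc; ++tid) {
    // Same expressions as __kmp_tree_barrier_gather/_release:
    //   first child: (tid << branch_bits) + 1
    //   parent:      (tid - 1) >> branch_bits   (workers only)
    unsigned first_child = (tid << branch_bits) + 1;
    if (tid == 0)
      std::printf("tid %u is the master (no parent)\n", tid);
    else
      std::printf("tid %u reports arrival to parent tid %u\n", tid,
                  (tid - 1) >> branch_bits);
    for (unsigned child = 1, child_tid = first_child;
         child <= branch_factor && child_tid < nproc; ++child, ++child_tid)
      std::printf("  tid %u waits for / releases child tid %u\n", tid,
                  child_tid);
  }
  return 0;
}

With branch_bits = 2, tid 0 owns children 1-4, tid 1 owns 5-8, tid 2 owns 9, matching the fan-in/fan-out order the gather and release loops in the diff walk through.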
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index a9a46f14372..f25915f0154 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -15,9 +15,9 @@ #include "kmp.h" #include "kmp_wait_release.h" -#include "kmp_stats.h" #include "kmp_itt.h" #include "kmp_os.h" +#include "kmp_stats.h" #if KMP_MIC @@ -29,15 +29,15 @@ #if KMP_MIC && USE_NGO_STORES // ICV copying -#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) +#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) -#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) -#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory") +#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory") #else -#define ngo_load(src) ((void)0) +#define ngo_load(src) ((void)0) #define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) -#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) -#define ngo_sync() ((void)0) +#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE) +#define ngo_sync() ((void)0) #endif /* KMP_MIC && USE_NGO_STORES */ void __kmp_print_structure(void); // Forward declaration @@ -45,1785 +45,1966 @@ void __kmp_print_structure(void); // Forward declaration // ---------------------------- Barrier Algorithms ---------------------------- // Linear Barrier -static void -__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void * itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); - register kmp_team_t *team = this_thr->th.th_team; - register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb; - register kmp_info_t **other_threads = team->t.t_threads; - - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); +static void __kmp_linear_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads = team->t.t_threads; + + KA_TRACE( + 20, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); - } + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } #endif - // We now perform a linear reduction to signal that all of the threads have arrived. 
- if (!KMP_MASTER_TID(tid)) { - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived, - thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - // Mark arrival to master thread - /* After performing this write, a worker thread may not assume that the team is valid - any more - it could be deallocated by the master thread at any time. */ - ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); - flag.release(); - } else { - register kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; - register int nproc = this_thr->th.th_team_nproc; - register int i; - // Don't have to worry about sleep bit here or atomic since team setting - register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; - - // Collect all the worker team member threads. - for (i=1; i<nproc; ++i) { + // We now perform a linear reduction to signal that all of the threads have + // arrived. + if (!KMP_MASTER_TID(tid)) { + KA_TRACE(20, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team), + team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to master thread + /* After performing this write, a worker thread may not assume that the team + is valid any more - it could be deallocated by the master thread at any + time. */ + ANNOTATE_BARRIER_BEGIN(this_thr); + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); + flag.release(); + } else { + register kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; + register int nproc = this_thr->th.th_team_nproc; + register int i; + // Don't have to worry about sleep bit here or atomic since team setting + register kmp_uint64 new_state = + team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; + + // Collect all the worker team member threads. + for (i = 1; i < nproc; ++i) { #if KMP_CACHE_MANAGE - // Prefetch next thread's arrived count - if (i+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived); + // Prefetch next thread's arrived count + if (i + 1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived); #endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(i, team), team->t.t_id, i, - &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); - - // Wait for worker thread to arrive - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(other_threads[i]); + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), + team->t.t_id, i, + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); + + // Wait for worker thread to arrive + kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, + new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(other_threads[i]); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - write min of the thread time and the other thread time to the thread. 
- if (__kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, - other_threads[i]->th.th_bar_min_time); - } + // Barrier imbalance - write min of the thread time and the other thread + // time to the thread. + if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN( + this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); + } #endif - if (reduce) { - KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid, - team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); - ANNOTATE_REDUCE_AFTER(reduce); - (*reduce)(this_thr->th.th_local.reduce_data, - other_threads[i]->th.th_local.reduce_data); - ANNOTATE_REDUCE_BEFORE(reduce); - ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); - } - } - // Don't have to worry about sleep bit here or atomic since team setting - team_bar->b_arrived = new_state; - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state)); + if (reduce) { + KA_TRACE(100, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), + team->t.t_id, i)); + ANNOTATE_REDUCE_AFTER(reduce); + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[i]->th.th_local.reduce_data); + ANNOTATE_REDUCE_BEFORE(reduce); + ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); + } } - KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + // Don't have to worry about sleep bit here or atomic since team setting + team_bar->b_arrived = new_state; + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, + new_state)); + } + KA_TRACE( + 20, + ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } -static void -__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); - register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; - register kmp_team_t *team; +static void __kmp_linear_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release); + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_team_t *team; - if (KMP_MASTER_TID(tid)) { - register unsigned int i; - register kmp_uint32 nproc = this_thr->th.th_team_nproc; - register kmp_info_t **other_threads; + if (KMP_MASTER_TID(tid)) { + register unsigned int i; + register kmp_uint32 nproc = this_thr->th.th_team_nproc; + register kmp_info_t **other_threads; - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - other_threads = team->t.t_threads; + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + other_threads = team->t.t_threads; - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); - if (nproc > 1) { + if (nproc > 1) { #if KMP_BARRIER_ICV_PUSH - { - 
KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); - if (propagate_icvs) { - ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); - for (i=1; i<nproc; ++i) { - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE); - ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, - &team->t.t_implicit_task_taskdata[0].td_icvs); - } - ngo_sync(); - } - } + { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); + if (propagate_icvs) { + ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); + for (i = 1; i < nproc; ++i) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], + team, i, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + ngo_sync(); + } + } #endif // KMP_BARRIER_ICV_PUSH - // Now, release all of the worker threads - for (i=1; i<nproc; ++i) { + // Now, release all of the worker threads + for (i = 1; i < nproc; ++i) { #if KMP_CACHE_MANAGE - // Prefetch next thread's go flag - if (i+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go); + // Prefetch next thread's go flag + if (i + 1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go); #endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " - "go(%p): %u => %u\n", gtid, team->t.t_id, tid, - other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i, - &other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); - ANNOTATE_BARRIER_BEGIN(other_threads[i]); - kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]); - flag.release(); - } - } - } else { // Wait for the MASTER thread to release us - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(this_thr); + KA_TRACE( + 20, + ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " + "go(%p): %u => %u\n", + gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid, + team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); + ANNOTATE_BARRIER_BEGIN(other_threads[i]); + kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]); + flag.release(); + } + } + } else { // Wait for the MASTER thread to release us + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY - if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { - // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled) - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); - // Cancel wait on previous parallel region... 
- __kmp_itt_task_starting(itt_sync_obj); - - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj != NULL) - // Call prepare as early as possible for "new" barrier - __kmp_itt_task_finished(itt_sync_obj); - } else + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is + // disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ // Early exit for reaping threads releasing forkjoin barrier - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - // The worker thread may now assume that the team is valid. + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; +// The worker thread may now assume that the team is valid. #ifdef KMP_DEBUG - tid = __kmp_tid_from_gtid(gtid); - team = __kmp_threads[gtid]->th.th_team; + tid = __kmp_tid_from_gtid(gtid); + team = __kmp_threads[gtid]->th.th_team; #endif - KMP_DEBUG_ASSERT(team != NULL); - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. - } - KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(team != NULL); + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, + ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. 
+ } + KA_TRACE( + 20, + ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } // Tree barrier static void -__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); - register kmp_team_t *team = this_thr->th.th_team; - register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; - register kmp_info_t **other_threads = team->t.t_threads; - register kmp_uint32 nproc = this_thr->th.th_team_nproc; - register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 child; - register kmp_uint32 child_tid; - register kmp_uint64 new_state; - - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); +__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, + int tid, void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads = team->t.t_threads; + register kmp_uint32 nproc = this_thr->th.th_team_nproc; + register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 child; + register kmp_uint32 child_tid; + register kmp_uint64 new_state; + + KA_TRACE( + 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); - } + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } #endif - // Perform tree gather to wait until all threads have arrived; reduce any required data as we go - child_tid = (tid << branch_bits) + 1; - if (child_tid < nproc) { - // Parent threads wait for all their children to arrive - new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - child = 1; - do { - register kmp_info_t *child_thr = other_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + // Perform tree gather to wait until all threads have arrived; reduce any + // required data as we go + child_tid = (tid << branch_bits) + 1; + if (child_tid < nproc) { + // Parent threads wait for all their children to arrive + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + child = 1; + do { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; #if KMP_CACHE_MANAGE - // Prefetch next thread's arrived count - if (child+1 <= branch_factor && child_tid+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived); + // Prefetch next thread's arrived count + if 
(child + 1 <= branch_factor && child_tid + 1 < nproc) + KMP_CACHE_PREFETCH( + &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived); #endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " - "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, - &child_bar->b_arrived, new_state)); - // Wait for child to arrive - kmp_flag_64 flag(&child_bar->b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(child_thr); + KA_TRACE(20, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(child_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - write min of the thread time and a child time to the thread. - if (__kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, - child_thr->th.th_bar_min_time); - } + // Barrier imbalance - write min of the thread time and a child time to + // the thread. + if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } #endif - if (reduce) { - KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - ANNOTATE_REDUCE_AFTER(reduce); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - ANNOTATE_REDUCE_BEFORE(reduce); - ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); - } - child++; - child_tid++; - } - while (child <= branch_factor && child_tid < nproc); - } - - if (!KMP_MASTER_TID(tid)) { // Worker threads - register kmp_int32 parent_tid = (tid - 1) >> branch_bits; - - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, - &thr_bar->b_arrived, thr_bar->b_arrived, - thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - - // Mark arrival to parent thread - /* After performing this write, a worker thread may not assume that the team is valid - any more - it could be deallocated by the master thread at any time. 
*/ - ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); - flag.release(); - } else { - // Need to update the team arrived pointer if we are the master thread - if (nproc > 1) // New value was already computed above - team->t.t_bar[bt].b_arrived = new_state; - else - team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); - } - KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + if (reduce) { + KA_TRACE(100, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + ANNOTATE_REDUCE_AFTER(reduce); + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + ANNOTATE_REDUCE_BEFORE(reduce); + ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); + } + child++; + child_tid++; + } while (child <= branch_factor && child_tid < nproc); + } + + if (!KMP_MASTER_TID(tid)) { // Worker threads + register kmp_int32 parent_tid = (tid - 1) >> branch_bits; + + KA_TRACE(20, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), + team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + + // Mark arrival to parent thread + /* After performing this write, a worker thread may not assume that the team + is valid any more - it could be deallocated by the master thread at any + time. */ + ANNOTATE_BARRIER_BEGIN(this_thr); + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); + flag.release(); + } else { + // Need to update the team arrived pointer if we are the master thread + if (nproc > 1) // New value was already computed above + team->t.t_bar[bt].b_arrived = new_state; + else + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE(20, + ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } -static void -__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; - register kmp_uint32 nproc; - register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 child; - register kmp_uint32 child_tid; - - // Perform a tree release for all of the threads that have been gathered - if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(this_thr); +static void __kmp_tree_barrier_release( + enum 
barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release); + register kmp_team_t *team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_uint32 nproc; + register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 child; + register kmp_uint32 child_tid; + + // Perform a tree release for all of the threads that have been gathered + if (!KMP_MASTER_TID( + tid)) { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid, + &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY - if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { - // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled) - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); - // Cancel wait on previous parallel region... - __kmp_itt_task_starting(itt_sync_obj); - - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj != NULL) - // Call prepare as early as possible for "new" barrier - __kmp_itt_task_finished(itt_sync_obj); - } else + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably (or + // ITTNOTIFY is disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... + __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ // Early exit for reaping threads releasing forkjoin barrier if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; + return; - // The worker thread may now assume that the team is valid. - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. 
- } else { - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - } - nproc = this_thr->th.th_team_nproc; - child_tid = (tid << branch_bits) + 1; - - if (child_tid < nproc) { - register kmp_info_t **other_threads = team->t.t_threads; - child = 1; - // Parent threads release all their children - do { - register kmp_info_t *child_thr = other_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, + ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid, + team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } else { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } + nproc = this_thr->th.th_team_nproc; + child_tid = (tid << branch_bits) + 1; + + if (child_tid < nproc) { + register kmp_info_t **other_threads = team->t.t_threads; + child = 1; + // Parent threads release all their children + do { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; #if KMP_CACHE_MANAGE - // Prefetch next thread's go count - if (child+1 <= branch_factor && child_tid+1 < nproc) - KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go); + // Prefetch next thread's go count + if (child + 1 <= branch_factor && child_tid + 1 < nproc) + KMP_CACHE_PREFETCH( + &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go); #endif /* KMP_CACHE_MANAGE */ #if KMP_BARRIER_ICV_PUSH - { - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); - if (propagate_icvs) { - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid], - team, child_tid, FALSE); - copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, - &team->t.t_implicit_task_taskdata[0].td_icvs); - } - } -#endif // KMP_BARRIER_ICV_PUSH - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" - "go(%p): %u => %u\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, - child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child from barrier - ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - child++; - child_tid++; + { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, + team->t.t_threads[child_tid], team, + child_tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); } - while (child <= branch_factor && child_tid < nproc); - } - KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + } +#endif // KMP_BARRIER_ICV_PUSH + KA_TRACE(20, + ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + ANNOTATE_BARRIER_BEGIN(child_thr); + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + 
child++; + child_tid++; + } while (child <= branch_factor && child_tid < nproc); + } + KA_TRACE( + 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } - // Hyper Barrier static void -__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); - register kmp_team_t *team = this_thr->th.th_team; - register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; - register kmp_info_t **other_threads = team->t.t_threads; - register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; - register kmp_uint32 num_threads = this_thr->th.th_team_nproc; - register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 offset; - register kmp_uint32 level; - - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); +__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, + int tid, void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads = team->t.t_threads; + register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE; + register kmp_uint32 num_threads = this_thr->th.th_team_nproc; + register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 offset; + register kmp_uint32 level; + + KA_TRACE( + 20, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); - } + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = + __itt_get_timestamp(); + } #endif - /* Perform a hypercube-embedded tree gather to wait until all of the threads have - arrived, and reduce any required data as we go. 
*/ - kmp_flag_64 p_flag(&thr_bar->b_arrived); - for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) - { - register kmp_uint32 child; - register kmp_uint32 child_tid; - - if (((tid >> level) & (branch_factor - 1)) != 0) { - register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1); - - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, - &thr_bar->b_arrived, thr_bar->b_arrived, - thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); - // Mark arrival to parent thread - /* After performing this write (in the last iteration of the enclosing for loop), - a worker thread may not assume that the team is valid any more - it could be - deallocated by the master thread at any time. */ - ANNOTATE_BARRIER_BEGIN(this_thr); - p_flag.set_waiter(other_threads[parent_tid]); - p_flag.release(); - break; - } + /* Perform a hypercube-embedded tree gather to wait until all of the threads + have arrived, and reduce any required data as we go. */ + kmp_flag_64 p_flag(&thr_bar->b_arrived); + for (level = 0, offset = 1; offset < num_threads; + level += branch_bits, offset <<= branch_bits) { + register kmp_uint32 child; + register kmp_uint32 child_tid; - // Parent threads wait for children to arrive - if (new_state == KMP_BARRIER_UNUSED_STATE) - new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads; - child++, child_tid+=(1 << level)) - { - register kmp_info_t *child_thr = other_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + if (((tid >> level) & (branch_factor - 1)) != 0) { + register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1); + + KA_TRACE(20, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team), + team->t.t_id, parent_tid, &thr_bar->b_arrived, + thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to parent thread + /* After performing this write (in the last iteration of the enclosing for + loop), a worker thread may not assume that the team is valid any more + - it could be deallocated by the master thread at any time. 
*/ + ANNOTATE_BARRIER_BEGIN(this_thr); + p_flag.set_waiter(other_threads[parent_tid]); + p_flag.release(); + break; + } + + // Parent threads wait for children to arrive + if (new_state == KMP_BARRIER_UNUSED_STATE) + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + for (child = 1, child_tid = tid + (1 << level); + child < branch_factor && child_tid < num_threads; + child++, child_tid += (1 << level)) { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; #if KMP_CACHE_MANAGE - register kmp_uint32 next_child_tid = child_tid + (1 << level); - // Prefetch next thread's arrived count - if (child+1 < branch_factor && next_child_tid < num_threads) - KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); + register kmp_uint32 next_child_tid = child_tid + (1 << level); + // Prefetch next thread's arrived count + if (child + 1 < branch_factor && next_child_tid < num_threads) + KMP_CACHE_PREFETCH( + &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); #endif /* KMP_CACHE_MANAGE */ - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " - "arrived(%p) == %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, - &child_bar->b_arrived, new_state)); - // Wait for child to arrive - kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); - c_flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(child_thr); + KA_TRACE(20, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); + c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(child_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - write min of the thread time and a child time to the thread. - if (__kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, - child_thr->th.th_bar_min_time); - } + // Barrier imbalance - write min of the thread time and a child time to + // the thread. 
+ if (__kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } #endif - if (reduce) { - KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - ANNOTATE_REDUCE_AFTER(reduce); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - ANNOTATE_REDUCE_BEFORE(reduce); - ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); - } - } - } - - if (KMP_MASTER_TID(tid)) { - // Need to update the team arrived pointer if we are the master thread - if (new_state == KMP_BARRIER_UNUSED_STATE) - team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; - else - team->t.t_bar[bt].b_arrived = new_state; - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + if (reduce) { + KA_TRACE(100, + ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + ANNOTATE_REDUCE_AFTER(reduce); + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + ANNOTATE_REDUCE_BEFORE(reduce); + ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); + } } - KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + } + + if (KMP_MASTER_TID(tid)) { + // Need to update the team arrived pointer if we are the master thread + if (new_state == KMP_BARRIER_UNUSED_STATE) + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + else + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE( + 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } // The reverse versions seem to beat the forward versions overall #define KMP_REVERSE_HYPER_BAR -static void -__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void *itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads; - register kmp_uint32 num_threads; - register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 child; - register kmp_uint32 child_tid; - register kmp_uint32 offset; - register kmp_uint32 level; - - /* Perform a hypercube-embedded tree release for all of the threads that have been gathered. - If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse - order of the corresponding gather, otherwise threads are released in the same order. 
*/ - if (KMP_MASTER_TID(tid)) { // master - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); +static void __kmp_hyper_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release); + register kmp_team_t *team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads; + register kmp_uint32 num_threads; + register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 child; + register kmp_uint32 child_tid; + register kmp_uint32 offset; + register kmp_uint32 level; + + /* Perform a hypercube-embedded tree release for all of the threads that have + been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads + are released in the reverse order of the corresponding gather, otherwise + threads are released in the same order. */ + if (KMP_MASTER_TID(tid)) { // master + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); #if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) { // master already has ICVs in final destination; copy - copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); - } -#endif + if (propagate_icvs) { // master already has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); } - else { // Handle fork barrier workers who aren't part of a team yet - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); - // Wait for parent thread to release us - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(this_thr); +#endif + } else { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, + &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(this_thr); #if USE_ITT_BUILD && USE_ITT_NOTIFY - if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { - // In fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); - // Cancel wait on previous parallel region... - __kmp_itt_task_starting(itt_sync_obj); - - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj != NULL) - // Call prepare as early as possible for "new" barrier - __kmp_itt_task_finished(itt_sync_obj); - } else + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... 
+ __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ // Early exit for reaping threads releasing forkjoin barrier if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; + return; - // The worker thread may now assume that the team is valid. - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. - } - num_threads = this_thr->th.th_team_nproc; - other_threads = team->t.t_threads; + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, + ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + num_threads = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; #ifdef KMP_REVERSE_HYPER_BAR - // Count up to correct level for parent - for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0); - level+=branch_bits, offset<<=branch_bits); - - // Now go down from there - for (level-=branch_bits, offset>>=branch_bits; offset != 0; - level-=branch_bits, offset>>=branch_bits) + // Count up to correct level for parent + for (level = 0, offset = 1; + offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0); + level += branch_bits, offset <<= branch_bits) + ; + + // Now go down from there + for (level -= branch_bits, offset >>= branch_bits; offset != 0; + level -= branch_bits, offset >>= branch_bits) #else - // Go down the tree, level by level - for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) + // Go down the tree, level by level + for (level = 0, offset = 1; offset < num_threads; + level += branch_bits, offset <<= branch_bits) #endif // KMP_REVERSE_HYPER_BAR - { + { #ifdef KMP_REVERSE_HYPER_BAR - /* Now go in reverse order through the children, highest to lowest. - Initial setting of child is conservative here. */ - child = num_threads >> ((level==0)?level:level-1); - for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level); - child>=1; child--, child_tid-=(1<<level)) + /* Now go in reverse order through the children, highest to lowest. + Initial setting of child is conservative here. */ + child = num_threads >> ((level == 0) ? level : level - 1); + for (child = (child < branch_factor - 1) ? 
child : branch_factor - 1, + child_tid = tid + (child << level); + child >= 1; child--, child_tid -= (1 << level)) #else - if (((tid >> level) & (branch_factor - 1)) != 0) - // No need to go lower than this, since this is the level parent would be notified - break; - // Iterate through children on this level of the tree - for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads; - child++, child_tid+=(1<<level)) + if (((tid >> level) & (branch_factor - 1)) != 0) + // No need to go lower than this, since this is the level parent would be + // notified + break; + // Iterate through children on this level of the tree + for (child = 1, child_tid = tid + (1 << level); + child < branch_factor && child_tid < num_threads; + child++, child_tid += (1 << level)) #endif // KMP_REVERSE_HYPER_BAR - { - if (child_tid >= num_threads) continue; // Child doesn't exist so keep going - else { - register kmp_info_t *child_thr = other_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + { + if (child_tid >= num_threads) + continue; // Child doesn't exist so keep going + else { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; #if KMP_CACHE_MANAGE - register kmp_uint32 next_child_tid = child_tid - (1 << level); - // Prefetch next thread's go count -# ifdef KMP_REVERSE_HYPER_BAR - if (child-1 >= 1 && next_child_tid < num_threads) -# else - if (child+1 < branch_factor && next_child_tid < num_threads) -# endif // KMP_REVERSE_HYPER_BAR - KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); + register kmp_uint32 next_child_tid = child_tid - (1 << level); +// Prefetch next thread's go count +#ifdef KMP_REVERSE_HYPER_BAR + if (child - 1 >= 1 && next_child_tid < num_threads) +#else + if (child + 1 < branch_factor && next_child_tid < num_threads) +#endif // KMP_REVERSE_HYPER_BAR + KMP_CACHE_PREFETCH( + &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); #endif /* KMP_CACHE_MANAGE */ #if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) // push my fixed ICVs to my child - copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); + if (propagate_icvs) // push my fixed ICVs to my child + copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); #endif // KMP_BARRIER_ICV_PUSH - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" - "go(%p): %u => %u\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(child_tid, team), team->t.t_id, - child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child from barrier - ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - } - } + KA_TRACE( + 20, + ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + ANNOTATE_BARRIER_BEGIN(child_thr); + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } } + } #if KMP_BARRIER_ICV_PUSH - if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); - } + if (propagate_icvs && + !KMP_MASTER_TID(tid)) { // copy ICVs 
locally to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, + FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->th_fixed_icvs); + } #endif - KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + KA_TRACE( + 20, + ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } // Hierarchical Barrier // Initialize thread barrier data -/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the - minimum amount of initialization required based on how the team has changed. Returns true if - leaf children will require both on-core and traditional wake-up mechanisms. For example, if the - team size increases, threads already in the team will respond to on-core wakeup on their parent - thread, but threads newly added to the team will only be listening on the their local b_go. */ -static bool -__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc, - int gtid, int tid, kmp_team_t *team) -{ - // Checks to determine if (re-)initialization is needed - bool uninitialized = thr_bar->team == NULL; - bool team_changed = team != thr_bar->team; - bool team_sz_changed = nproc != thr_bar->nproc; - bool tid_changed = tid != thr_bar->old_tid; - bool retval = false; - - if (uninitialized || team_sz_changed) { - __kmp_get_hierarchy(nproc, thr_bar); - } - - if (uninitialized || team_sz_changed || tid_changed) { - thr_bar->my_level = thr_bar->depth-1; // default for master - thr_bar->parent_tid = -1; // default for master - if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy - kmp_uint32 d=0; - while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level - kmp_uint32 rem; - if (d == thr_bar->depth-2) { // reached level right below the master - thr_bar->parent_tid = 0; - thr_bar->my_level = d; - break; - } - else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster? - // thread is not a subtree root at next level, so this is max - thr_bar->parent_tid = tid - rem; - thr_bar->my_level = d; - break; - } - ++d; - } +/* Initializes/re-initializes the hierarchical barrier data stored on a thread. + Performs the minimum amount of initialization required based on how the team + has changed. Returns true if leaf children will require both on-core and + traditional wake-up mechanisms. For example, if the team size increases, + threads already in the team will respond to on-core wakeup on their parent + thread, but threads newly added to the team will only be listening on the + their local b_go. 
*/ +static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, + kmp_bstate_t *thr_bar, + kmp_uint32 nproc, int gtid, + int tid, kmp_team_t *team) { + // Checks to determine if (re-)initialization is needed + bool uninitialized = thr_bar->team == NULL; + bool team_changed = team != thr_bar->team; + bool team_sz_changed = nproc != thr_bar->nproc; + bool tid_changed = tid != thr_bar->old_tid; + bool retval = false; + + if (uninitialized || team_sz_changed) { + __kmp_get_hierarchy(nproc, thr_bar); + } + + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->my_level = thr_bar->depth - 1; // default for master + thr_bar->parent_tid = -1; // default for master + if (!KMP_MASTER_TID( + tid)) { // if not master, find parent thread in hierarchy + kmp_uint32 d = 0; + while (d < thr_bar->depth) { // find parent based on level of thread in + // hierarchy, and note level + kmp_uint32 rem; + if (d == thr_bar->depth - 2) { // reached level right below the master + thr_bar->parent_tid = 0; + thr_bar->my_level = d; + break; + } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != + 0) { // TODO: can we make this op faster? + // thread is not a subtree root at next level, so this is max + thr_bar->parent_tid = tid - rem; + thr_bar->my_level = d; + break; } - thr_bar->offset = 7-(tid-thr_bar->parent_tid-1); - thr_bar->old_tid = tid; - thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; - thr_bar->team = team; - thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; - } - if (uninitialized || team_changed || tid_changed) { - thr_bar->team = team; - thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; - retval = true; - } - if (uninitialized || team_sz_changed || tid_changed) { - thr_bar->nproc = nproc; - thr_bar->leaf_kids = thr_bar->base_leaf_kids; - if (thr_bar->my_level == 0) thr_bar->leaf_kids=0; - if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc) - thr_bar->leaf_kids = nproc - tid - 1; - thr_bar->leaf_state = 0; - for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1; + ++d; + } } - return retval; + thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1); + thr_bar->old_tid = tid; + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + thr_bar->team = team; + thr_bar->parent_bar = + &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + } + if (uninitialized || team_changed || tid_changed) { + thr_bar->team = team; + thr_bar->parent_bar = + &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + retval = true; + } + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->nproc = nproc; + thr_bar->leaf_kids = thr_bar->base_leaf_kids; + if (thr_bar->my_level == 0) + thr_bar->leaf_kids = 0; + if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc) + thr_bar->leaf_kids = nproc - tid - 1; + thr_bar->leaf_state = 0; + for (int i = 0; i < thr_bar->leaf_kids; ++i) + ((char *)&(thr_bar->leaf_state))[7 - i] = 1; + } + return retval; } -static void -__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, - int gtid, int tid, void (*reduce) (void *, void *) - USE_ITT_BUILD_ARG(void * itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); - register kmp_team_t *team = this_thr->th.th_team; - register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb; - register kmp_uint32 nproc = this_thr->th.th_team_nproc; - register kmp_info_t **other_threads = team->t.t_threads; - register kmp_uint64 new_state; - - int level 
= team->t.t_level; +static void __kmp_hierarchical_barrier_gather( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_uint32 nproc = this_thr->th.th_team_nproc; + register kmp_info_t **other_threads = team->t.t_threads; + register kmp_uint64 new_state; + + int level = team->t.t_level; #if OMP_40_ENABLED - if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct? - if (this_thr->th.th_teams_size.nteams > 1) - ++level; // level was not increased in teams construct for team_of_masters + if (other_threads[0] + ->th.th_teams_microtask) // are we inside the teams construct? + if (this_thr->th.th_teams_size.nteams > 1) + ++level; // level was not increased in teams construct for team_of_masters #endif - if (level == 1) thr_bar->use_oncore_barrier = 1; - else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + if (level == 1) + thr_bar->use_oncore_barrier = 1; + else + thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { - this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); - } + // Barrier imbalance - save arrive time to the thread + if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); + } #endif - (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); - - if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) - register kmp_int32 child_tid; - new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { - if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag - kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? 
thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n", - gtid, team->t.t_id, tid)); - kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - if (reduce) { - ANNOTATE_REDUCE_AFTER(reduce); - for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) { - KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - ANNOTATE_BARRIER_END(other_threads[child_tid]); - (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); - } - ANNOTATE_REDUCE_BEFORE(reduce); - ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); - } - (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits - } - // Next, wait for higher level children on each child's b_arrived flag - for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0 - kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; - if (last > nproc) last = nproc; - for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { - register kmp_info_t *child_thr = other_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %llu\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(child_thr); - if (reduce) { - KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - ANNOTATE_REDUCE_AFTER(reduce); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - ANNOTATE_REDUCE_BEFORE(reduce); - ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); - } - } - } + (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, + team); + + if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) + register kmp_int32 child_tid; + new_state = + (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + thr_bar->use_oncore_barrier) { + if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on + // my b_arrived flag + kmp_uint64 leaf_state = + KMP_MASTER_TID(tid) + ? 
thr_bar->b_arrived | thr_bar->leaf_state + : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting " + "for leaf kids\n", + gtid, team->t.t_id, tid)); + kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + if (reduce) { + ANNOTATE_REDUCE_AFTER(reduce); + for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; + ++child_tid) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " + "T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid)); + ANNOTATE_BARRIER_END(other_threads[child_tid]); + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[child_tid]->th.th_local.reduce_data); + } + ANNOTATE_REDUCE_BEFORE(reduce); + ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } - else { // Blocktime is not infinite - for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first - kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; - if (last > nproc) last = nproc; - for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { - register kmp_info_t *child_thr = other_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %llu\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); - kmp_flag_64 flag(&child_bar->b_arrived, new_state); - flag.wait(this_thr, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(child_thr); - if (reduce) { - KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid)); - ANNOTATE_REDUCE_AFTER(reduce); - (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); - ANNOTATE_REDUCE_BEFORE(reduce); - ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); - } - } - } - } - } - // All subordinates are gathered; now release parent if not master thread - - if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid, - __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid, - &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP)); - /* Mark arrival to parent: After performing this write, a worker thread may not assume that - the team is valid any more - it could be deallocated by the master thread at any time. 
*/ - if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME - || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it - ANNOTATE_BARRIER_BEGIN(this_thr); - kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); - flag.release(); + (void)KMP_TEST_THEN_AND64( + (volatile kmp_int64 *)&thr_bar->b_arrived, + ~(thr_bar->leaf_state)); // clear leaf_state bits + } + // Next, wait for higher level children on each child's b_arrived flag + for (kmp_uint32 d = 1; d < thr_bar->my_level; + ++d) { // gather lowest level threads first, but skip 0 + kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], + skip = thr_bar->skip_per_level[d]; + if (last > nproc) + last = nproc; + for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " + "T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(child_thr); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " + "T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid)); + ANNOTATE_REDUCE_AFTER(reduce); + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + ANNOTATE_REDUCE_BEFORE(reduce); + ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); + } } - else { // Leaf does special release on the "offset" bits of parent's b_arrived flag - thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; - kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); - flag.set_waiter(other_threads[thr_bar->parent_tid]); - flag.release(); + } + } else { // Blocktime is not infinite + for (kmp_uint32 d = 0; d < thr_bar->my_level; + ++d) { // Gather lowest level threads first + kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1], + skip = thr_bar->skip_per_level[d]; + if (last > nproc) + last = nproc; + for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait " + "T#%d(%d:%d) " + "arrived(%p) == %llu\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(child_thr); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " + "T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid)); + ANNOTATE_REDUCE_AFTER(reduce); + (*reduce)(this_thr->th.th_local.reduce_data, + child_thr->th.th_local.reduce_data); + ANNOTATE_REDUCE_BEFORE(reduce); + ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); + } } - } else { // Master thread needs to update the team's b_arrived value - team->t.t_bar[bt].b_arrived = new_state; - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n", - gtid, team->t.t_id, tid, 
team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } } - // Is the team access below unsafe or just technically invalid? - KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + } + // All subordinates are gathered; now release parent if not master thread + + if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy + KA_TRACE( + 20, + ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %llu => %llu\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, + thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + /* Mark arrival to parent: After performing this write, a worker thread may + not assume that the team is valid any more - it could be deallocated by + the master thread at any time. */ + if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || + !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived + // flag; release it + ANNOTATE_BARRIER_BEGIN(this_thr); + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); + flag.release(); + } else { // Leaf does special release on the "offset" bits of parent's + // b_arrived flag + thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); + flag.set_waiter(other_threads[thr_bar->parent_tid]); + flag.release(); + } + } else { // Master thread needs to update the team's b_arrived value + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d " + "arrived(%p) = %llu\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + // Is the team access below unsafe or just technically invalid? 
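The leaf check-in above packs one byte per on-core child into the parent's 64-bit b_arrived word (byte 7-i for the i-th leaf, matching offset = 7-(tid-parent_tid-1)), so the parent waits for its whole leaf group on a single flag and then clears those bytes with an atomic AND. A minimal standalone model of that idea, using its own types and std::atomic rather than the kmp_* structures and flag classes (the real gather also folds each child's reduce_data into the parent's before clearing):

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const int kLeafKids = 3;             // hypothetical on-core children
  std::atomic<uint64_t> b_arrived{0};  // parent's arrival word

  // Expected "all leaves arrived" pattern: one byte per leaf child.
  uint64_t leaf_state = 0;
  for (int i = 0; i < kLeafKids; ++i)
    reinterpret_cast<unsigned char *>(&leaf_state)[7 - i] = 1;

  std::vector<std::thread> kids;
  for (int i = 0; i < kLeafKids; ++i)
    kids.emplace_back([&, i] {
      // Leaf "release to parent": set only my byte in the shared word.
      uint64_t mine = 0;
      reinterpret_cast<unsigned char *>(&mine)[7 - i] = 1;
      b_arrived.fetch_or(mine, std::memory_order_release);
    });

  // Parent: wait until every leaf byte is set, then clear them for reuse.
  while ((b_arrived.load(std::memory_order_acquire) & leaf_state) != leaf_state)
    ;
  b_arrived.fetch_and(~leaf_state, std::memory_order_acq_rel);

  for (auto &t : kids)
    t.join();
  std::printf("all %d leaf kids checked in\n", kLeafKids);
  return 0;
}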
+ KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } -static void -__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void * itt_sync_obj) ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; - register kmp_uint32 nproc; - bool team_change = false; // indicates on-core barrier shouldn't be used - - if (KMP_MASTER_TID(tid)) { - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n", - gtid, team->t.t_id, tid, bt)); - } - else { // Worker threads - // Wait for parent thread to release me - if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME - || thr_bar->my_level != 0 || thr_bar->team == NULL) { - // Use traditional method of waiting on my own b_go flag - thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; - kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); - flag.wait(this_thr, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - ANNOTATE_BARRIER_END(this_thr); - TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time - } - else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested - // Wait on my "offset" bits on parent's b_go flag - thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; - kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset, - bt, this_thr - USE_ITT_BUILD_ARG(itt_sync_obj) ); - flag.wait(this_thr, TRUE); - if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go - TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time - } - else { // Reset my bits on parent's b_go flag - ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0; - } - } - thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; - // Early exit for reaping threads releasing forkjoin barrier - if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) - return; - // The worker thread may now assume that the team is valid. - team = __kmp_threads[gtid]->th.th_team; - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); - - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); - KMP_MB(); // Flush all pending memory write invalidates. 
+static void __kmp_hierarchical_barrier_release( + enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release); + register kmp_team_t *team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_uint32 nproc; + bool team_change = false; // indicates on-core barrier shouldn't be used + + if (KMP_MASTER_TID(tid)) { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master " + "entered barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } else { // Worker threads + // Wait for parent thread to release me + if (!thr_bar->use_oncore_barrier || + __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 || + thr_bar->team == NULL) { + // Use traditional method of waiting on my own b_go flag + thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + ANNOTATE_BARRIER_END(this_thr); + TCW_8(thr_bar->b_go, + KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } else { // Thread barrier data is initialized, this is a leaf, blocktime is + // infinite, not nested + // Wait on my "offset" bits on parent's b_go flag + thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, + thr_bar->offset, bt, + this_thr USE_ITT_BUILD_ARG(itt_sync_obj)); + flag.wait(this_thr, TRUE); + if (thr_bar->wait_flag == + KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go + TCW_8(thr_bar->b_go, + KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } else { // Reset my bits on parent's b_go flag + ((char *)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0; + } } + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); - nproc = this_thr->th.th_team_nproc; - int level = team->t.t_level; + KA_TRACE( + 20, + ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + + nproc = this_thr->th.th_team_nproc; + int level = team->t.t_level; #if OMP_40_ENABLED - if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct? - if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level) - ++level; // level was not increased in teams construct for team_of_workers - if( this_thr->th.th_teams_size.nteams > 1 ) - ++level; // level was not increased in teams construct for team_of_masters - } + if (team->t.t_threads[0] + ->th.th_teams_microtask) { // are we inside the teams construct? 
+ if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && + this_thr->th.th_teams_level == level) + ++level; // level was not increased in teams construct for team_of_workers + if (this_thr->th.th_teams_size.nteams > 1) + ++level; // level was not increased in teams construct for team_of_masters + } #endif - if (level == 1) thr_bar->use_oncore_barrier = 1; - else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested - - // If the team size has increased, we still communicate with old leaves via oncore barrier. - unsigned short int old_leaf_kids = thr_bar->leaf_kids; - kmp_uint64 old_leaf_state = thr_bar->leaf_state; - team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); - // But if the entire team changes, we won't use oncore barrier at all - if (team_change) old_leaf_kids = 0; + if (level == 1) + thr_bar->use_oncore_barrier = 1; + else + thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + + // If the team size has increased, we still communicate with old leaves via + // oncore barrier. + unsigned short int old_leaf_kids = thr_bar->leaf_kids; + kmp_uint64 old_leaf_state = thr_bar->leaf_state; + team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, + tid, team); + // But if the entire team changes, we won't use oncore barrier at all + if (team_change) + old_leaf_kids = 0; #if KMP_BARRIER_ICV_PUSH - if (propagate_icvs) { - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); - if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy - copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); - } - else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime - if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) - // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, - &thr_bar->parent_bar->th_fixed_icvs); - // non-leaves will get ICVs piggybacked with b_go via NGO store - } - else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs - if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access - copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); - else // leaves copy parent's fixed ICVs directly to local ICV store - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, - &thr_bar->parent_bar->th_fixed_icvs); - } + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, + FALSE); + if (KMP_MASTER_TID( + tid)) { // master already has copy in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, + &team->t.t_implicit_task_taskdata[tid].td_icvs); + } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + thr_bar->use_oncore_barrier) { // optimization for inf blocktime + if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) + // leaves (on-core children) pull parent's fixed ICVs directly to local + // ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + // non-leaves will get ICVs piggybacked with b_go via NGO store + } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs + if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can + // access + copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); + else // 
leaves copy parent's fixed ICVs directly to local ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); } + } #endif // KMP_BARRIER_ICV_PUSH - // Now, release my children - if (thr_bar->my_level) { // not a leaf - register kmp_int32 child_tid; - kmp_uint32 last; - if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { - if (KMP_MASTER_TID(tid)) { // do a flat release - // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go. - thr_bar->b_go = KMP_BARRIER_STATE_BUMP; - // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line - ngo_load(&thr_bar->th_fixed_icvs); - // This loops over all the threads skipping only the leaf nodes in the hierarchy - for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) { - register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" - " go(%p): %u => %u\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Use ngo store (if available) to both store ICVs and release child via child's b_go - ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); - } - ngo_sync(); - } - TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time - // Now, release leaf children - if (thr_bar->leaf_kids) { // if there are any - // We test team_change on the off-chance that the level 1 team changed. - if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new - if (old_leaf_kids) { // release old leaf kids - thr_bar->b_go |= old_leaf_state; - } - // Release new leaf kids - last = tid+thr_bar->skip_per_level[1]; - if (last > nproc) last = nproc; - for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1 - register kmp_info_t *child_thr = team->t.t_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" - " T#%d(%d:%d) go(%p): %u => %u\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child using child's b_go flag - ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - } - } - else { // Release all children at once with leaf_state bits on my own b_go flag - thr_bar->b_go |= thr_bar->leaf_state; - } - } + // Now, release my children + if (thr_bar->my_level) { // not a leaf + register kmp_int32 child_tid; + kmp_uint32 last; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && + thr_bar->use_oncore_barrier) { + if (KMP_MASTER_TID(tid)) { // do a flat release + // Set local b_go to bump children via NGO store of the cache line + // containing IVCs and b_go. 
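In the on-core case, th_fixed_icvs and b_go share a cache line, so the flat release below can publish the ICVs and release a child with one NGO store (b_go rides in the last 8 bytes of the line). A simplified, hypothetical layout illustrating that idea; the real path uses non-temporal stores and the flag machinery rather than a plain memcpy:

#include <cstdint>
#include <cstring>

// One cache line: a stand-in ICV block followed by the go flag in the last
// 8 bytes, so a single line-sized store both publishes ICVs and releases.
struct alignas(64) go_line_t {
  unsigned char icvs[56]; // hypothetical fixed-ICV block
  uint64_t go;            // plays the role of b_go
};

static void release_child_with_icvs(go_line_t &parent, go_line_t &child,
                                    uint64_t bump) {
  parent.go = bump;                              // bump rides with the ICVs
  std::memcpy(&child, &parent, sizeof parent);   // stands in for ngo_store_go()
}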
+ thr_bar->b_go = KMP_BARRIER_STATE_BUMP; + // Use ngo stores if available; b_go piggybacks in the last 8 bytes of + // the cache line + ngo_load(&thr_bar->th_fixed_icvs); + // This loops over all the threads skipping only the leaf nodes in the + // hierarchy + for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc; + child_tid += thr_bar->skip_per_level[1]) { + register kmp_bstate_t *child_bar = + &team->t.t_threads[child_tid]->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " + "releasing T#%d(%d:%d)" + " go(%p): %u => %u\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Use ngo store (if available) to both store ICVs and release child + // via child's b_go + ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); } - else { // Blocktime is not infinite; do a simple hierarchical release - for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first - last = tid+thr_bar->skip_per_level[d+1]; - kmp_uint32 skip = thr_bar->skip_per_level[d]; - if (last > nproc) last = nproc; - for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { - register kmp_info_t *child_thr = team->t.t_threads[child_tid]; - register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" - " go(%p): %u => %u\n", - gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), - team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, - child_bar->b_go + KMP_BARRIER_STATE_BUMP)); - // Release child using child's b_go flag - ANNOTATE_BARRIER_BEGIN(child_thr); - kmp_flag_64 flag(&child_bar->b_go, child_thr); - flag.release(); - } - } + ngo_sync(); + } + TCW_8(thr_bar->b_go, + KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + // Now, release leaf children + if (thr_bar->leaf_kids) { // if there are any + // We test team_change on the off-chance that the level 1 team changed. 
+ if (team_change || + old_leaf_kids < thr_bar->leaf_kids) { // some old, some new + if (old_leaf_kids) { // release old leaf kids + thr_bar->b_go |= old_leaf_state; + } + // Release new leaf kids + last = tid + thr_bar->skip_per_level[1]; + if (last > nproc) + last = nproc; + for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last; + ++child_tid) { // skip_per_level[0]=1 + register kmp_info_t *child_thr = team->t.t_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE( + 20, + ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" + " T#%d(%d:%d) go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child using child's b_go flag + ANNOTATE_BARRIER_BEGIN(child_thr); + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } else { // Release all children at once with leaf_state bits on my own + // b_go flag + thr_bar->b_go |= thr_bar->leaf_state; } + } + } else { // Blocktime is not infinite; do a simple hierarchical release + for (int d = thr_bar->my_level - 1; d >= 0; + --d) { // Release highest level threads first + last = tid + thr_bar->skip_per_level[d + 1]; + kmp_uint32 skip = thr_bar->skip_per_level[d]; + if (last > nproc) + last = nproc; + for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) { + register kmp_info_t *child_thr = team->t.t_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) " + "releasing T#%d(%d:%d) go(%p): %u => %u\n", + gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child using child's b_go flag + ANNOTATE_BARRIER_BEGIN(child_thr); + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + } #if KMP_BARRIER_ICV_PUSH - if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); + if (propagate_icvs && !KMP_MASTER_TID(tid)) + // non-leaves copy ICVs from fixed ICVs to local dest + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->th_fixed_icvs); #endif // KMP_BARRIER_ICV_PUSH - } - KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt)); + } + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for " + "barrier type %d\n", + gtid, team->t.t_id, tid, bt)); } -// ---------------------------- End of Barrier Algorithms ---------------------------- + +// End of Barrier Algorithms // Internal function to do a barrier. /* If is_split is true, do a split barrier, otherwise, do a plain barrier - If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier + If reduce is non-NULL, do a split reduction barrier, otherwise, do a split + barrier Returns 0 if master thread, 1 if worker thread. 
*/ -int -__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, - void *reduce_data, void (*reduce)(void *, void *)) -{ - KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); - KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); - register int tid = __kmp_tid_from_gtid(gtid); - register kmp_info_t *this_thr = __kmp_threads[gtid]; - register kmp_team_t *team = this_thr->th.th_team; - register int status = 0; - ident_t *loc = __kmp_threads[gtid]->th.th_ident; +int __kmp_barrier(enum barrier_type bt, int gtid, int is_split, + size_t reduce_size, void *reduce_data, + void (*reduce)(void *, void *)) { + KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier); + KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); + register int tid = __kmp_tid_from_gtid(gtid); + register kmp_info_t *this_thr = __kmp_threads[gtid]; + register kmp_team_t *team = this_thr->th.th_team; + register int status = 0; + ident_t *loc = __kmp_threads[gtid]->th.th_ident; #if OMPT_SUPPORT - ompt_task_id_t my_task_id; - ompt_parallel_id_t my_parallel_id; + ompt_task_id_t my_task_id; + ompt_parallel_id_t my_parallel_id; #endif - KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, + __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); - ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); + ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); #if OMPT_SUPPORT - if (ompt_enabled) { + if (ompt_enabled) { #if OMPT_BLAME - my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id; - my_parallel_id = team->t.ompt_team_info.parallel_id; + my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id; + my_parallel_id = team->t.ompt_team_info.parallel_id; #if OMPT_TRACE - if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) { - if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) { - ompt_callbacks.ompt_callback(ompt_event_single_others_end)( - my_parallel_id, my_task_id); - } - } -#endif - if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( - my_parallel_id, my_task_id); - } + if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) { + if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) { + ompt_callbacks.ompt_callback(ompt_event_single_others_end)( + my_parallel_id, my_task_id); + } + } #endif - // It is OK to report the barrier state after the barrier begin callback. - // According to the OMPT specification, a compliant implementation may - // even delay reporting this state until the barrier begins to wait. - this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; + if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(my_parallel_id, + my_task_id); } #endif + // It is OK to report the barrier state after the barrier begin callback. + // According to the OMPT specification, a compliant implementation may + // even delay reporting this state until the barrier begins to wait. + this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; + } +#endif - if (! team->t.t_serialized) { + if (!team->t.t_serialized) { #if USE_ITT_BUILD - // This value will be used in itt notify events below. - void *itt_sync_obj = NULL; -# if USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); -# endif + // This value will be used in itt notify events below. 
+ void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); +#endif #endif /* USE_ITT_BUILD */ - if (__kmp_tasking_mode == tskm_extra_barrier) { - __kmp_tasking_barrier(team, this_thr, gtid); - KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); - } + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(15, + ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, + __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); + } - /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when - the team struct is not guaranteed to exist. */ - // See note about the corresponding code in __kmp_join_barrier() being performance-critical. - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + /* Copy the blocktime info to the thread, where __kmp_wait_template() can + access it when the team struct is not guaranteed to exist. */ + // See note about the corresponding code in __kmp_join_barrier() being + // performance-critical. + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { #if KMP_USE_MONITOR - this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + this_thr->th.th_team_bt_intervals = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; #else - this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); + this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); #endif - } + } #if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_starting(gtid, itt_sync_obj); + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ #if USE_DEBUGGER - // Let the debugger know: the thread arrived to the barrier and waiting. - if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure. - team->t.t_bar[bt].b_master_arrived += 1; - } else { - this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; - } // if + // Let the debugger know: the thread arrived to the barrier and waiting. + if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure. 
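The blocktime copy above exists so that the wait loop reads only thread-local fields; once a worker has signalled arrival, the team structure may be recycled underneath it. A tiny model of that snapshot, with hypothetical stand-in types rather than the kmp_* ICV structures:

// Snapshot per-team blocktime values into the thread before blocking, so the
// spin/sleep loop never dereferences the (possibly reclaimed) team struct.
struct team_icvs_t { int bt_intervals; bool bt_set; };
struct team_model_t { team_icvs_t icvs; };
struct thread_model_t { int team_bt_intervals; bool team_bt_set; };

static void snapshot_blocktime(thread_model_t &thr, const team_model_t &team) {
  thr.team_bt_intervals = team.icvs.bt_intervals; // copied up front ...
  thr.team_bt_set = team.icvs.bt_set;             // ... the wait uses these
}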
+ team->t.t_bar[bt].b_master_arrived += 1; + } else { + this_thr->th.th_bar[bt].bb.b_worker_arrived += 1; + } // if #endif /* USE_DEBUGGER */ - if (reduce != NULL) { - //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 - this_thr->th.th_local.reduce_data = reduce_data; - } + if (reduce != NULL) { + // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 + this_thr->th.th_local.reduce_data = reduce_data; + } - if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) - __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1 + if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec) + __kmp_task_team_setup( + this_thr, team, + 0); // use 0 to only setup the current team if nthreads > 1 - switch (__kmp_barrier_gather_pattern[bt]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear - __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj)); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear - __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } - } + switch (__kmp_barrier_gather_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits + // to 0; use linear + __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits + // to 0; use linear + __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, + reduce USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } - KMP_MB(); + KMP_MB(); - if (KMP_MASTER_TID(tid)) { - status = 0; - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_wait(this_thr, team - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } + if (KMP_MASTER_TID(tid)) { + status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); + } #if USE_DEBUGGER - // Let the debugger know: All threads are arrived and starting leaving the barrier. - team->t.t_bar[bt].b_team_arrived += 1; + // Let the debugger know: All threads are arrived and starting leaving the + // barrier. + team->t.t_bar[bt].b_team_arrived += 1; #endif #if OMP_40_ENABLED // Reset cancellation flag for worksharing constructs - if(team->t.t_cancel_request == cancel_loop || - team->t.t_cancel_request == cancel_sections ) { + if (team->t.t_cancel_request == cancel_loop || + team->t.t_cancel_request == cancel_sections) { team->t.t_cancel_request = cancel_noreq; } #endif #if USE_ITT_BUILD - /* TODO: In case of split reduction barrier, master thread may send acquired event early, - before the final summation into the shared variable is done (final summation can be a - long operation for array reductions). 
*/ - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); + /* TODO: In case of split reduction barrier, master thread may send + acquired event early, before the final summation into the shared + variable is done (final summation can be a long operation for array + reductions). */ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier - report frame end (only if active_level == 1) - if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && + // Barrier - report frame end (only if active_level == 1) + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode && #if OMP_40_ENABLED - this_thr->th.th_teams_microtask == NULL && + this_thr->th.th_teams_microtask == NULL && #endif - team->t.t_active_level == 1) - { - kmp_uint64 cur_time = __itt_get_timestamp(); - kmp_info_t **other_threads = team->t.t_threads; - int nproc = this_thr->th.th_team_nproc; - int i; - switch(__kmp_forkjoin_frames_mode) { - case 1: - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - this_thr->th.th_frame_time = cur_time; - break; - case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed) - __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); - break; - case 3: - if( __itt_metadata_add_ptr ) { - // Initialize with master's wait time - kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; - // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below - this_thr->th.th_bar_arrive_time = 0; - for (i=1; i<nproc; ++i) { - delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); - other_threads[i]->th.th_bar_arrive_time = 0; - } - __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL)); - } - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - this_thr->th.th_frame_time = cur_time; - break; - } + team->t.t_active_level == 1) { + kmp_uint64 cur_time = __itt_get_timestamp(); + kmp_info_t **other_threads = team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + switch (__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + case 2: // AC 2015-01-19: currently does not work for hierarchical (to + // be fixed) + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, + 1, loc, nproc); + break; + case 3: + if (__itt_metadata_add_ptr) { + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + // Set arrive time to zero to be able to check it in + // __kmp_invoke_task(); the same is done inside the loop below + this_thr->th.th_bar_arrive_time = 0; + for (i = 1; i < nproc; ++i) { + delta += (cur_time - other_threads[i]->th.th_bar_arrive_time); + other_threads[i]->th.th_bar_arrive_time = 0; } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, + cur_time, delta, + (kmp_uint64)(reduce != NULL)); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } #endif /* USE_ITT_BUILD */ - } else { - status = 1; + } else { + status = 1; #if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - 
__kmp_itt_barrier_middle(gtid, itt_sync_obj); + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ - } - if (status == 1 || ! is_split) { - switch (__kmp_barrier_release_pattern[bt]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } - } - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_sync(this_thr, team); - } - } + } + if (status == 1 || !is_split) { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release( + bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } + } #if USE_ITT_BUILD - /* GEH: TODO: Move this under if-condition above and also include in - __kmp_end_split_barrier(). This will more accurately represent the actual release time - of the threads for split barriers. */ - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_finished(gtid, itt_sync_obj); + /* GEH: TODO: Move this under if-condition above and also include in + __kmp_end_split_barrier(). This will more accurately represent the actual + release time of the threads for split barriers. */ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ - } else { // Team is serialized. - status = 0; - if (__kmp_tasking_mode != tskm_immediate_exec) { + } else { // Team is serialized. 
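For a split reduction barrier, the status/is_split logic above lets the master return right after the gather, finish the reduction, and only later release the team through __kmp_end_split_barrier, while workers enter the release phase inside __kmp_barrier and wait there. A self-contained model of that control flow, with trivial stand-ins for the gather/release pattern calls:

#include <cstdio>

static void pattern_gather()  { std::puts("gather phase"); }
static void pattern_release() { std::puts("release phase"); }

// Mirrors the status/is_split logic: returns 0 for the master, 1 for workers.
static int model_barrier(bool is_master, bool is_split) {
  pattern_gather();                 // every thread participates in the gather
  int status = is_master ? 0 : 1;
  if (status == 1 || !is_split)     // workers always enter the release phase
    pattern_release();              // (and wait); the master skips it when the
  return status;                    // barrier is split
}

// Deferred master-side release, as in __kmp_end_split_barrier above.
static void model_end_split_barrier(bool is_master) {
  if (is_master)
    pattern_release();
}

int main() {
  if (model_barrier(/*is_master=*/true, /*is_split=*/true) == 0) {
    // master: finish the (hypothetical) reduction here, then release the team
    model_end_split_barrier(/*is_master=*/true);
  }
  return 0;
}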
+ status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) { #if OMP_45_ENABLED - if ( this_thr->th.th_task_team != NULL ) { - void *itt_sync_obj = NULL; + if (this_thr->th.th_task_team != NULL) { + void *itt_sync_obj = NULL; #if USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); - __kmp_itt_barrier_starting(gtid, itt_sync_obj); - } + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); + __kmp_itt_barrier_starting(gtid, itt_sync_obj); + } #endif - KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE); - __kmp_task_team_wait(this_thr, team - USE_ITT_BUILD_ARG(itt_sync_obj)); - __kmp_task_team_setup(this_thr, team, 0); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == + TRUE); + __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); + __kmp_task_team_setup(this_thr, team, 0); #if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_finished(gtid, itt_sync_obj); + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ - } + } #else - // The task team should be NULL for serialized code (tasks will be executed immediately) - KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL); - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL); + // The task team should be NULL for serialized code (tasks will be + // executed immediately) + KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL); #endif - } } - KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status)); + } + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, + __kmp_tid_from_gtid(gtid), status)); #if OMPT_SUPPORT - if (ompt_enabled) { + if (ompt_enabled) { #if OMPT_BLAME - if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_end)( - my_parallel_id, my_task_id); - } -#endif - this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; + if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_end)(my_parallel_id, + my_task_id); } #endif - ANNOTATE_BARRIER_END(&team->t.t_bar); + this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; + } +#endif + ANNOTATE_BARRIER_END(&team->t.t_bar); - return status; + return status; } - -void -__kmp_end_split_barrier(enum barrier_type bt, int gtid) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); - KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); - int tid = __kmp_tid_from_gtid(gtid); - kmp_info_t *this_thr = __kmp_threads[gtid]; - kmp_team_t *team = this_thr->th.th_team; - - ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); - if (!team->t.t_serialized) { - if (KMP_MASTER_GTID(gtid)) { - switch (__kmp_barrier_release_pattern[bt]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL)); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); - __kmp_tree_barrier_release(bt, 
this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL) ); - break; - } - default: { - __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG(NULL) ); - } - } - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_sync(this_thr, team); - } // if - } +void __kmp_end_split_barrier(enum barrier_type bt, int gtid) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier); + KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER); + int tid = __kmp_tid_from_gtid(gtid); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + + ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); + if (!team->t.t_serialized) { + if (KMP_MASTER_GTID(gtid)) { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, + FALSE USE_ITT_BUILD_ARG(NULL)); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } // if } - ANNOTATE_BARRIER_END(&team->t.t_bar); + } + ANNOTATE_BARRIER_END(&team->t.t_bar); } - -void -__kmp_join_barrier(int gtid) -{ - KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); - KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); - register kmp_info_t *this_thr = __kmp_threads[gtid]; - register kmp_team_t *team; - register kmp_uint nproc; - kmp_info_t *master_thread; - int tid; +void __kmp_join_barrier(int gtid) { + KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier); + KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); + register kmp_info_t *this_thr = __kmp_threads[gtid]; + register kmp_team_t *team; + register kmp_uint nproc; + kmp_info_t *master_thread; + int tid; #ifdef KMP_DEBUG - int team_id; + int team_id; #endif /* KMP_DEBUG */ #if USE_ITT_BUILD - void *itt_sync_obj = NULL; -# if USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need - // Get object created at fork_barrier - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); -# endif + void *itt_sync_obj = NULL; +#if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need + // Get object created at fork_barrier + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); +#endif #endif /* USE_ITT_BUILD */ - KMP_MB(); + KMP_MB(); - // Get current info - team = this_thr->th.th_team; - nproc = this_thr->th.th_team_nproc; - KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); - tid = __kmp_tid_from_gtid(gtid); + // Get current info + team = this_thr->th.th_team; + nproc = this_thr->th.th_team_nproc; + KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); + tid = __kmp_tid_from_gtid(gtid); #ifdef KMP_DEBUG - team_id = team->t.t_id; + team_id = team->t.t_id; #endif /* KMP_DEBUG */ - master_thread = this_thr->th.th_team_master; + master_thread = this_thr->th.th_team_master; #ifdef KMP_DEBUG - if (master_thread != team->t.t_threads[0]) { - __kmp_print_structure(); - } + if (master_thread != team->t.t_threads[0]) { + __kmp_print_structure(); + } #endif /* KMP_DEBUG */ - KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); - KMP_MB(); - - 
// Verify state - KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); - KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); - KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); - KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid)); - - ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); + KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); + KMP_MB(); + + // Verify state + KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); + KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", + gtid, team_id, tid)); + + ANNOTATE_BARRIER_BEGIN(&team->t.t_bar); #if OMPT_SUPPORT #if OMPT_TRACE - if (ompt_enabled && - ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( - team->t.ompt_team_info.parallel_id, - team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); - } + if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_begin)( + team->t.ompt_team_info.parallel_id, + team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); + } #endif - this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; + this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier; #endif - if (__kmp_tasking_mode == tskm_extra_barrier) { - __kmp_tasking_barrier(team, this_thr, gtid); - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid)); - } -# ifdef KMP_DEBUG - if (__kmp_tasking_mode != tskm_immediate_exec) { - KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n", - __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state], - this_thr->th.th_task_team)); - KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]); - } -# endif /* KMP_DEBUG */ + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, + team_id, tid)); + } +#ifdef KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = " + "%p, th_task_team = %p\n", + __kmp_gtid_from_thread(this_thr), team_id, + team->t.t_task_team[this_thr->th.th_task_state], + this_thr->th.th_task_team)); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == + team->t.t_task_team[this_thr->th.th_task_state]); + } +#endif /* KMP_DEBUG */ - /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the - team struct is not guaranteed to exist. Doing these loads causes a cache miss slows - down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite, - since the values are not used by __kmp_wait_template() in that case. */ - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + /* Copy the blocktime info to the thread, where __kmp_wait_template() can + access it when the team struct is not guaranteed to exist. Doing these + loads causes a cache miss slows down EPCC parallel by 2x. As a workaround, + we do not perform the copy if blocktime=infinite, since the values are not + used by __kmp_wait_template() in that case. 
*/ + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { #if KMP_USE_MONITOR - this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + this_thr->th.th_team_bt_intervals = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; #else - this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); + this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); #endif - } + } #if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_starting(gtid, itt_sync_obj); + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ - switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); - __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); - __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } + switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, + NULL USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + + /* From this point on, the team data structure may be deallocated at any time + by the master thread - it is unsafe to reference it in any of the worker + threads. Any per-team data items that need to be referenced before the + end of the barrier should be moved to the kmp_task_team_t structs. */ + if (KMP_MASTER_TID(tid)) { + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj)); } - - /* From this point on, the team data structure may be deallocated at any time by the - master thread - it is unsafe to reference it in any of the worker threads. Any per-team - data items that need to be referenced before the end of the barrier should be moved to - the kmp_task_team_t structs. */ - if (KMP_MASTER_TID(tid)) { - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_wait(this_thr, team - USE_ITT_BUILD_ARG(itt_sync_obj) ); - } #if KMP_STATS_ENABLED - // Have master thread flag the workers to indicate they are now waiting for - // next parallel region, Also wake them up so they switch their timers to idle. 
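
// Aside (illustrative only): the switch on
// __kmp_barrier_gather_pattern[bs_forkjoin_barrier] above, like the release
// switches elsewhere in this file, is a per-barrier-type dispatch over four
// algorithms.  The sketch below models just that dispatch; bar_pattern, the
// gather_*() stubs and barrier_gather() are hypothetical names, and the
// enumerator order is an assumption, not a statement about kmp_bar_pat_e.
enum bar_pattern { pat_linear, pat_tree, pat_hyper, pat_hierarchical };

static void gather_linear(int /*gtid*/) { /* master polls each worker flag */ }
static void gather_tree(int /*gtid*/)   { /* children fold into parents    */ }
static void gather_hyper(int /*gtid*/)  { /* hypercube pairwise exchange   */ }
static void gather_hier(int /*gtid*/)   { /* follows the machine topology  */ }

static void barrier_gather(bar_pattern pat, int gtid) {
  // Mirrors the structure of the real switch: the named patterns first,
  // with the linear algorithm as the default fallback.
  switch (pat) {
  case pat_hyper:        gather_hyper(gtid); break;
  case pat_hierarchical: gather_hier(gtid);  break;
  case pat_tree:         gather_tree(gtid);  break;
  default:               gather_linear(gtid); break;
  }
}
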
- for (int i=0; i<team->t.t_nproc; ++i) { - kmp_info_t* team_thread = team->t.t_threads[i]; - if (team_thread == this_thr) - continue; - team_thread->th.th_stats->setIdleFlag(); - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL) - __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc); - } + // Have master thread flag the workers to indicate they are now waiting for + // next parallel region, Also wake them up so they switch their timers to + // idle. + for (int i = 0; i < team->t.t_nproc; ++i) { + kmp_info_t *team_thread = team->t.t_threads[i]; + if (team_thread == this_thr) + continue; + team_thread->th.th_stats->setIdleFlag(); + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && + team_thread->th.th_sleep_loc != NULL) + __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), + team_thread->th.th_sleep_loc); + } #endif #if USE_ITT_BUILD - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ -# if USE_ITT_BUILD && USE_ITT_NOTIFY - // Join barrier - report frame end - if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Join barrier - report frame end + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && + __kmp_forkjoin_frames_mode && #if OMP_40_ENABLED - this_thr->th.th_teams_microtask == NULL && + this_thr->th.th_teams_microtask == NULL && #endif - team->t.t_active_level == 1) - { - kmp_uint64 cur_time = __itt_get_timestamp(); - ident_t * loc = team->t.t_ident; - kmp_info_t **other_threads = team->t.t_threads; - int nproc = this_thr->th.th_team_nproc; - int i; - switch(__kmp_forkjoin_frames_mode) { - case 1: - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - break; - case 2: - __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); - break; - case 3: - if( __itt_metadata_add_ptr ) { - // Initialize with master's wait time - kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; - // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below - this_thr->th.th_bar_arrive_time = 0; - for (i=1; i<nproc; ++i) { - delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); - other_threads[i]->th.th_bar_arrive_time = 0; - } - __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0); - } - __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); - this_thr->th.th_frame_time = cur_time; - break; - } + team->t.t_active_level == 1) { + kmp_uint64 cur_time = __itt_get_timestamp(); + ident_t *loc = team->t.t_ident; + kmp_info_t **other_threads = team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + switch (__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + break; + case 2: + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, + loc, nproc); + break; + case 3: + if (__itt_metadata_add_ptr) { + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + // Set arrive time to zero to be able to check it in + // __kmp_invoke_task(); the same is done inside the loop below + this_thr->th.th_bar_arrive_time = 0; + for (i = 1; i < nproc; ++i) { + delta += (cur_time 
- other_threads[i]->th.th_bar_arrive_time); + other_threads[i]->th.th_bar_arrive_time = 0; + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, + cur_time, delta, 0); } -# endif /* USE_ITT_BUILD */ + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, + loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } } +#endif /* USE_ITT_BUILD */ + } #if USE_ITT_BUILD - else { - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) - __kmp_itt_barrier_middle(gtid, itt_sync_obj); - } + else { + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); + } #endif /* USE_ITT_BUILD */ #if KMP_DEBUG - if (KMP_MASTER_TID(tid)) { - KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", - gtid, team_id, tid, nproc)); - } + if (KMP_MASTER_TID(tid)) { + KA_TRACE( + 15, + ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", + gtid, team_id, tid, nproc)); + } #endif /* KMP_DEBUG */ - // TODO now, mark worker threads as done so they may be disbanded - KMP_MB(); // Flush all pending memory write invalidates. - KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); + // TODO now, mark worker threads as done so they may be disbanded + KMP_MB(); // Flush all pending memory write invalidates. + KA_TRACE(10, + ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); #if OMPT_SUPPORT - if (ompt_enabled) { + if (ompt_enabled) { #if OMPT_BLAME - if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { - ompt_callbacks.ompt_callback(ompt_event_barrier_end)( - team->t.ompt_team_info.parallel_id, - team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); - } + if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) { + ompt_callbacks.ompt_callback(ompt_event_barrier_end)( + team->t.ompt_team_info.parallel_id, + team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id); + } #endif - // return to default state - this_thr->th.ompt_thread_info.state = ompt_state_overhead; - } + // return to default state + this_thr->th.ompt_thread_info.state = ompt_state_overhead; + } #endif - ANNOTATE_BARRIER_END(&team->t.t_bar); + ANNOTATE_BARRIER_END(&team->t.t_bar); } - -// TODO release worker threads' fork barriers as we are ready instead of all at once -void -__kmp_fork_barrier(int gtid, int tid) -{ - KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); - KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); - kmp_info_t *this_thr = __kmp_threads[gtid]; - kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; +// TODO release worker threads' fork barriers as we are ready instead of all at +// once +void __kmp_fork_barrier(int gtid, int tid) { + KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier); + KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; #if USE_ITT_BUILD - void * itt_sync_obj = NULL; + void *itt_sync_obj = NULL; #endif /* USE_ITT_BUILD */ - if (team) - ANNOTATE_BARRIER_END(&team->t.t_bar); + if (team) + ANNOTATE_BARRIER_END(&team->t.t_bar); - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", - gtid, (team != NULL) ? team->t.t_id : -1, tid)); + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, + (team != NULL) ? 
team->t.t_id : -1, tid)); - // th_team pointer only valid for master thread here - if (KMP_MASTER_TID(tid)) { + // th_team pointer only valid for master thread here + if (KMP_MASTER_TID(tid)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - // Create itt barrier object - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); - __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing - } + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + // Create itt barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); + __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing + } #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ #ifdef KMP_DEBUG - register kmp_info_t **other_threads = team->t.t_threads; - register int i; - - // Verify state - KMP_MB(); - - for(i=1; i<team->t.t_nproc; ++i) { - KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n", - gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, - team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, - other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); - KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) - & ~(KMP_BARRIER_SLEEP_STATE)) - == KMP_INIT_BARRIER_STATE); - KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); - } + register kmp_info_t **other_threads = team->t.t_threads; + register int i; + + // Verify state + KMP_MB(); + + for (i = 1; i < team->t.t_nproc; ++i) { + KA_TRACE(500, + ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go " + "== %u.\n", + gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, + team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, + other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); + KMP_DEBUG_ASSERT( + (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) & + ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE); + KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); + } #endif - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1 - } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_setup( + this_thr, team, + 0); // 0 indicates setup current task team if nthreads > 1 + } - /* The master thread may have changed its blocktime between the join barrier and the - fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can - access it when the team struct is not guaranteed to exist. */ - // See note about the corresponding code in __kmp_join_barrier() being performance-critical - if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + /* The master thread may have changed its blocktime between the join barrier + and the fork barrier. Copy the blocktime info to the thread, where + __kmp_wait_template() can access it when the team struct is not + guaranteed to exist. 
*/ + // See note about the corresponding code in __kmp_join_barrier() being + // performance-critical + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { #if KMP_USE_MONITOR - this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + this_thr->th.th_team_bt_intervals = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = + team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; #else - this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); + this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(); #endif - } - } // master - - switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { - case bp_hyper_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); - __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_hierarchical_bar: { - __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - case bp_tree_bar: { - KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); - __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); - break; - } - default: { - __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) ); } - } - - // Early exit for reaping threads releasing forkjoin barrier - if (TCR_4(__kmp_global.g.g_done)) { - this_thr->th.th_task_team = NULL; + } // master + + switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + default: { + __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, + TRUE USE_ITT_BUILD_ARG(itt_sync_obj)); + } + } + + // Early exit for reaping threads releasing forkjoin barrier + if (TCR_4(__kmp_global.g.g_done)) { + this_thr->th.th_task_team = NULL; #if USE_ITT_BUILD && USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - if (!KMP_MASTER_TID(tid)) { - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - if (itt_sync_obj) - __kmp_itt_barrier_finished(gtid, itt_sync_obj); - } - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); - return; + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); + } } - - /* We can now assume that a valid team structure has been allocated by the master and - propagated to all worker threads. The current thread, however, may not be part of the - team, so we can't blindly assume that the team pointer is non-null. 
*/ - team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); - KMP_DEBUG_ASSERT(team != NULL); - tid = __kmp_tid_from_gtid(gtid); - +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); + return; + } + + /* We can now assume that a valid team structure has been allocated by the + master and propagated to all worker threads. The current thread, however, + may not be part of the team, so we can't blindly assume that the team + pointer is non-null. */ + team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); #if KMP_BARRIER_ICV_PULL - /* Master thread's copy of the ICVs was set up on the implicit taskdata in - __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has - this data before this function is called. We cannot modify __kmp_fork_call() to look at - the fixed ICVs in the master's thread struct, because it is not always the case that the - threads arrays have been allocated when __kmp_fork_call() is executed. */ - { - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); - if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs - // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); - __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); - copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, - &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs); - } + /* Master thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's + implicit task has this data before this function is called. We cannot + modify __kmp_fork_call() to look at the fixed ICVs in the master's thread + struct, because it is not always the case that the threads arrays have + been allocated when __kmp_fork_call() is executed. */ + { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy); + if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs + // Copy the initial ICVs from the master's thread struct to the implicit + // task for this tid. 
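
// Aside (illustrative only): the copy_icvs() call just below is the
// worker-side half of a publish-then-pull handoff: the master stores the
// ICVs into its th_fixed_icvs slot before the fork-barrier release, so a
// worker that has observed the release can safely read them.  The sketch
// below reduces that idea to std::atomic release/acquire; team_sketch,
// icvs_sketch and both functions are hypothetical, and the real runtime
// uses its own flag objects and memory fences rather than <atomic>.
#include <atomic>

struct icvs_sketch { int nproc; int max_active_levels; };

struct team_sketch {
  icvs_sketch fixed_icvs;        // written only by the master
  std::atomic<unsigned> b_go{0}; // stands in for the fork-barrier go flag
};

// Master: publish the ICVs, then bump the go flag with release semantics.
static void publish_and_release(team_sketch &t, const icvs_sketch &v) {
  t.fixed_icvs = v;
  t.b_go.fetch_add(1, std::memory_order_release);
}

// Worker: wait for the go flag with acquire semantics, then pull a copy.
static icvs_sketch wait_and_pull(const team_sketch &t, unsigned expected) {
  while (t.b_go.load(std::memory_order_acquire) < expected) {
    // spin; the real runtime backs off and may put the thread to sleep here
  }
  return t.fixed_icvs; // safe to copy after the acquire load
}
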
+ KA_TRACE(10, + ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, + tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &team->t.t_threads[0] + ->th.th_bar[bs_forkjoin_barrier] + .bb.th_fixed_icvs); } + } #endif // KMP_BARRIER_ICV_PULL - if (__kmp_tasking_mode != tskm_immediate_exec) { - __kmp_task_team_sync(this_thr, team); - } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - kmp_proc_bind_t proc_bind = team->t.t_proc_bind; - if (proc_bind == proc_bind_intel) { + kmp_proc_bind_t proc_bind = team->t.t_proc_bind; + if (proc_bind == proc_bind_intel) { #endif #if KMP_AFFINITY_SUPPORTED - // Call dynamic affinity settings - if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { - __kmp_balanced_affinity(tid, team->t.t_nproc); - } + // Call dynamic affinity settings + if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { + __kmp_balanced_affinity(tid, team->t.t_nproc); + } #endif // KMP_AFFINITY_SUPPORTED #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED + } else if (proc_bind != proc_bind_false) { + if (this_thr->th.th_new_place == this_thr->th.th_current_place) { + KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", + __kmp_gtid_from_thread(this_thr), + this_thr->th.th_current_place)); + } else { + __kmp_affinity_set_place(gtid); } - else if (proc_bind != proc_bind_false) { - if (this_thr->th.th_new_place == this_thr->th.th_current_place) { - KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", - __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place)); - } - else { - __kmp_affinity_set_place(gtid); - } - } + } #endif #if USE_ITT_BUILD && USE_ITT_NOTIFY - if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { - if (!KMP_MASTER_TID(tid)) { - // Get correct barrier object - itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); - __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired - } // (prepare called inside barrier_release) - } + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + // Get correct barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired + } // (prepare called inside barrier_release) + } #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - ANNOTATE_BARRIER_END(&team->t.t_bar); - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid)); + ANNOTATE_BARRIER_END(&team->t.t_bar); + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, + team->t.t_id, tid)); } +void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, + kmp_internal_control_t *new_icvs, ident_t *loc) { + KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); -void -__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc ) -{ - KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy); - - KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); - KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); + KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); + KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); - /* Master thread's copy of the ICVs was set up on the implicit taskdata in - __kmp_reinitialize_team. 
__kmp_fork_call() assumes the master thread's implicit task has - this data before this function is called. */ +/* Master thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's + implicit task has this data before this function is called. */ #if KMP_BARRIER_ICV_PULL - /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where - all of the worker threads can access them and make their own copies after the barrier. */ - KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point - copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs); - KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team)); + /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains + untouched), where all of the worker threads can access them and make their + own copies after the barrier. */ + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be + // allocated at this point + copy_icvs( + &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, + new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, + team->t.t_threads[0], team)); #elif KMP_BARRIER_ICV_PUSH - // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. - KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team)); + // The ICVs will be propagated in the fork barrier, so nothing needs to be + // done here. + KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, + team->t.t_threads[0], team)); #else - // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. - ngo_load(new_icvs); - KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point - for (int f=1; f<new_nproc; ++f) { // Skip the master thread - // TODO: GEH - pass in better source location info since usually NULL here - KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", - f, team->t.t_threads[f], team)); - __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); - ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); - KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", - f, team->t.t_threads[f], team)); - } - ngo_sync(); + // Copy the ICVs to each of the non-master threads. This takes O(nthreads) + // time. + ngo_load(new_icvs); + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be + // allocated at this point + for (int f = 1; f < new_nproc; ++f) { // Skip the master thread + // TODO: GEH - pass in better source location info since usually NULL here + KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", + f, team->t.t_threads[f], team)); + __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", + f, team->t.t_threads[f], team)); + } + ngo_sync(); #endif // KMP_BARRIER_ICV_PULL } |
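
// Aside (illustrative only): the three compile-time branches of
// __kmp_setup_icv_copy above trade setup cost against fork-barrier cost.
// The sketch below reduces them to copy counts; icv_strategy, icvs_sketch2
// and setup_copies() are hypothetical names, not runtime APIs.
struct icvs_sketch2 { int nproc; int blocktime; };
enum icv_strategy { icv_pull, icv_push, icv_linear };

// Number of ICV copies performed by the setup step itself:
//   pull   -> 1 copy into the master's fixed slot; each worker then copies
//             for itself at the fork barrier (see KMP_BARRIER_ICV_PULL),
//             so the O(nthreads) work is spread across the workers.
//   push   -> 0 copies here; roughly speaking, the releasing thread hands
//             the ICVs to the threads it releases during the fork barrier.
//   linear -> nthreads - 1 copies right away (the ngo_store_icvs loop).
static int setup_copies(icv_strategy s, int nthreads,
                        const icvs_sketch2 &master, icvs_sketch2 *slots) {
  switch (s) {
  case icv_pull:
    slots[0] = master; // publish once; workers pull after release
    return 1;
  case icv_push:
    return 0;          // propagation happens during the release phase
  case icv_linear:
  default:
    for (int f = 1; f < nthreads; ++f)
      slots[f] = master;
    return nthreads - 1;
  }
}
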