Diffstat (limited to 'openmp/runtime/src/kmp_runtime.c')
-rw-r--r-- | openmp/runtime/src/kmp_runtime.c | 707
1 file changed, 257 insertions(+), 450 deletions(-)
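The main behavioral change in this patch is how fork/join and barrier frames are reported to ITT: instead of writing rows to a CSV file, the runtime now calls __kmp_itt_frame_submit() directly from __kmp_barrier() and __kmp_join_barrier(), choosing the frame start time from __kmp_forkjoin_frames_mode (1 = fork-to-barrier frames keyed on th_frame_time, 2 = barrier-imbalance frames keyed on th_bar_arrive_time, 3 = both). The standalone C sketch below models only that mode dispatch; the stub names (get_timestamp, frame_submit, report_barrier_frames), the sample timestamp values, and the main() driver are illustrative stand-ins, not part of the runtime.

/* Simplified, standalone model of the frame-reporting dispatch added in
 * __kmp_barrier()/__kmp_join_barrier().  The timestamp/submit functions are
 * stubs; only the mode-selection logic mirrors the patch. */
#include <stdio.h>
#include <stdint.h>

static int      forkjoin_frames_mode = 3;   /* stands in for __kmp_forkjoin_frames_mode */
static uint64_t frame_time           = 100; /* stands in for th.th_frame_time (region begin) */
static uint64_t bar_arrive_time      = 180; /* stands in for th.th_bar_arrive_time (min arrive) */

static uint64_t get_timestamp(void) { return 250; }  /* stands in for __itt_get_timestamp() */

/* stands in for __kmp_itt_frame_submit( gtid, begin, end, imbalance, loc ) */
static void frame_submit(uint64_t begin, uint64_t end, int imbalance) {
    printf("frame [%llu, %llu] imbalance=%d\n",
           (unsigned long long)begin, (unsigned long long)end, imbalance);
}

static void report_barrier_frames(void) {
    uint64_t now = get_timestamp();
    switch (forkjoin_frames_mode) {
    case 1:                                  /* fork-to-barrier frame only */
        frame_submit(frame_time, now, 0);
        frame_time = now;
        break;
    case 2:                                  /* barrier-imbalance frame only */
        frame_submit(bar_arrive_time, now, 1);
        break;
    case 3:                                  /* both kinds of frames */
        frame_submit(frame_time, now, 0);
        frame_submit(bar_arrive_time, now, 1);
        frame_time = now;
        break;
    }
}

int main(void) {
    report_barrier_frames();
    return 0;
}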
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c index 7d66b9bc304..37c372bd89e 100644 --- a/openmp/runtime/src/kmp_runtime.c +++ b/openmp/runtime/src/kmp_runtime.c @@ -1,7 +1,7 @@ /* * kmp_runtime.c -- KPTS runtime support library - * $Revision: 42642 $ - * $Date: 2013-09-06 01:57:24 -0500 (Fri, 06 Sep 2013) $ + * $Revision: 42839 $ + * $Date: 2013-11-24 13:01:00 -0600 (Sun, 24 Nov 2013) $ */ @@ -88,6 +88,8 @@ char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: " #endif /* KMP_DEBUG */ +#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) ) + /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ @@ -472,8 +474,7 @@ __kmp_wait_sleep( kmp_info_t *this_thr, __kmp_unref_task_team( task_team, this_thr ); } else if ( KMP_TASKING_ENABLED( task_team, this_thr->th.th_task_state ) ) { __kmp_execute_tasks( this_thr, th_gtid, spin, check, final_spin, &flag - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); + USE_ITT_BUILD_ARG( itt_sync_obj ), 0); } }; // if }; // if @@ -994,7 +995,7 @@ DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) { } # endif /* KMP_OS_WINDOWS */ -#endif /* GUIDEDLL_EXPORTS +#endif /* GUIDEDLL_EXPORTS */ /* ------------------------------------------------------------------------ */ @@ -1190,10 +1191,8 @@ __kmp_linear_barrier_gather( enum barrier_type bt, register kmp_balign_team_t *team_bar = & team -> t.t_bar[ bt ]; register int nproc = this_thr -> th.th_team_nproc; register int i; - register kmp_uint new_state; - /* Don't have to worry about sleep bit here or atomic since team setting */ - new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP; + register kmp_uint new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP; /* Collect all the worker team member threads. 
*/ for (i = 1; i < nproc; i++) { @@ -1341,7 +1340,7 @@ __kmp_tree_barrier_gather( enum barrier_type bt, /* Need to update the team arrived pointer if we are the master thread */ if ( nproc > 1 ) - /* New value was already computed in above loop */ + /* New value was already computed above */ team -> t.t_bar[ bt ].b_arrived = new_state; else team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP; @@ -1380,6 +1379,12 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) { + this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); + } +#endif /* * We now perform a hypercube-embedded tree gather to wait until all * of the threads have arrived, and reduce any required data @@ -1417,6 +1422,9 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, /* parent threads wait for children to arrive */ + if (new_state == KMP_BARRIER_UNUSED_STATE) + new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP; + for ( child = 1, child_tid = tid + (1 << level); child < branch_factor && child_tid < num_threads; child++, child_tid += (1 << level) ) @@ -1429,10 +1437,6 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, if ( child+1 < branch_factor && next_child_tid < num_threads ) KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ] -> th.th_bar[ bt ].bb.b_arrived ); #endif /* KMP_CACHE_MANAGE */ - /* Only read this arrived flag once per thread that needs it */ - if (new_state == KMP_BARRIER_UNUSED_STATE) - new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP; - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " "arrived(%p) == %u\n", gtid, team->t.t_id, tid, @@ -1444,6 +1448,12 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, USE_ITT_BUILD_ARG (itt_sync_obj) ); +#if USE_ITT_BUILD + // Barrier imbalance - write min of the thread time and a child time to the thread. + if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) { + this_thr->th.th_bar_arrive_time = KMP_MIN( this_thr->th.th_bar_arrive_time, child_thr->th.th_bar_arrive_time ); + } +#endif if (reduce) { KA_TRACE( 100, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", @@ -1729,7 +1739,6 @@ __kmp_tree_barrier_release( enum barrier_type bt, /* The reverse versions seem to beat the forward versions overall */ #define KMP_REVERSE_HYPER_BAR -#ifdef KMP_REVERSE_HYPER_BAR static void __kmp_hyper_barrier_release( enum barrier_type bt, kmp_info_t *this_thr, @@ -1751,15 +1760,13 @@ __kmp_hyper_barrier_release( enum barrier_type bt, register kmp_uint32 offset; register kmp_uint32 level; - /* - * We now perform a hypercube-embedded tree release for all - * of the threads that have been gathered, but in the exact - * reverse order from the corresponding gather (for load balance. - */ + /* Perform a hypercube-embedded tree release for all of the threads + that have been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) + the threads are released in the reverse order of the corresponding gather, + otherwise threads are released in the same order. */ if ( ! 
KMP_MASTER_TID( tid )) { /* worker threads */ - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); @@ -1807,7 +1814,7 @@ __kmp_hyper_barrier_release( enum barrier_type bt, TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -1822,6 +1829,7 @@ __kmp_hyper_barrier_release( enum barrier_type bt, num_threads = this_thr -> th.th_team_nproc; other_threads = team -> t.t_threads; +#ifdef KMP_REVERSE_HYPER_BAR /* count up to correct level for parent */ for ( level = 0, offset = 1; offset < num_threads && (((tid >> level) & (branch_factor-1)) == 0); @@ -1831,7 +1839,14 @@ __kmp_hyper_barrier_release( enum barrier_type bt, for ( level -= branch_bits, offset >>= branch_bits; offset != 0; level -= branch_bits, offset >>= branch_bits ) +#else + /* Go down the tree, level by level */ + for ( level = 0, offset = 1; + offset < num_threads; + level += branch_bits, offset <<= branch_bits ) +#endif // KMP_REVERSE_HYPER_BAR { +#ifdef KMP_REVERSE_HYPER_BAR /* Now go in reverse order through the children, highest to lowest. Initial setting of child is conservative here. */ child = num_threads >> ((level==0)?level:level-1); @@ -1839,8 +1854,18 @@ __kmp_hyper_barrier_release( enum barrier_type bt, child_tid = tid + (child << level); child >= 1; child--, child_tid -= (1 << level) ) - { +#else + if (((tid >> level) & (branch_factor - 1)) != 0) + /* No need to go any lower than this, since this is the level + parent would be notified */ + break; + /* iterate through children on this level of the tree */ + for ( child = 1, child_tid = tid + (1 << level); + child < branch_factor && child_tid < num_threads; + child++, child_tid += (1 << level) ) +#endif // KMP_REVERSE_HYPER_BAR + { if ( child_tid >= num_threads ) continue; /* child doesn't exist so keep going */ else { register kmp_info_t *child_thr = other_threads[ child_tid ]; @@ -1848,7 +1873,11 @@ __kmp_hyper_barrier_release( enum barrier_type bt, #if KMP_CACHE_MANAGE register kmp_uint32 next_child_tid = child_tid - (1 << level); /* prefetch next thread's go count */ +#ifdef KMP_REVERSE_HYPER_BAR if ( child-1 >= 1 && next_child_tid < num_threads ) +#else + if ( child+1 < branch_factor && next_child_tid < num_threads ) +#endif // KMP_REVERSE_HYPER_BAR KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go ); #endif /* KMP_CACHE_MANAGE */ @@ -1880,154 +1909,6 @@ __kmp_hyper_barrier_release( enum barrier_type bt, gtid, team->t.t_id, tid, bt ) ); } -#else /* !KMP_REVERSE_HYPER_BAR */ - -static void -__kmp_hyper_barrier_release( enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, int propagate_icvs ) -{ - /* handle fork barrier workers who aren't part of a team yet */ - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads; - register kmp_uint32 num_threads; - register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 child; - register kmp_uint32 child_tid; - register kmp_uint32 offset; - register kmp_uint32 level; - - /* - * We now perform a hypercube-embedded tree release for all - * of the threads that have been 
gathered, but in the same order - * as the gather. - */ - - if ( ! KMP_MASTER_TID( tid )) { - /* worker threads */ - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); - - /* wait for parent thread to release us */ - __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE, NULL ); - -#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 ); - // cancel wait on previous parallel region... - __kmp_itt_task_starting( itt_sync_obj ); - - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - if ( itt_sync_obj != NULL ) - __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier - - } else -#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */ - // - // early exit for reaping threads releasing forkjoin barrier - // - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - // - // The worker thread may now assume that the team is valid. - // -#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY - // libguide only code (cannot use *itt_task* routines) - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls... - } -#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - tid = __kmp_tid_from_gtid( gtid ); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid, - &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); - - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - - } else { /* KMP_MASTER_TID(tid) */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - } - - /* Now set up team parameters since workers have been released */ - if ( team == NULL ) { - /* handle fork barrier workers who are now part of a team */ - tid = __kmp_tid_from_gtid( gtid ); - team = __kmp_threads[ gtid ]-> th.th_team; - } - num_threads = this_thr -> th.th_team_nproc; - other_threads = team -> t.t_threads; - - /* Go down the tree, level by level */ - for ( level = 0, offset = 1; - offset < num_threads; - level += branch_bits, offset <<= branch_bits ) - { - if (((tid >> level) & (branch_factor - 1)) != 0) - /* No need to go any lower than this, since this is the level - parent would be notified */ - break; - - /* iterate through children on this level of the tree */ - for ( child = 1, child_tid = tid + (1 << level); - child < branch_factor && child_tid < num_threads; - child++, child_tid += (1 << level) ) - { - register kmp_info_t *child_thr = other_threads[ child_tid ]; - register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb; -#if KMP_CACHE_MANAGE - { - register kmp_uint32 next_child_tid = child_tid + (1 << level); - /* prefetch next thread's go count */ - if ( child+1 < branch_factor && next_child_tid < num_threads ) - KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go ); - } -#endif /* KMP_CACHE_MANAGE */ - -#if KMP_BARRIER_ICV_PUSH - if ( propagate_icvs ) { - KMP_DEBUG_ASSERT( team != NULL ); - __kmp_init_implicit_task( team->t.t_ident, - team->t.t_threads[child_tid], team, child_tid, FALSE ); - load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs); - sync_icvs(); - } -#endif // KMP_BARRIER_ICV_PUSH - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing " - "T#%d(%d:%u) go(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, - child_tid, &child_bar -> b_go, child_bar -> b_go, - child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) ); - - /* release child from barrier */ - __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence ); - } - } - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} -#endif /* KMP_REVERSE_HYPER_BAR */ - - /* * Internal function to do a barrier. * If is_split is true, do a split barrier, otherwise, do a plain barrier @@ -2043,6 +1924,8 @@ __kmp_barrier( enum barrier_type bt, int gtid, int is_split, register kmp_team_t *team = this_thr -> th.th_team; register int status = 0; + ident_t * tmp_loc = __kmp_threads[ gtid ]->th.th_ident; + KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) ); @@ -2126,34 +2009,23 @@ __kmp_barrier( enum barrier_type bt, int gtid, int is_split, #endif /* OMP_30_ENABLED */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier - report frame end -#if USE_ITT_BUILD - // Collect information only if the file was opened succesfully. 
- if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file ) - { - ident_t * loc = this_thr->th.th_ident; - if (loc) { - // Use compiler-generated location to mark the frame: - // "<func>$omp$frame@[file:]<line>[:<col>]" - kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); - - kmp_uint64 fr_end; -#if defined( __GNUC__ ) -# if !defined( __INTEL_COMPILER ) - fr_end = __kmp_hardware_timestamp(); -# else - fr_end = __rdtsc(); -# endif -#else - fr_end = __rdtsc(); -#endif - K_DIAG( 3, ( "__kmp_barrier: T#%d(%d:%d) frame_begin = %llu, frame_end = %llu\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid, this_thr->th.th_frame_time, fr_end ) ); - - __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n", - str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time, fr_end ); - __kmp_str_loc_free( &str_loc ); - this_thr->th.th_frame_time = fr_end; + if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) { + kmp_uint64 tmp = __itt_get_timestamp(); + switch( __kmp_forkjoin_frames_mode ) { + case 1: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc ); + this_thr->th.th_frame_time = tmp; + break; + case 2: + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc ); + break; + case 3: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc ); + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc ); + this_thr->th.th_frame_time = tmp; + break; } } #endif /* USE_ITT_BUILD */ @@ -2465,7 +2337,7 @@ __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team, KMP_MB(); /* first, let's setup the master thread */ - master_th -> th.th_info .ds.ds_tid = 0; + master_th -> th.th_info.ds.ds_tid = 0; master_th -> th.th_team = team; master_th -> th.th_team_nproc = team -> t.t_nproc; master_th -> th.th_team_master = master_th; @@ -2514,6 +2386,17 @@ __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team, static void __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration +static void +__kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, +#if OMP_30_ENABLED + kmp_internal_control_t * new_icvs, + ident_t * loc +#else + int new_set_nproc, int new_set_dynamic, int new_set_nested, + int new_set_blocktime, int new_bt_intervals, int new_bt_set +#endif // OMP_30_ENABLED + ); // forward declaration + /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ int @@ -2527,7 +2410,7 @@ __kmp_fork_call( microtask_t microtask, launch_t invoker, /* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX va_list * ap #else va_list ap @@ -2576,7 +2459,6 @@ __kmp_fork_call( #endif - master_th->th.th_ident = loc; #if OMP_40_ENABLED @@ -2590,7 +2472,7 @@ __kmp_fork_call( argv = (void**)parent_team->t.t_argv; for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2686,11 +2568,11 @@ __kmp_fork_call( /* create a serialized parallel region? */ if ( nthreads == 1 ) { /* josh todo: hypothetical question: what do we do for OS X*? 
*/ -#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) +#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ) void * args[ argc ]; #else void * * args = (void**) alloca( argc * sizeof( void * ) ); -#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) */ +#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ) */ __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid )); @@ -2721,7 +2603,7 @@ __kmp_fork_call( if ( ap ) { for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ - #if KMP_ARCH_X86_64 && KMP_OS_LINUX + #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2741,7 +2623,7 @@ __kmp_fork_call( argv = args; for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ - #if KMP_ARCH_X86_64 && KMP_OS_LINUX + #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2957,7 +2839,7 @@ __kmp_fork_call( #endif /* OMP_40_ENABLED */ for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2977,6 +2859,18 @@ __kmp_fork_call( root -> r.r_active = TRUE; __kmp_fork_team_threads( root, team, master_th, gtid ); + __kmp_setup_icv_copy(team, nthreads +#if OMP_30_ENABLED + , &master_th->th.th_current_task->td_icvs, loc +#else + , parent_team->t.t_set_nproc[master_tid], + parent_team->t.t_set_dynamic[master_tid], + parent_team->t.t_set_nested[master_tid], + parent_team->t.t_set_blocktime[master_tid], + parent_team->t.t_set_bt_intervals[master_tid], + parent_team->t.t_set_bt_set[master_tid] +#endif /* OMP_30_ENABLED */ + ); __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); @@ -2992,23 +2886,12 @@ __kmp_fork_call( __kmp_itt_region_forking( gtid ); #endif /* USE_ITT_BUILD */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY && OMP_30_ENABLED // Internal fork - report frame begin -#if USE_ITT_BUILD - // Collect information only if the file was opened succesfully. - if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file ) + if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) { - kmp_uint64 fr_begin; -#if defined( __GNUC__ ) -# if !defined( __INTEL_COMPILER ) - fr_begin = __kmp_hardware_timestamp(); -# else - fr_begin = __rdtsc(); -# endif -#else - fr_begin = __rdtsc(); -#endif if( ! 
( team->t.t_active_level > 1 ) ) { - master_th->th.th_frame_time = fr_begin; + master_th->th.th_frame_time = __itt_get_timestamp(); } } #endif /* USE_ITT_BUILD */ @@ -3134,7 +3017,10 @@ __kmp_join_call(ident_t *loc, int gtid // Either not in teams or exiting teams region // (teams is a frame and no other frames inside the teams) # endif /* OMP_40_ENABLED */ + { + master_th->th.th_ident = loc; __kmp_itt_region_joined( gtid ); + } #endif /* USE_ITT_BUILD */ #if OMP_40_ENABLED @@ -4644,6 +4530,7 @@ __kmp_register_root( int initial_thread ) root -> r.r_root_team -> t.t_threads[0] = root_thread; root -> r.r_hot_team -> t.t_threads[0] = root_thread; root_thread -> th.th_serial_team -> t.t_threads[0] = root_thread; + root_thread -> th.th_serial_team -> t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). root -> r.r_uber_thread = root_thread; /* initialize the thread, get it ready to go */ @@ -5007,6 +4894,19 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) TCW_4( __kmp_init_monitor, 1 ); __kmp_create_monitor( & __kmp_monitor ); KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) ); + #if KMP_OS_WINDOWS + // AC: wait until monitor has started. This is a fix for CQ232808. + // The reason is that if the library is loaded/unloaded in a loop with small (parallel) + // work in between, then there is high probability that monitor thread started after + // the library shutdown. At shutdown it is too late to cope with the problem, because + // when the master is in DllMain (process detach) the monitor has no chances to start + // (it is blocked), and master has no means to inform the monitor that the library has gone, + // because all the memory which the monitor can access is going to be released/reset. + while ( TCR_4(__kmp_init_monitor) < 2 ) { + KMP_YIELD( TRUE ); + } + KF_TRACE( 10, ( "after monitor thread has started\n" ) ); + #endif } __kmp_release_bootstrap_lock( & __kmp_monitor_lock ); } @@ -5049,6 +4949,7 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) 0 ); } KMP_ASSERT ( serial_team ); + serial_team -> t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). serial_team -> t.t_threads[0] = new_thr; KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", new_thr ) ); @@ -5144,76 +5045,94 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ static void -__kmp_reinitialize_team( - kmp_team_t * team, - int new_nproc, - #if OMP_30_ENABLED - kmp_internal_control_t * new_icvs, - ident_t * loc - #else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set - #endif // OMP_30_ENABLED -) { - int f; - #if OMP_30_ENABLED - KMP_DEBUG_ASSERT( team && new_nproc && new_icvs ); - KMP_DEBUG_ASSERT( ( ! 
TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); - team->t.t_ident = loc; - #else - KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc ); - #endif // OMP_30_ENABLED +__kmp_reinitialize_team( kmp_team_t *team, +#if OMP_30_ENABLED + kmp_internal_control_t *new_icvs, ident_t *loc +#else + int new_set_nproc, int new_set_dynamic, int new_set_nested, + int new_set_blocktime, int new_bt_intervals, int new_bt_set +#endif + ) { + KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n", + team->t.t_threads[0], team ) ); +#if OMP_30_ENABLED + KMP_DEBUG_ASSERT( team && new_icvs); + KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); + team->t.t_ident = loc; +#else + KMP_DEBUG_ASSERT( team && new_set_nproc ); +#endif // OMP_30_ENABLED team->t.t_id = KMP_GEN_TEAM_ID(); -#if KMP_BARRIER_ICV_PULL - // - // Copy the ICV's to the team structure, where all of the worker threads - // can access them and make their own copies after the barrier. - // + // Copy ICVs to the master thread's implicit taskdata +#if OMP_30_ENABLED load_icvs(new_icvs); - store_icvs(&team->t.t_initial_icvs, new_icvs); - - // - // Set up the master thread's copy of the ICV's. __kmp_fork_call() - // assumes they are already set in the master thread. - // FIXME - change that code to use the team->t.t_initial_icvs copy - // and eliminate this copy. - // __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE ); store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); sync_icvs(); - KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team ) ); +# else + team -> t.t_set_nproc[0] = new_set_nproc; + team -> t.t_set_dynamic[0] = new_set_dynamic; + team -> t.t_set_nested[0] = new_set_nested; + team -> t.t_set_blocktime[0] = new_set_blocktime; + team -> t.t_set_bt_intervals[0] = new_bt_intervals; + team -> t.t_set_bt_set[0] = new_bt_set; +# endif // OMP_30_ENABLED -#elif KMP_BARRIER_ICV_PUSH - // - // Set the ICV's in the master thread only. - // They will be propagated by the fork barrier. - // - __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE ); + KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n", + team->t.t_threads[0], team ) ); +} + +static void +__kmp_setup_icv_copy(kmp_team_t * team, int new_nproc, +#if OMP_30_ENABLED + kmp_internal_control_t * new_icvs, + ident_t * loc +#else + int new_set_nproc, int new_set_dynamic, int new_set_nested, + int new_set_blocktime, int new_bt_intervals, int new_bt_set +#endif // OMP_30_ENABLED + ) +{ + int f; + +#if OMP_30_ENABLED + KMP_DEBUG_ASSERT( team && new_nproc && new_icvs ); + KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); +#else + KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc ); +#endif // OMP_30_ENABLED + + // Master thread's copy of the ICVs was set up on the implicit taskdata in __kmp_reinitialize_team. + // __kmp_fork_call() assumes the master thread's implicit task has this data before this function is called. +#if KMP_BARRIER_ICV_PULL + // Copy the ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where all of the + // worker threads can access them and make their own copies after the barrier. 
load_icvs(new_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // the threads arrays should be allocated at this point + store_icvs(&team->t.t_threads[0]->th.th_fixed_icvs, new_icvs); sync_icvs(); + KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); - KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team ) ); +#elif KMP_BARRIER_ICV_PUSH + // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. + KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); #else - // - // Copy the icvs to each of the threads. This takes O(nthreads) time. - // -#if OMP_30_ENABLED + // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. +# if OMP_30_ENABLED load_icvs(new_icvs); -#endif - for( f=0 ; f<new_nproc ; f++) { +# endif // OMP_30_ENABLED + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // the threads arrays should be allocated at this point + for(f=1 ; f<new_nproc ; f++) { // skip the master thread # if OMP_30_ENABLED // TODO: GEH - pass in better source location info since usually NULL here - KF_TRACE( 10, ( "__kmp_reinitialize_team1: T#%d this_thread=%p team=%p\n", + KF_TRACE( 10, ( "__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f, team->t.t_threads[f], team ) ); __kmp_init_implicit_task( loc, team->t.t_threads[f], team, f, FALSE ); store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); - KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n", + KF_TRACE( 10, ( "__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f, team->t.t_threads[f], team ) ); # else team -> t.t_set_nproc[f] = new_set_nproc; @@ -5226,9 +5145,8 @@ __kmp_reinitialize_team( } # if OMP_30_ENABLED sync_icvs(); -# endif -#endif // KMP_BARRIER_ICV_PUSH || KMP_BARRIER_ICV_PULL - +# endif // OMP_30_ENABLED +#endif // KMP_BARRIER_ICV_PULL } /* initialize the team data structure @@ -5246,6 +5164,8 @@ __kmp_initialize_team( int new_set_blocktime, int new_bt_intervals, int new_bt_set #endif // OMP_30_ENABLED ) { + KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) ); + /* verify */ KMP_DEBUG_ASSERT( team ); KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc ); @@ -5290,18 +5210,18 @@ __kmp_initialize_team( team -> t.t_control_stack_top = NULL; - __kmp_reinitialize_team( - team, new_nproc, - #if OMP_30_ENABLED - new_icvs, - loc - #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set - #endif // OMP_30_ENABLED - ); + __kmp_reinitialize_team( team, +#if OMP_30_ENABLED + new_icvs, loc +#else + new_set_nproc, new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); + KMP_MB(); + KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) ); } #if KMP_OS_LINUX @@ -5700,15 +5620,15 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, // TODO???: team -> t.t_max_active_levels = new_max_active_levels; team -> t.t_sched = new_icvs->sched; #endif - __kmp_reinitialize_team( team, new_nproc, + __kmp_reinitialize_team( team, #if OMP_30_ENABLED - new_icvs, - root->r.r_uber_thread->th.th_ident + new_icvs, root->r.r_uber_thread->th.th_ident #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + new_set_nproc, 
new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); + #if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { @@ -5768,15 +5688,14 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, if(team -> t.t_max_nproc < new_nproc) { /* reallocate larger arrays */ __kmp_reallocate_team_arrays(team, new_nproc); - __kmp_reinitialize_team( team, new_nproc, + __kmp_reinitialize_team( team, #if OMP_30_ENABLED - new_icvs, - NULL // TODO: !!! + new_icvs, NULL #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + new_set_nproc, new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); } #if KMP_OS_LINUX @@ -5859,8 +5778,8 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, # endif #endif - } - else { + } + else { KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" )); #if KMP_MIC // This case can mean that omp_set_num_threads() was called and the hot team size @@ -5877,15 +5796,14 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, team -> t.t_sched = new_icvs->sched; #endif - __kmp_reinitialize_team( team, new_nproc, + __kmp_reinitialize_team( team, #if OMP_30_ENABLED - new_icvs, - root->r.r_uber_thread->th.th_ident + new_icvs, root->r.r_uber_thread->th.th_ident #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + new_set_nproc, new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); #if OMP_30_ENABLED KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", @@ -6000,6 +5918,8 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, * up seems to really hurt performance a lot on the P4, so, let's not use * this... */ __kmp_allocate_team_arrays( team, max_nproc ); + + KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) ); __kmp_initialize_team( team, new_nproc, #if OMP_30_ENABLED new_icvs, @@ -6293,7 +6213,6 @@ __kmp_join_barrier( int gtid ) KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid )); - #if OMP_30_ENABLED if ( __kmp_tasking_mode == tskm_extra_barrier ) { __kmp_tasking_barrier( team, this_thr, gtid ); @@ -6329,25 +6248,6 @@ __kmp_join_barrier( int gtid ) #endif // OMP_30_ENABLED } - #if KMP_OS_WINDOWS - // AC: wait here until monitor has started. This is a fix for CQ232808. - // The reason is that if the library is loaded/unloaded in a loop with small (parallel) - // work in between, then there is high probability that monitor thread started after - // the library shutdown. At shutdown it is too late to cope with the problem, because - // when the master is in DllMain (process detach) the monitor has no chances to start - // (it is blocked), and master has no means to inform the monitor that the library has gone, - // because all the memory which the monitor can access is going to be released/reset. - // - // The moment before barrier_gather sounds appropriate, because master needs to - // wait for all workers anyway, and we want this to happen as late as possible, - // but before the shutdown which may happen after the barrier. 
- if( KMP_MASTER_TID( tid ) && TCR_4(__kmp_init_monitor) < 2 ) { - __kmp_wait_sleep( this_thr, (volatile kmp_uint32*)&__kmp_init_monitor, 2, 0 - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } - #endif - #if USE_ITT_BUILD if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) __kmp_itt_barrier_starting( gtid, itt_sync_obj ); @@ -6390,34 +6290,22 @@ __kmp_join_barrier( int gtid ) USE_ITT_BUILD_ARG( itt_sync_obj ) ); } +#if USE_ITT_BUILD && USE_ITT_NOTIFY // Join barrier - report frame end -#if USE_ITT_BUILD - // Collect information only if the file was opened successfully. - if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file ) - { - ident_t * loc = this_thr->th.th_ident; - if (loc) { - // Use compiler-generated location to mark the frame: - // "<func>$omp$frame@[file:]<line>[:<col>]" - kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); - - kmp_uint64 fr_end; -#if defined( __GNUC__ ) -# if !defined( __INTEL_COMPILER ) - fr_end = __kmp_hardware_timestamp(); -# else - fr_end = __rdtsc(); -# endif -#else - fr_end = __rdtsc(); -#endif - K_DIAG( 3, ( "__kmp_join_barrier: T#%d(%d:%d) frame_begin = %llu, frame_end = %llu\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid, this_thr->th.th_frame_time, fr_end ) ); - - __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n", - str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time, fr_end ); - - __kmp_str_loc_free( &str_loc ); + if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) { + kmp_uint64 tmp = __itt_get_timestamp(); + ident_t * loc = team->t.t_ident; + switch( __kmp_forkjoin_frames_mode ) { + case 1: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, loc ); + break; + case 2: + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, loc ); + break; + case 3: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, loc ); + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, loc ); + break; } } #endif /* USE_ITT_BUILD */ @@ -6571,20 +6459,16 @@ __kmp_fork_barrier( int gtid, int tid ) #if OMP_30_ENABLED # if KMP_BARRIER_ICV_PULL - // - // FIXME - after __kmp_fork_call() is modified to not look at the - // master thread's implicit task ICV's, remove the ! KMP_MASTER_TID - // restriction from this if condition. - // - if (! KMP_MASTER_TID( tid ) ) { - // - // Copy the initial ICV's from the team struct to the implicit task - // for this tid. - // - __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid], - team, tid, FALSE ); - load_icvs(&team->t.t_initial_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &team->t.t_initial_icvs); + // Master thread's copy of the ICVs was set up on the implicit taskdata in __kmp_reinitialize_team. + // __kmp_fork_call() assumes the master thread's implicit task has this data before this function is called. + // We cannot modify __kmp_fork_call() to look at the fixed ICVs in the master's thread struct, because it is + // not always the case that the threads arrays have been allocated when __kmp_fork_call() is executed. + if (! KMP_MASTER_TID( tid ) ) { // master thread already has ICVs + // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. 
+ KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid )); + load_icvs(&team->t.t_threads[0]->th.th_fixed_icvs); + __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE ); + store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &team->t.t_threads[0]->th.th_fixed_icvs); sync_icvs(); } # endif // KMP_BARRIER_ICV_PULL @@ -6716,13 +6600,13 @@ __kmp_launch_thread( kmp_info_t *this_thr ) void __kmp_internal_end_dest( void *specific_gtid ) { - #ifdef __INTEL_COMPILER + #if KMP_COMPILER_ICC #pragma warning( push ) #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits #endif // Make sure no significant bits are lost int gtid = (kmp_intptr_t)specific_gtid - 1; - #ifdef __INTEL_COMPILER + #if KMP_COMPILER_ICC #pragma warning( pop ) #endif @@ -7503,7 +7387,6 @@ __kmp_do_serial_initialize( void ) __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; } __kmp_max_nth = __kmp_sys_max_nth; - __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub ); // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; @@ -7572,18 +7455,17 @@ __kmp_do_serial_initialize( void ) if ( __kmp_str_match_true( val ) ) { kmp_str_buf_t buffer; __kmp_str_buf_init( & buffer ); - __kmp_i18n_dump_catalog( buffer ); + __kmp_i18n_dump_catalog( & buffer ); __kmp_printf( "%s", buffer.str ); __kmp_str_buf_free( & buffer ); }; // if __kmp_env_free( & val ); #endif + __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub ); // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); - // omalyshe: This initialisation beats env var setting. - //__kmp_load_balance_interval = 1.0; // If the library is shut down properly, both pools must be NULL. Just in case, set them // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed. 
@@ -7876,38 +7758,6 @@ __kmp_parallel_initialize( void ) __kmp_print_version_2(); } -#if USE_ITT_BUILD - // Create CSV file to report frames - if( __kmp_forkjoin_frames_mode == 1 ) - { - // Open CSV file to write itt frame information - const char * csv_file; -/* Internal AXE variables - char * host_name = __kmp_env_get("INTEL_MRTE_HOST_NAME"); - char * out_dir = __kmp_env_get("INTEL_MRTE_DATA_DIR");*/ - char * host_name = __kmp_env_get("AMPLXE_HOSTNAME"); - char * out_dir = __kmp_env_get("AMPLXE_DATA_DIR"); - - if( out_dir && host_name ) { - csv_file = __kmp_str_format( "%s/omp-frames-hostname-%s.csv", out_dir, host_name ); - __kmp_itt_csv_file = fopen( csv_file, "w" ); - __kmp_str_free( &csv_file ); - } else { -#ifdef KMP_DEBUG - // Create CSV file in the current dir - csv_file = __kmp_str_format( "./omp-frames-hostname-xxx.csv" ); - __kmp_itt_csv_file = fopen( csv_file, "w" ); - __kmp_str_free( &csv_file ); -#endif - } - if( __kmp_itt_csv_file ) { - __kmp_str_buf_init( & __kmp_itt_frame_buffer ); - __kmp_str_buf_print( & __kmp_itt_frame_buffer, "name,start_tsc.TSC,end_tsc,pid,tid\n" ); - } - } - -#endif /* USE_ITT_BUILD */ - /* we have finished parallel initialization */ TCW_SYNC_4(__kmp_init_parallel, TRUE); @@ -8347,16 +8197,6 @@ __kmp_cleanup( void ) __kmp_i18n_catclose(); -#if USE_ITT_BUILD - // Close CSV file for frames - if( __kmp_forkjoin_frames_mode && __kmp_itt_csv_file ) { - fprintf( __kmp_itt_csv_file, __kmp_itt_frame_buffer.str ); - - __kmp_str_buf_free( & __kmp_itt_frame_buffer ); - fclose( __kmp_itt_csv_file ); - } -#endif /* USE_ITT_BUILD */ - KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) ); } @@ -8576,14 +8416,6 @@ __kmp_aux_set_defaults( * internal fast reduction routines */ -// implementation rev. 0.4 -// AT: determine CPU, and always use 'critical method' if non-Intel -// AT: test loc != NULL -// AT: what to return if lck == NULL -// AT: tune the cut-off point for atomic reduce method -// AT: tune what to return depending on the CPU and platform configuration -// AT: tune what to return depending on team size -// AT: move this function out to kmp_csupport.c PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), @@ -8641,22 +8473,10 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, #error "Unknown or unsupported OS" #endif // KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_DARWIN - #elif KMP_ARCH_X86 + #elif KMP_ARCH_X86 || KMP_ARCH_ARM #if KMP_OS_LINUX || KMP_OS_WINDOWS - // similar to win_32 - // 4x1x2 fxqlin04, the 'linear,linear' barrier - - // similar to lin_32 - // 4x1x2 fxqwin04, the 'linear,linear' barrier - - // actual measurement shows that the critical section method is better if team_size <= 8; - // what happenes when team_size > 8 ? 
( no machine to test ) - - // TO DO: need to run a 32-bit code on Intel(R) 64 - // TO DO: test the 'hyper,hyper,1,1' barrier - // basic tuning if( atomic_available ) { @@ -8667,7 +8487,6 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, #elif KMP_OS_DARWIN - if( atomic_available && ( num_vars <= 3 ) ) { retval = atomic_reduce_block; } else if( tree_available ) { @@ -8686,18 +8505,6 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, } - //AT: TO DO: critical block method not implemented by PAROPT - //if( retval == __kmp_critical_reduce_block ) { - // if( lck == NULL ) { // critical block method not implemented by PAROPT - // } - //} - - // tune what to return depending on the CPU and platform configuration - // (sometimes tree method is slower than critical) - - // probably tune what to return depending on team size - - // KMP_FORCE_REDUCTION if( __kmp_force_reduction_method != reduction_method_not_defined ) { |