path: root/openmp/runtime/src/kmp_csupport.c
author     Jim Cownie <james.h.cownie@intel.com>  2014-10-07 16:25:50 +0000
committer  Jim Cownie <james.h.cownie@intel.com>  2014-10-07 16:25:50 +0000
commit     4cc4bb4c60c786a253176106065e59d639bbc1a9 (patch)
tree       dfe5a4f9e4591505dd8a7d333b5b9201c50b52e3 /openmp/runtime/src/kmp_csupport.c
parent     f72fa67fc35622a8cd18a2fffd979fb225d82400 (diff)
download   bcm5719-llvm-4cc4bb4c60c786a253176106065e59d639bbc1a9.tar.gz
           bcm5719-llvm-4cc4bb4c60c786a253176106065e59d639bbc1a9.zip
I apologise in advance for the size of this check-in. At Intel we do
understand that this is not friendly, and are working to change our
internal code development to make it easier to make development features
available more frequently and in finer (more functional) chunks.
Unfortunately we haven't got that in place yet, and unpicking this into
multiple separate check-ins would be non-trivial, so please bear with me
on this one. We should be better in the future.

Apologies over, what do we have here?

GCC 4.9 compatibility
---------------------
* We have implemented the new entrypoints used by code compiled by GCC 4.9
  that provide the same functionality as in GCC 4.8. Therefore code
  compiled with GCC 4.9 that used to work will continue to do so. However,
  there are some other new entrypoints (associated with task cancellation)
  which are not implemented, so user code compiled by GCC 4.9 that uses
  these new features will not link against the LLVM runtime. (It remains
  unclear how to handle those entrypoints, since the GCC interface has
  potentially unpleasant performance implications for join barriers even
  when cancellation is not used.)

--- new parallel entry points ---
These new entry points are not OpenMP 4.0 related and are implemented
fully:
    GOMP_parallel_loop_dynamic()
    GOMP_parallel_loop_guided()
    GOMP_parallel_loop_runtime()
    GOMP_parallel_loop_static()
    GOMP_parallel_sections()
    GOMP_parallel()

--- cancellation entry points ---
Currently these only give a runtime error if OMP_CANCELLATION is true,
because our plain barriers don't check for cancellation while waiting:
    GOMP_barrier_cancel()
    GOMP_cancel()
    GOMP_cancellation_point()
    GOMP_loop_end_cancel()
    GOMP_sections_end_cancel()

--- taskgroup entry points ---
These are implemented fully:
    GOMP_taskgroup_start()
    GOMP_taskgroup_end()

--- target entry points ---
These are empty (as they are in libgomp):
    GOMP_target()
    GOMP_target_data()
    GOMP_target_end_data()
    GOMP_target_update()
    GOMP_teams()

Improvements in Barriers and Fork/Join
--------------------------------------
* Barrier and fork/join code is now in its own file (which makes it easier
  to understand and modify).
* Wait/release code is now templated and in its own file; suspend/resume
  code is also templated.
* There is a new, hierarchical barrier which exploits the cache hierarchy
  of the Intel(r) Xeon Phi(tm) coprocessor to improve fork/join and
  barrier performance.

***BEWARE*** the new source files have *not* been added to the legacy
CMake build system. If you want to use that, fixes will be required.

Statistics Collection Code
--------------------------
* New code has been added to collect application statistics (if this is
  enabled at library compile time; by default it is not). The statistics
  code itself is generally useful; the lightweight timing code uses the
  x86 rdtsc instruction, so it will require changes for other
  architectures. The intent of this code is not for users to tune their
  codes, but rather
    1) for timing code paths inside the runtime, and
    2) for gathering general properties of OpenMP codes, to focus
       attention on which OpenMP features are most used.

Nested Hot Teams
----------------
* The runtime now maintains more state to reduce the overhead of creating
  and destroying inner parallel teams. This improves the performance of
  code that repeatedly uses nested parallelism with the same resource
  allocation. Set the new KMP_HOT_TEAMS_MAX_LEVEL environment variable to
  a depth to enable this (and, of course, OMP_NESTED=true to enable nested
  parallelism at all).
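To make the hot-teams setting concrete, here is a minimal usage sketch.
Only OMP_NESTED and KMP_HOT_TEAMS_MAX_LEVEL come from this check-in; the
C program, the file name nested.c, and the run command are illustrative
assumptions, not part of the change.

    /* nested.c -- illustrative nested-parallelism example (not part of
     * this check-in). Build with any OpenMP compiler, e.g.
     *   clang -fopenmp nested.c -o nested */
    #include <omp.h>
    #include <stdio.h>

    int main(void)
    {
        /* The inner parallel regions are entered on every outer iteration;
         * with hot teams enabled the runtime keeps the inner teams alive
         * instead of creating and destroying them each time. */
        for (int iter = 0; iter < 1000; ++iter) {
            #pragma omp parallel num_threads(4)
            {
                #pragma omp parallel num_threads(2)
                {
                    if (iter == 0 && omp_get_thread_num() == 0)
                        printf("outer thread %d started an inner team\n",
                               omp_get_ancestor_thread_num(1));
                }
            }
        }
        return 0;
    }

    /* Run with nesting enabled and teams kept hot down to depth 2:
     *   OMP_NESTED=true KMP_HOT_TEAMS_MAX_LEVEL=2 ./nested
     */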
Improved Intel(r) VTune(tm) Amplifier support
---------------------------------------------
* The runtime provides additional information to VTune via the itt_notify
  interface, allowing it to display better OpenMP-specific analyses of
  load imbalance.

Support for OpenMP Composite Statements
---------------------------------------
* Implement new entrypoints required by some of the OpenMP 4.1 composite
  statements.

Improved ifdefs
---------------
* More separation of concepts ("Does this platform do X?") from platforms
  ("Are we compiling for platform Y?"), which should simplify future
  porting. (A small sketch of the idea follows at the end of this
  message.)

ScaleMP* contribution
---------------------
* Stack padding to improve performance in their environment, where
  cross-node coherency is managed at the page level.

Redesign of wait and release code
---------------------------------
* The code is simplified and its performance improved.

Bug Fixes
---------
* Fixes for Windows multiple processor groups.
* Fix Fortran module build on Linux: offload attribute added.
* Fix entry names for the distribute-parallel-loop construct to be
  consistent with the compiler codegen.
* Fix an inconsistent error message for the KMP_PLACE_THREADS environment
  variable.

llvm-svn: 219214
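As a concrete illustration of the "Improved ifdefs" point above, here is
a small sketch of separating a capability question from a platform
question. KMP_ARCH_X86, KMP_ARCH_X86_64, _mm_mfence() and
__sync_synchronize() appear in the diff below; the macro name
KMP_HAVE_MFENCE and the helper kmp_flush_fence() are illustrative
assumptions, not the runtime's actual spellings.

    /* Decide the capability once, from the platform macros
     * (KMP_HAVE_MFENCE is a hypothetical name used only for this sketch). */
    #if KMP_ARCH_X86 || KMP_ARCH_X86_64
    # include <emmintrin.h>     /* for _mm_mfence() */
    # define KMP_HAVE_MFENCE 1
    #else
    # define KMP_HAVE_MFENCE 0
    #endif

    /* Elsewhere the code asks "does this platform do X?" rather than
     * "are we compiling for platform Y?". */
    static inline void kmp_flush_fence(void)
    {
    #if KMP_HAVE_MFENCE
        _mm_mfence();           /* SSE2 full memory fence */
    #else
        __sync_synchronize();   /* GCC builtin full barrier as a fallback */
    #endif
    }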
Diffstat (limited to 'openmp/runtime/src/kmp_csupport.c')
-rw-r--r--  openmp/runtime/src/kmp_csupport.c  554
1 file changed, 173 insertions, 381 deletions
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c
index 780bd41cb78..af5c6144c2d 100644
--- a/openmp/runtime/src/kmp_csupport.c
+++ b/openmp/runtime/src/kmp_csupport.c
@@ -1,7 +1,7 @@
/*
* kmp_csupport.c -- kfront linkage support for OpenMP.
- * $Revision: 42826 $
- * $Date: 2013-11-20 03:39:45 -0600 (Wed, 20 Nov 2013) $
+ * $Revision: 43473 $
+ * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
*/
@@ -20,6 +20,7 @@
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_error.h"
+#include "kmp_stats.h"
#define MAX_MESSAGE 512
@@ -35,7 +36,7 @@
* @param flags in for future use (currently ignored)
*
* Initialize the runtime library. This call is optional; if it is not made then
- * it will be implicilty called by attempts to use other library functions.
+ * it will be implicitly called by attempts to use other library functions.
*
*/
void
@@ -276,13 +277,18 @@ Do the actual fork and call the microtask in the relevant number of threads.
void
__kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
{
+ KMP_STOP_EXPLICIT_TIMER(OMP_serial);
+ KMP_COUNT_BLOCK(OMP_PARALLEL);
int gtid = __kmp_entry_gtid();
// maybe to save thr_state is enough here
{
va_list ap;
va_start( ap, microtask );
- __kmp_fork_call( loc, gtid, TRUE,
+#if INCLUDE_SSC_MARKS
+ SSC_MARK_FORKING();
+#endif
+ __kmp_fork_call( loc, gtid, fork_context_intel,
argc,
VOLATILE_CAST(microtask_t) microtask,
VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
@@ -293,10 +299,14 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
ap
#endif
);
+#if INCLUDE_SSC_MARKS
+ SSC_MARK_JOINING();
+#endif
__kmp_join_call( loc, gtid );
va_end( ap );
}
+ KMP_START_EXPLICIT_TIMER(OMP_serial);
}
#if OMP_40_ENABLED
@@ -337,17 +347,18 @@ __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
va_start( ap, microtask );
// remember teams entry point and nesting level
- this_thr->th.th_team_microtask = microtask;
+ this_thr->th.th_teams_microtask = microtask;
this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
// check if __kmpc_push_num_teams called, set default number of teams otherwise
- if ( this_thr->th.th_set_nth_teams == 0 ) {
+ if ( this_thr->th.th_teams_size.nteams == 0 ) {
__kmp_push_num_teams( loc, gtid, 0, 0 );
}
KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
- KMP_DEBUG_ASSERT(this_thr->th.th_set_nth_teams >= 1);
+ KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
+ KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
- __kmp_fork_call( loc, gtid, TRUE,
+ __kmp_fork_call( loc, gtid, fork_context_intel,
argc,
VOLATILE_CAST(microtask_t) __kmp_teams_master,
VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
@@ -358,9 +369,9 @@ __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
#endif
);
__kmp_join_call( loc, gtid );
- this_thr->th.th_team_microtask = NULL;
+ this_thr->th.th_teams_microtask = NULL;
this_thr->th.th_teams_level = 0;
-
+ *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
va_end( ap );
}
#endif /* OMP_40_ENABLED */
@@ -393,252 +404,9 @@ when the condition is false.
void
__kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
{
- kmp_info_t *this_thr;
- kmp_team_t *serial_team;
-
- KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
-
- /* Skip all this code for autopar serialized loops since it results in
- unacceptable overhead */
- if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
- return;
-
- if( ! TCR_4( __kmp_init_parallel ) )
- __kmp_parallel_initialize();
-
- this_thr = __kmp_threads[ global_tid ];
- serial_team = this_thr -> th.th_serial_team;
-
- /* utilize the serialized team held by this thread */
- KMP_DEBUG_ASSERT( serial_team );
- KMP_MB();
-
-#if OMP_30_ENABLED
- if ( __kmp_tasking_mode != tskm_immediate_exec ) {
- KMP_DEBUG_ASSERT( this_thr -> th.th_task_team == this_thr -> th.th_team -> t.t_task_team );
- KMP_DEBUG_ASSERT( serial_team -> t.t_task_team == NULL );
- KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
- global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
- this_thr -> th.th_task_team = NULL;
- }
-#endif // OMP_30_ENABLED
-
-#if OMP_40_ENABLED
- kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
- if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
- proc_bind = proc_bind_false;
- }
- else if ( proc_bind == proc_bind_default ) {
- //
- // No proc_bind clause was specified, so use the current value
- // of proc-bind-var for this parallel region.
- //
- proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
- }
- //
- // Reset for next parallel region
- //
- this_thr->th.th_set_proc_bind = proc_bind_default;
-#endif /* OMP_3_ENABLED */
-
- if( this_thr -> th.th_team != serial_team ) {
-#if OMP_30_ENABLED
- // Nested level will be an index in the nested nthreads array
- int level = this_thr->th.th_team->t.t_level;
-#endif
- if( serial_team -> t.t_serialized ) {
- /* this serial team was already used
- * TODO increase performance by making this locks more specific */
- kmp_team_t *new_team;
- int tid = this_thr->th.th_info.ds.ds_tid;
-
- __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
-
- new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
-#if OMP_40_ENABLED
- proc_bind,
-#endif
-#if OMP_30_ENABLED
- & this_thr->th.th_current_task->td_icvs,
-#else
- this_thr->th.th_team->t.t_set_nproc[tid],
- this_thr->th.th_team->t.t_set_dynamic[tid],
- this_thr->th.th_team->t.t_set_nested[tid],
- this_thr->th.th_team->t.t_set_blocktime[tid],
- this_thr->th.th_team->t.t_set_bt_intervals[tid],
- this_thr->th.th_team->t.t_set_bt_set[tid],
-#endif // OMP_30_ENABLED
- 0);
- __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
- KMP_ASSERT( new_team );
-
- /* setup new serialized team and install it */
- new_team -> t.t_threads[0] = this_thr;
- new_team -> t.t_parent = this_thr -> th.th_team;
- serial_team = new_team;
- this_thr -> th.th_serial_team = serial_team;
-
- KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
- global_tid, serial_team ) );
-
-
- /* TODO the above breaks the requirement that if we run out of
- * resources, then we can still guarantee that serialized teams
- * are ok, since we may need to allocate a new one */
- } else {
- KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
- global_tid, serial_team ) );
- }
-
- /* we have to initialize this serial team */
- KMP_DEBUG_ASSERT( serial_team->t.t_threads );
- KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
- KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
- serial_team -> t.t_ident = loc;
- serial_team -> t.t_serialized = 1;
- serial_team -> t.t_nproc = 1;
- serial_team -> t.t_parent = this_thr->th.th_team;
-#if OMP_30_ENABLED
- serial_team -> t.t_sched = this_thr->th.th_team->t.t_sched;
-#endif // OMP_30_ENABLED
- this_thr -> th.th_team = serial_team;
- serial_team -> t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
-
-#if OMP_30_ENABLED
- KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#d curtask=%p\n",
- global_tid, this_thr->th.th_current_task ) );
- KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
- this_thr->th.th_current_task->td_flags.executing = 0;
-
- __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
-
- /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
- each serialized task represented by team->t.t_serialized? */
- copy_icvs(
- & this_thr->th.th_current_task->td_icvs,
- & this_thr->th.th_current_task->td_parent->td_icvs );
-
- // Thread value exists in the nested nthreads array for the next nested level
- if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
- this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
- }
-
-#if OMP_40_ENABLED
- if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
- this_thr->th.th_current_task->td_icvs.proc_bind
- = __kmp_nested_proc_bind.bind_types[ level + 1 ];
- }
-#endif /* OMP_40_ENABLED */
-
-#else /* pre-3.0 icv's */
- serial_team -> t.t_set_nproc[0] = serial_team->t.t_parent->
- t.t_set_nproc[serial_team->
- t.t_master_tid];
- serial_team -> t.t_set_dynamic[0] = serial_team->t.t_parent->
- t.t_set_dynamic[serial_team->
- t.t_master_tid];
- serial_team -> t.t_set_nested[0] = serial_team->t.t_parent->
- t.t_set_nested[serial_team->
- t.t_master_tid];
- serial_team -> t.t_set_blocktime[0] = serial_team->t.t_parent->
- t.t_set_blocktime[serial_team->
- t.t_master_tid];
- serial_team -> t.t_set_bt_intervals[0] = serial_team->t.t_parent->
- t.t_set_bt_intervals[serial_team->
- t.t_master_tid];
- serial_team -> t.t_set_bt_set[0] = serial_team->t.t_parent->
- t.t_set_bt_set[serial_team->
- t.t_master_tid];
-#endif // OMP_30_ENABLED
- this_thr -> th.th_info.ds.ds_tid = 0;
-
- /* set thread cache values */
- this_thr -> th.th_team_nproc = 1;
- this_thr -> th.th_team_master = this_thr;
- this_thr -> th.th_team_serialized = 1;
-
-#if OMP_30_ENABLED
- serial_team -> t.t_level = serial_team -> t.t_parent -> t.t_level + 1;
- serial_team -> t.t_active_level = serial_team -> t.t_parent -> t.t_active_level;
-#endif // OMP_30_ENABLED
-
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
- if ( __kmp_inherit_fp_control ) {
- __kmp_store_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
- __kmp_store_mxcsr( &serial_team->t.t_mxcsr );
- serial_team->t.t_mxcsr &= KMP_X86_MXCSR_MASK;
- serial_team->t.t_fp_control_saved = TRUE;
- } else {
- serial_team->t.t_fp_control_saved = FALSE;
- }
-#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
- /* check if we need to allocate dispatch buffers stack */
- KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
- if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
- serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
- __kmp_allocate( sizeof( dispatch_private_info_t ) );
- }
- this_thr -> th.th_dispatch = serial_team->t.t_dispatch;
-
- KMP_MB();
-
- } else {
- /* this serialized team is already being used,
- * that's fine, just add another nested level */
- KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
- KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
- KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
- ++ serial_team -> t.t_serialized;
- this_thr -> th.th_team_serialized = serial_team -> t.t_serialized;
-
-#if OMP_30_ENABLED
- // Nested level will be an index in the nested nthreads array
- int level = this_thr->th.th_team->t.t_level;
- // Thread value exists in the nested nthreads array for the next nested level
- if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
- this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
- }
- serial_team -> t.t_level++;
- KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
- global_tid, serial_team, serial_team -> t.t_level ) );
-#else
- KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing team %p for nested serialized parallel region\n",
- global_tid, serial_team ) );
-#endif // OMP_30_ENABLED
-
- /* allocate/push dispatch buffers stack */
- KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
- {
- dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
- __kmp_allocate( sizeof( dispatch_private_info_t ) );
- disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
- serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
- }
- this_thr -> th.th_dispatch = serial_team->t.t_dispatch;
-
- KMP_MB();
- }
-
- if ( __kmp_env_consistency_check )
- __kmp_push_parallel( global_tid, NULL );
-
-// t_level is not available in 2.5 build, so check for OMP_30_ENABLED
-#if USE_ITT_BUILD && OMP_30_ENABLED
- // Mark the start of the "parallel" region for VTune. Only use one of frame notification scheme at the moment.
- if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
- {
- __kmp_itt_region_forking( global_tid, 1 );
- }
- if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr )
- {
-#if USE_ITT_NOTIFY
- if( this_thr->th.th_team->t.t_level == 1 ) {
- this_thr->th.th_frame_time_serialized = __itt_get_timestamp();
- }
-#endif
- }
-#endif /* USE_ITT_BUILD */
-
+ __kmp_serialized_parallel(loc, global_tid); /* The implementation is now in kmp_runtime.c so that it can share static functions with
+ * kmp_fork_call since the tasks to be done are similar in each case.
+ */
}
/*!
@@ -680,26 +448,13 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
/* If necessary, pop the internal control stack values and replace the team values */
top = serial_team -> t.t_control_stack_top;
if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
-#if OMP_30_ENABLED
- copy_icvs(
- &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs,
- top );
-#else
- serial_team -> t.t_set_nproc[0] = top -> nproc;
- serial_team -> t.t_set_dynamic[0] = top -> dynamic;
- serial_team -> t.t_set_nested[0] = top -> nested;
- serial_team -> t.t_set_blocktime[0] = top -> blocktime;
- serial_team -> t.t_set_bt_intervals[0] = top -> bt_intervals;
- serial_team -> t.t_set_bt_set[0] = top -> bt_set;
-#endif // OMP_30_ENABLED
+ copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
serial_team -> t.t_control_stack_top = top -> next;
__kmp_free(top);
}
-#if OMP_30_ENABLED
//if( serial_team -> t.t_serialized > 1 )
serial_team -> t.t_level--;
-#endif // OMP_30_ENABLED
/* pop dispatch buffers stack */
KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
@@ -735,7 +490,6 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
t.t_dispatch[ serial_team -> t.t_master_tid ];
-#if OMP_30_ENABLED
__kmp_pop_current_task_from_thread( this_thr );
KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
@@ -752,32 +506,37 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
}
-#endif // OMP_30_ENABLED
-
- }
- else {
-
-#if OMP_30_ENABLED
+ } else {
if ( __kmp_tasking_mode != tskm_immediate_exec ) {
KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
global_tid, serial_team, serial_team -> t.t_serialized ) );
}
-#endif // OMP_30_ENABLED
-
}
-// t_level is not available in 2.5 build, so check for OMP_30_ENABLED
-#if USE_ITT_BUILD && OMP_30_ENABLED
+#if USE_ITT_BUILD
+ kmp_uint64 cur_time = 0;
+#if USE_ITT_NOTIFY
+ if( __itt_get_timestamp_ptr ) {
+ cur_time = __itt_get_timestamp();
+ }
+#endif /* USE_ITT_NOTIFY */
+ // Report the barrier
+ if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr ) {
+ if( this_thr->th.th_team->t.t_level == 0 ) {
+ __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, cur_time, 0, loc, this_thr->th.th_team_nproc, 0 );
+ }
+ }
// Mark the end of the "parallel" region for VTune. Only use one of frame notification scheme at the moment.
if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
{
this_thr->th.th_ident = loc;
__kmp_itt_region_joined( global_tid, 1 );
}
- if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr ) {
- if( this_thr->th.th_team->t.t_level == 0 ) {
- __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, __itt_timestamp_none, 0, loc );
- }
+ if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG )
+ {
+ this_thr->th.th_ident = loc;
+ // Since barrier frame for serialized region is equal to the region we use the same begin timestamp as for the barrier.
+ __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time, cur_time, 0, loc, this_thr->th.th_team_nproc, 2 );
}
#endif /* USE_ITT_BUILD */
@@ -805,55 +564,50 @@ __kmpc_flush(ident_t *loc, ...)
/* need explicit __mf() here since use volatile instead in library */
KMP_MB(); /* Flush all pending memory write invalidates. */
- // This is not an OMP 3.0 feature.
- // This macro is used here just not to let the change go to 10.1.
- // This change will go to the mainline first.
- #if OMP_30_ENABLED
- #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
- #if KMP_MIC
- // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
- // We shouldn't need it, though, since the ABI rules require that
- // * If the compiler generates NGO stores it also generates the fence
- // * If users hand-code NGO stores they should insert the fence
- // therefore no incomplete unordered stores should be visible.
- #else
- // C74404
- // This is to address non-temporal store instructions (sfence needed).
- // The clflush instruction is addressed either (mfence needed).
- // Probably the non-temporal load monvtdqa instruction should also be addressed.
- // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
- if ( ! __kmp_cpuinfo.initialized ) {
- __kmp_query_cpuid( & __kmp_cpuinfo );
- }; // if
- if ( ! __kmp_cpuinfo.sse2 ) {
- // CPU cannot execute SSE2 instructions.
- } else {
- #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC
- _mm_mfence();
- #else
- __sync_synchronize();
- #endif // KMP_COMPILER_ICC
- }; // if
- #endif // KMP_MIC
- #elif KMP_ARCH_ARM
- // Nothing yet
- #elif KMP_ARCH_PPC64
- // Nothing needed here (we have a real MB above).
- #if KMP_OS_CNK
- // The flushing thread needs to yield here; this prevents a
- // busy-waiting thread from saturating the pipeline. flush is
- // often used in loops like this:
- // while (!flag) {
- // #pragma omp flush(flag)
- // }
- // and adding the yield here is good for at least a 10x speedup
- // when running >2 threads per core (on the NAS LU benchmark).
- __kmp_yield(TRUE);
- #endif
+ #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
+ #if KMP_MIC
+ // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
+ // We shouldn't need it, though, since the ABI rules require that
+ // * If the compiler generates NGO stores it also generates the fence
+ // * If users hand-code NGO stores they should insert the fence
+ // therefore no incomplete unordered stores should be visible.
#else
- #error Unknown or unsupported architecture
+ // C74404
+ // This is to address non-temporal store instructions (sfence needed).
+ // The clflush instruction is addressed either (mfence needed).
+ // Probably the non-temporal load monvtdqa instruction should also be addressed.
+ // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
+ if ( ! __kmp_cpuinfo.initialized ) {
+ __kmp_query_cpuid( & __kmp_cpuinfo );
+ }; // if
+ if ( ! __kmp_cpuinfo.sse2 ) {
+ // CPU cannot execute SSE2 instructions.
+ } else {
+ #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC
+ _mm_mfence();
+ #else
+ __sync_synchronize();
+ #endif // KMP_COMPILER_ICC
+ }; // if
+ #endif // KMP_MIC
+ #elif KMP_ARCH_ARM
+ // Nothing yet
+ #elif KMP_ARCH_PPC64
+ // Nothing needed here (we have a real MB above).
+ #if KMP_OS_CNK
+ // The flushing thread needs to yield here; this prevents a
+ // busy-waiting thread from saturating the pipeline. flush is
+ // often used in loops like this:
+ // while (!flag) {
+ // #pragma omp flush(flag)
+ // }
+ // and adding the yield here is good for at least a 10x speedup
+ // when running >2 threads per core (on the NAS LU benchmark).
+ __kmp_yield(TRUE);
#endif
- #endif // OMP_30_ENABLED
+ #else
+ #error Unknown or unsupported architecture
+ #endif
}
@@ -871,6 +625,8 @@ Execute a barrier.
void
__kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
{
+ KMP_COUNT_BLOCK(OMP_BARRIER);
+ KMP_TIME_BLOCK(OMP_barrier);
int explicit_barrier_flag;
KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
@@ -906,6 +662,7 @@ __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
kmp_int32
__kmpc_master(ident_t *loc, kmp_int32 global_tid)
{
+ KMP_COUNT_BLOCK(OMP_MASTER);
int status = 0;
KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
@@ -1014,11 +771,6 @@ __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
__kmp_parallel_dxo( & gtid, & cid, loc );
}
-inline void
-__kmp_static_yield( int arg ) { // AC: needed in macro __kmp_acquire_user_lock_with_checks
- __kmp_yield( arg );
-}
-
static kmp_user_lock_p
__kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
{
@@ -1082,6 +834,7 @@ This function blocks until the executing thread can enter the critical section.
*/
void
__kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
+ KMP_COUNT_BLOCK(OMP_CRITICAL);
kmp_user_lock_p lck;
@@ -1194,6 +947,9 @@ __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
if ( __kmp_env_consistency_check )
__kmp_check_barrier( global_tid, ct_barrier, loc );
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
return (status != 0) ? 0 : 1;
@@ -1243,6 +999,9 @@ __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
__kmp_check_barrier( global_tid, ct_barrier, loc );
}
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
__kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
ret = __kmpc_master (loc, global_tid);
@@ -1280,6 +1039,7 @@ introduce an explicit barrier if it is required.
kmp_int32
__kmpc_single(ident_t *loc, kmp_int32 global_tid)
{
+ KMP_COUNT_BLOCK(OMP_SINGLE);
kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
return rc;
}
@@ -1353,8 +1113,6 @@ ompc_set_nested( int flag )
set__nested( thread, flag ? TRUE : FALSE );
}
-#if OMP_30_ENABLED
-
void
ompc_set_max_active_levels( int max_active_levels )
{
@@ -1384,8 +1142,6 @@ ompc_get_team_size( int level )
return __kmp_get_team_size( __kmp_entry_gtid(), level );
}
-#endif // OMP_30_ENABLED
-
void
kmpc_set_stacksize( int arg )
{
@@ -1427,8 +1183,6 @@ kmpc_set_defaults( char const * str )
__kmp_aux_set_defaults( str, strlen( str ) );
}
-#ifdef OMP_30_ENABLED
-
int
kmpc_set_affinity_mask_proc( int proc, void **mask )
{
@@ -1468,7 +1222,6 @@ kmpc_get_affinity_mask_proc( int proc, void **mask )
#endif
}
-#endif /* OMP_30_ENABLED */
/* -------------------------------------------------------------------------- */
/*!
@@ -1533,6 +1286,9 @@ __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_dat
if (didit) *data_ptr = cpy_data;
/* This barrier is not a barrier region boundary */
+#if USE_ITT_NOTIFY
+ __kmp_threads[gtid]->th.th_ident = loc;
+#endif
__kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
if (! didit) (*cpy_func)( cpy_data, *data_ptr );
@@ -1540,6 +1296,9 @@ __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_dat
/* Consider next barrier the user-visible barrier for barrier region boundaries */
/* Nesting checks are already handled by the single construct checks */
+#if USE_ITT_NOTIFY
+ __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
+#endif
__kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
}
@@ -1722,6 +1481,7 @@ __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
void
__kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+ KMP_COUNT_BLOCK(OMP_set_lock);
kmp_user_lock_p lck;
if ( ( __kmp_user_lock_kind == lk_tas )
@@ -1866,6 +1626,8 @@ __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
int
__kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
{
+ KMP_COUNT_BLOCK(OMP_test_lock);
+ KMP_TIME_BLOCK(OMP_test_lock);
kmp_user_lock_p lck;
int rc;
@@ -2028,9 +1790,14 @@ __kmpc_reduce_nowait(
kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
kmp_critical_name *lck ) {
+ KMP_COUNT_BLOCK(REDUCE_nowait);
int retval;
PACKED_REDUCTION_METHOD_T packed_reduction_method;
-
+#if OMP_40_ENABLED
+ kmp_team_t *team;
+ kmp_info_t *th;
+ int teams_swapped = 0, task_state;
+#endif
KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
// why do we need this initialization here at all?
@@ -2045,7 +1812,25 @@ __kmpc_reduce_nowait(
if ( __kmp_env_consistency_check )
__kmp_push_sync( global_tid, ct_reduce, loc, NULL );
- // it's better to check an assertion ASSERT( thr_state == THR_WORK_STATE )
+#if OMP_40_ENABLED
+ th = __kmp_thread_from_gtid(global_tid);
+ if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
+ team = th->th.th_team;
+ if( team->t.t_level == th->th.th_teams_level ) {
+ // this is reduction at teams construct
+ KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
+ // Let's swap teams temporarily for the reduction barrier
+ teams_swapped = 1;
+ th->th.th_info.ds.ds_tid = team->t.t_master_tid;
+ th->th.th_team = team->t.t_parent;
+ th->th.th_task_team = th->th.th_team->t.t_task_team;
+ th->th.th_team_nproc = th->th.th_team->t.t_nproc;
+ task_state = th->th.th_task_state;
+ if( th->th.th_task_team )
+ th->th.th_task_state = th->th.th_task_team->tt.tt_state;
+ }
+ }
+#endif // OMP_40_ENABLED
// packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable
// the variable should be either a construct-specific or thread-specific property, not a team specific property
@@ -2091,6 +1876,9 @@ __kmpc_reduce_nowait(
// this barrier should be invisible to a customer and to the thread profiler
// (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
@@ -2108,7 +1896,16 @@ __kmpc_reduce_nowait(
KMP_ASSERT( 0 ); // "unexpected method"
}
-
+#if OMP_40_ENABLED
+ if( teams_swapped ) {
+ // Restore thread structure
+ th->th.th_info.ds.ds_tid = 0;
+ th->th.th_team = team;
+ th->th.th_task_team = team->t.t_task_team;
+ th->th.th_team_nproc = team->t.t_nproc;
+ th->th.th_task_state = task_state;
+ }
+#endif
KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
return retval;
@@ -2187,6 +1984,7 @@ __kmpc_reduce(
void (*reduce_func)(void *lhs_data, void *rhs_data),
kmp_critical_name *lck )
{
+ KMP_COUNT_BLOCK(REDUCE_wait);
int retval;
PACKED_REDUCTION_METHOD_T packed_reduction_method;
@@ -2204,8 +2002,6 @@ __kmpc_reduce(
if ( __kmp_env_consistency_check )
__kmp_push_sync( global_tid, ct_reduce, loc, NULL );
- // it's better to check an assertion ASSERT( thr_state == THR_WORK_STATE )
-
packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
__KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
@@ -2228,6 +2024,9 @@ __kmpc_reduce(
//case tree_reduce_block:
// this barrier should be visible to a customer and to the thread profiler
// (it's a terminating barrier on constructs if NOWAIT not specified)
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
+#endif
retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
@@ -2277,6 +2076,9 @@ __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck )
__kmp_end_critical_section_reduce_block( loc, global_tid, lck );
// TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
__kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
} else if( packed_reduction_method == empty_reduce_block ) {
@@ -2284,11 +2086,17 @@ __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck )
// usage: if team size == 1, no synchronization is required ( Intel platforms only )
// TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
__kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
} else if( packed_reduction_method == atomic_reduce_block ) {
// TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
__kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
} else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
@@ -2319,23 +2127,15 @@ __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck )
kmp_uint64
__kmpc_get_taskid() {
- #if OMP_30_ENABLED
-
- kmp_int32 gtid;
- kmp_info_t * thread;
-
- gtid = __kmp_get_gtid();
- if ( gtid < 0 ) {
- return 0;
- }; // if
- thread = __kmp_thread_from_gtid( gtid );
- return thread->th.th_current_task->td_task_id;
-
- #else
+ kmp_int32 gtid;
+ kmp_info_t * thread;
+ gtid = __kmp_get_gtid();
+ if ( gtid < 0 ) {
return 0;
-
- #endif
+ }; // if
+ thread = __kmp_thread_from_gtid( gtid );
+ return thread->th.th_current_task->td_task_id;
} // __kmpc_get_taskid
@@ -2343,25 +2143,17 @@ __kmpc_get_taskid() {
kmp_uint64
__kmpc_get_parent_taskid() {
- #if OMP_30_ENABLED
-
- kmp_int32 gtid;
- kmp_info_t * thread;
- kmp_taskdata_t * parent_task;
-
- gtid = __kmp_get_gtid();
- if ( gtid < 0 ) {
- return 0;
- }; // if
- thread = __kmp_thread_from_gtid( gtid );
- parent_task = thread->th.th_current_task->td_parent;
- return ( parent_task == NULL ? 0 : parent_task->td_task_id );
-
- #else
+ kmp_int32 gtid;
+ kmp_info_t * thread;
+ kmp_taskdata_t * parent_task;
+ gtid = __kmp_get_gtid();
+ if ( gtid < 0 ) {
return 0;
-
- #endif
+ }; // if
+ thread = __kmp_thread_from_gtid( gtid );
+ parent_task = thread->th.th_current_task->td_parent;
+ return ( parent_task == NULL ? 0 : parent_task->td_task_id );
} // __kmpc_get_parent_taskid