Diffstat (limited to 'openmp/runtime/src/kmp_csupport.cpp')
-rw-r--r--  openmp/runtime/src/kmp_csupport.cpp  3337
1 file changed, 3337 insertions, 0 deletions
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
new file mode 100644
index 00000000000..00dc47167a3
--- /dev/null
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -0,0 +1,3337 @@
+/*
+ * kmp_csupport.cpp -- kfront linkage support for OpenMP.
+ */
+
+
+//===----------------------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "omp.h" /* extern "C" declarations of user-visible routines */
+#include "kmp.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_lock.h"
+#include "kmp_error.h"
+#include "kmp_stats.h"
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
+#define MAX_MESSAGE 512
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+/* flags will be used in future, e.g., to implement */
+/* openmp_strict library restrictions */
+
+/*!
+ * @ingroup STARTUP_SHUTDOWN
+ * @param loc in source location information
+ * @param flags in for future use (currently ignored)
+ *
+ * Initialize the runtime library. This call is optional; if it is not made, the
+ * library will be initialized implicitly on the first use of other library functions.
+ *
+ */
+void
+__kmpc_begin(ident_t *loc, kmp_int32 flags)
+{
+ // By default __kmp_ignore_mppbeg() returns TRUE.
+ if (__kmp_ignore_mppbeg() == FALSE) {
+ __kmp_internal_begin();
+
+ KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
+ }
+}
+
+/*!
+ * @ingroup STARTUP_SHUTDOWN
+ * @param loc source location information
+ *
+ * Shut down the runtime library. This call is also optional, and even if made it
+ * will do nothing unless the `KMP_IGNORE_MPPEND` environment variable is set to zero.
+ */
+void
+__kmpc_end(ident_t *loc)
+{
+ // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end() call a no-op.
+ // However, this can be overridden with the KMP_IGNORE_MPPEND environment variable.
+ // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
+ // will unregister this root (which can cause library shutdown).
+ if (__kmp_ignore_mppend() == FALSE) {
+ KC_TRACE( 10, ("__kmpc_end: called\n" ) );
+ KA_TRACE( 30, ("__kmpc_end\n" ));
+
+ __kmp_internal_end_thread( -1 );
+ }
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The global thread index of the active thread.
+
+This function can be called in any context.
+
+If the runtime has only been entered at the outermost level from a
+single (necessarily non-OpenMP<sup>*</sup>) thread, then the thread number is that
+which would be returned by omp_get_thread_num() in the outermost
+active parallel construct. (Or zero if there is no active parallel
+construct, since the master thread is necessarily thread zero).
+
+If multiple non-OpenMP threads all enter an OpenMP construct then this
+will be a unique thread identifier among all the threads created by
+the OpenMP runtime (but the value cannot be defined in terms of
+OpenMP thread ids returned by omp_get_thread_num()).
+
+*/
+kmp_int32
+__kmpc_global_thread_num(ident_t *loc)
+{
+ kmp_int32 gtid = __kmp_entry_gtid();
+
+ KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
+
+ return gtid;
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The number of threads under control of the OpenMP<sup>*</sup> runtime
+
+This function can be called in any context.
+It returns the total number of threads under the control of the OpenMP runtime. That is
+not a number that can be determined by any standard OpenMP calls, since the library may be
+entered from more than one non-OpenMP thread, and this value reflects the total over all such entries.
+Similarly, because the runtime keeps underlying threads alive even when they are not active (the cost
+of creating and destroying OS threads is high), this call counts all such threads even if they are not
+waiting for work.
+*/
+kmp_int32
+__kmpc_global_num_threads(ident_t *loc)
+{
+ KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
+
+ return TCR_4(__kmp_nth);
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The thread number of the calling thread in the innermost active parallel construct.
+
+*/
+kmp_int32
+__kmpc_bound_thread_num(ident_t *loc)
+{
+ KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
+ return __kmp_tid_from_gtid( __kmp_entry_gtid() );
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return The number of threads in the innermost active parallel construct.
+*/
+kmp_int32
+__kmpc_bound_num_threads(ident_t *loc)
+{
+ KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
+
+ return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
+}
+
+/*!
+ * @ingroup DEPRECATED
+ * @param loc location description
+ *
+ * This function need not be called. In release builds it always returns TRUE; in
+ * debug builds it may return FALSE when the KMP_PAR_RANGE filter excludes the call site.
+ */
+kmp_int32
+__kmpc_ok_to_fork(ident_t *loc)
+{
+#ifndef KMP_DEBUG
+
+ return TRUE;
+
+#else
+
+ const char *semi2;
+ const char *semi3;
+ int line_no;
+
+ if (__kmp_par_range == 0) {
+ return TRUE;
+ }
+ semi2 = loc->psource;
+ if (semi2 == NULL) {
+ return TRUE;
+ }
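+ // loc->psource is a string of semicolon-separated fields, nominally
+ // ";file;routine;line;..." (see ident_t in kmp.h), so the filename
+ // lies between the first two semicolons.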
+ semi2 = strchr(semi2, ';');
+ if (semi2 == NULL) {
+ return TRUE;
+ }
+ semi2 = strchr(semi2 + 1, ';');
+ if (semi2 == NULL) {
+ return TRUE;
+ }
+ if (__kmp_par_range_filename[0]) {
+ const char *name = semi2 - 1;
+ while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
+ name--;
+ }
+ if ((*name == '/') || (*name == ';')) {
+ name++;
+ }
+ if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
+ return __kmp_par_range < 0;
+ }
+ }
+ semi3 = strchr(semi2 + 1, ';');
+ if (__kmp_par_range_routine[0]) {
+ if ((semi3 != NULL) && (semi3 > semi2)
+ && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
+ return __kmp_par_range < 0;
+ }
+ }
+ if ((semi3 != NULL) && (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1)) {
+ if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
+ return __kmp_par_range > 0;
+ }
+ return __kmp_par_range < 0;
+ }
+ return TRUE;
+
+#endif /* KMP_DEBUG */
+
+}
+
+/*!
+@ingroup THREAD_STATES
+@param loc Source location information.
+@return 1 if this thread is executing inside an active parallel region, zero if not.
+*/
+kmp_int32
+__kmpc_in_parallel( ident_t *loc )
+{
+ return __kmp_entry_thread() -> th.th_root -> r.r_active;
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_threads number of threads requested for this parallel construct
+
+Set the number of threads to be used by the next fork spawned by this thread.
+This call is only required if the parallel construct has a `num_threads` clause.
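+
+A sketch of compiler-generated usage (outlined_fn is an illustrative name):
+@code
+// for: #pragma omp parallel num_threads(4)
+__kmpc_push_num_threads( &loc, __kmpc_global_thread_num(&loc), 4 );
+__kmpc_fork_call( &loc, 0, (kmpc_micro)outlined_fn );
+@endcode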
+*/
+void
+__kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
+{
+ KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
+ global_tid, num_threads ) );
+
+ __kmp_push_num_threads( loc, global_tid, num_threads );
+}
+
+void
+__kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
+{
+ KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
+
+ /* the num_threads are automatically popped */
+}
+
+
+#if OMP_40_ENABLED
+
+void
+__kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
+{
+ KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
+ global_tid, proc_bind ) );
+
+ __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
+}
+
+#endif /* OMP_40_ENABLED */
+
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param argc total number of arguments in the ellipsis
+@param microtask pointer to callback routine consisting of outlined parallel construct
+@param ... pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
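+
+For example, a compiler might lower (a sketch; outlined_fn and the variable
+names are illustrative, not the actual generated names)
+@code
+#pragma omp parallel shared(a)
+{ use(a); }
+@endcode
+to an outlined routine plus a call:
+@code
+void outlined_fn( kmp_int32 *gtid, kmp_int32 *btid, int *a ) { use(*a); }
+// ...
+__kmpc_fork_call( &loc, 1, (kmpc_micro)outlined_fn, &a );
+@endcode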
+*/
+void
+__kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
+{
+ int gtid = __kmp_entry_gtid();
+
+#if (KMP_STATS_ENABLED)
+ int inParallel = __kmpc_in_parallel(loc);
+ if (inParallel)
+ {
+ KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
+ }
+ else
+ {
+ KMP_COUNT_BLOCK(OMP_PARALLEL);
+ }
+#endif
+
+ // TODO: saving thr_state alone may be sufficient here
+ {
+ va_list ap;
+ va_start( ap, microtask );
+
+#if OMPT_SUPPORT
+ ompt_frame_t* ompt_frame;
+ if (ompt_enabled) {
+ kmp_info_t *master_th = __kmp_threads[ gtid ];
+ kmp_team_t *parent_team = master_th->th.th_team;
+ ompt_lw_taskteam_t *lwt = parent_team->t.ompt_serialized_team_info;
+ if (lwt)
+ ompt_frame = &(lwt->ompt_task_info.frame);
+ else
+ {
+ int tid = __kmp_tid_from_gtid( gtid );
+ ompt_frame = &(parent_team->t.t_implicit_task_taskdata[tid].
+ ompt_task_info.frame);
+ }
+ ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
+ }
+#endif
+
+#if INCLUDE_SSC_MARKS
+ SSC_MARK_FORKING();
+#endif
+ __kmp_fork_call( loc, gtid, fork_context_intel,
+ argc,
+#if OMPT_SUPPORT
+ VOLATILE_CAST(void *) microtask, // "unwrapped" task
+#endif
+ VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
+ VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
+/* TODO: revert workaround for Intel(R) 64 tracker #96 */
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+ &ap
+#else
+ ap
+#endif
+ );
+#if INCLUDE_SSC_MARKS
+ SSC_MARK_JOINING();
+#endif
+ __kmp_join_call( loc, gtid
+#if OMPT_SUPPORT
+ , fork_context_intel
+#endif
+ );
+
+ va_end( ap );
+
+ }
+}
+
+#if OMP_40_ENABLED
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_teams number of teams requested for the teams construct
+@param num_threads number of threads per team requested for the teams construct
+
+Set the number of teams to be used by the teams construct.
+This call is only required if the teams construct has a `num_teams` clause
+or a `thread_limit` clause (or both).
+*/
+void
+__kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
+{
+ KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
+ global_tid, num_teams, num_threads ) );
+
+ __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param argc total number of arguments in the ellipsis
+@param microtask pointer to callback routine consisting of outlined teams construct
+@param ... pointers to shared variables that aren't global
+
+Do the actual fork and call the microtask in the relevant number of threads.
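+
+A sketch (illustrative names), paired with @ref __kmpc_push_num_teams:
+@code
+// for: #pragma omp teams num_teams(2) thread_limit(8)
+__kmpc_push_num_teams( &loc, gtid, 2, 8 );
+__kmpc_fork_teams( &loc, 1, (kmpc_micro)outlined_teams_fn, &a );
+@endcode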
+*/
+void
+__kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
+{
+ int gtid = __kmp_entry_gtid();
+ kmp_info_t *this_thr = __kmp_threads[ gtid ];
+ va_list ap;
+ va_start( ap, microtask );
+
+ KMP_COUNT_BLOCK(OMP_TEAMS);
+
+ // remember teams entry point and nesting level
+ this_thr->th.th_teams_microtask = microtask;
+ this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
+
+#if OMPT_SUPPORT
+ kmp_team_t *parent_team = this_thr->th.th_team;
+ int tid = __kmp_tid_from_gtid( gtid );
+ if (ompt_enabled) {
+ parent_team->t.t_implicit_task_taskdata[tid].
+ ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(1);
+ }
+#endif
+
+ // if __kmpc_push_num_teams was not called, set the default number of teams
+ if ( this_thr->th.th_teams_size.nteams == 0 ) {
+ __kmp_push_num_teams( loc, gtid, 0, 0 );
+ }
+ KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
+ KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
+ KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
+
+ __kmp_fork_call( loc, gtid, fork_context_intel,
+ argc,
+#if OMPT_SUPPORT
+ VOLATILE_CAST(void *) microtask, // "unwrapped" task
+#endif
+ VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
+ VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
+#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
+ &ap
+#else
+ ap
+#endif
+ );
+ __kmp_join_call( loc, gtid
+#if OMPT_SUPPORT
+ , fork_context_intel
+#endif
+ );
+
+ this_thr->th.th_teams_microtask = NULL;
+ this_thr->th.th_teams_level = 0;
+ *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
+ va_end( ap );
+}
+#endif /* OMP_40_ENABLED */
+
+
+//
+// I don't think this function should ever have been exported.
+// The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
+// openmp code ever called it, but it's been exported from the RTL for so
+// long that I'm afraid to remove the definition.
+//
+int
+__kmpc_invoke_task_func( int gtid )
+{
+ return __kmp_invoke_task_func( gtid );
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+
+Enter a serialized parallel construct. This interface is used to handle a
+conditional parallel region, like this,
+@code
+#pragma omp parallel if (condition)
+@endcode
+when the condition is false.
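+
+A sketch of the full pattern a compiler might generate (illustrative names):
+@code
+if (condition) {
+    __kmpc_fork_call( &loc, 1, (kmpc_micro)outlined_fn, &a );
+} else {
+    kmp_int32 gtid = __kmpc_global_thread_num( &loc );
+    kmp_int32 zero = 0;
+    __kmpc_serialized_parallel( &loc, gtid );
+    outlined_fn( &gtid, &zero, &a );
+    __kmpc_end_serialized_parallel( &loc, gtid );
+}
+@endcode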
+*/
+void
+__kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
+{
+ /* The implementation is now in kmp_runtime.cpp so that it can share static
+ * functions with kmp_fork_call, since the tasks to be done are similar in
+ * each case. */
+ __kmp_serialized_parallel(loc, global_tid);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+
+Leave a serialized parallel construct.
+*/
+void
+__kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
+{
+ kmp_internal_control_t *top;
+ kmp_info_t *this_thr;
+ kmp_team_t *serial_team;
+
+ KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
+
+ /* skip all this code for autopar serialized loops since it results in
+ unacceptable overhead */
+ if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
+ return;
+
+ // Not autopar code
+ if( ! TCR_4( __kmp_init_parallel ) )
+ __kmp_parallel_initialize();
+
+ this_thr = __kmp_threads[ global_tid ];
+ serial_team = this_thr->th.th_serial_team;
+
+ #if OMP_45_ENABLED
+ kmp_task_team_t * task_team = this_thr->th.th_task_team;
+
+ // we need to wait for the proxy tasks before finishing the thread
+ if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
+ __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here?
+ #endif
+
+ KMP_MB();
+ KMP_DEBUG_ASSERT( serial_team );
+ KMP_ASSERT( serial_team -> t.t_serialized );
+ KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
+ KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
+ KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
+ KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
+
+ /* If necessary, pop the internal control stack values and replace the team values */
+ top = serial_team -> t.t_control_stack_top;
+ if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
+ copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
+ serial_team -> t.t_control_stack_top = top -> next;
+ __kmp_free(top);
+ }
+
+ //if( serial_team -> t.t_serialized > 1 )
+ serial_team -> t.t_level--;
+
+ /* pop dispatch buffers stack */
+ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
+ {
+ dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
+ serial_team->t.t_dispatch->th_disp_buffer =
+ serial_team->t.t_dispatch->th_disp_buffer->next;
+ __kmp_free( disp_buffer );
+ }
+
+ -- serial_team -> t.t_serialized;
+ if ( serial_team -> t.t_serialized == 0 ) {
+
+ /* return to the parallel section */
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+ if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
+ __kmp_clear_x87_fpu_status_word();
+ __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
+ __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
+ }
+#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
+
+ this_thr -> th.th_team = serial_team -> t.t_parent;
+ this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
+
+ /* restore values cached in the thread */
+ this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
+ this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
+ this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
+
+ /* TODO the below shouldn't need to be adjusted for serialized teams */
+ this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
+ t.t_dispatch[ serial_team -> t.t_master_tid ];
+
+ __kmp_pop_current_task_from_thread( this_thr );
+
+ KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
+ this_thr -> th.th_current_task -> td_flags.executing = 1;
+
+ if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+ // Copy the task team from the new child / old parent team to the thread.
+ this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
+ KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
+ global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
+ }
+ } else {
+ if ( __kmp_tasking_mode != tskm_immediate_exec ) {
+ KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
+ global_tid, serial_team, serial_team -> t.t_serialized ) );
+ }
+ }
+
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_parallel( global_tid, NULL );
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information.
+
+Execute <tt>flush</tt>. This is implemented as a full memory fence. (Though
+depending on the memory ordering convention obeyed by the compiler
+even that may not be necessary).
+*/
+void
+__kmpc_flush(ident_t *loc)
+{
+ KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
+
+ /* need an explicit __mf() here since the library uses volatile instead */
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+
+ #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
+ #if KMP_MIC
+ // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
+ // We shouldn't need it, though, since the ABI rules require that
+ // * If the compiler generates NGO stores it also generates the fence
+ // * If users hand-code NGO stores they should insert the fence
+ // therefore no incomplete unordered stores should be visible.
+ #else
+ // C74404
+ // This is to address non-temporal store instructions (sfence needed).
+ // The clflush instruction is also addressed (mfence needed).
+ // The non-temporal load movntdqa instruction should probably also be addressed.
+ // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
+ if ( ! __kmp_cpuinfo.initialized ) {
+ __kmp_query_cpuid( & __kmp_cpuinfo );
+ }; // if
+ if ( ! __kmp_cpuinfo.sse2 ) {
+ // CPU cannot execute SSE2 instructions.
+ } else {
+ #if KMP_COMPILER_ICC
+ _mm_mfence();
+ #elif KMP_COMPILER_MSVC
+ MemoryBarrier();
+ #else
+ __sync_synchronize();
+ #endif // KMP_COMPILER_ICC
+ }; // if
+ #endif // KMP_MIC
+ #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
+ // Nothing to see here; move along.
+ #elif KMP_ARCH_PPC64
+ // Nothing needed here (we have a real MB above).
+ #if KMP_OS_CNK
+ // The flushing thread needs to yield here; this prevents a
+ // busy-waiting thread from saturating the pipeline. flush is
+ // often used in loops like this:
+ // while (!flag) {
+ // #pragma omp flush(flag)
+ // }
+ // and adding the yield here is good for at least a 10x speedup
+ // when running >2 threads per core (on the NAS LU benchmark).
+ __kmp_yield(TRUE);
+ #endif
+ #else
+ #error Unknown or unsupported architecture
+ #endif
+
+}
+
+/* -------------------------------------------------------------------------- */
+
+/* -------------------------------------------------------------------------- */
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+
+Execute a barrier.
+*/
+void
+__kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
+{
+ KMP_COUNT_BLOCK(OMP_BARRIER);
+ KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
+
+ if (! TCR_4(__kmp_init_parallel))
+ __kmp_parallel_initialize();
+
+ if ( __kmp_env_consistency_check ) {
+ if ( loc == 0 ) {
+ KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
+ }; // if
+
+ __kmp_check_barrier( global_tid, ct_barrier, loc );
+ }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ ompt_frame_t * ompt_frame;
+ if (ompt_enabled ) {
+ ompt_frame = __ompt_get_task_frame_internal(0);
+ if ( ompt_frame->reenter_runtime_frame == NULL )
+ ompt_frame->reenter_runtime_frame = __builtin_frame_address(1);
+ }
+#endif
+ __kmp_threads[ global_tid ]->th.th_ident = loc;
+ // TODO: explicit barrier_wait_id:
+ // this function is called when the 'barrier' directive is present or as the
+ // implicit barrier at the end of a worksharing construct.
+ // 1) better to add a per-thread barrier counter to the thread data structure
+ // 2) set it to 0 when a new team is created
+ // 3) no sync is required
+
+ __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled ) {
+ ompt_frame->reenter_runtime_frame = NULL;
+ }
+#endif
+}
+
+/* The BARRIER for a MASTER section is always explicit */
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param global_tid global thread number.
+@return 1 if this thread should execute the <tt>master</tt> block, 0 otherwise.
+*/
+kmp_int32
+__kmpc_master(ident_t *loc, kmp_int32 global_tid)
+{
+ int status = 0;
+
+ KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
+
+ if( ! TCR_4( __kmp_init_parallel ) )
+ __kmp_parallel_initialize();
+
+ if( KMP_MASTER_GTID( global_tid )) {
+ KMP_COUNT_BLOCK(OMP_MASTER);
+ KMP_PUSH_PARTITIONED_TIMER(OMP_master);
+ status = 1;
+ }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (status) {
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
+ kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+ kmp_team_t *team = this_thr -> th.th_team;
+
+ int tid = __kmp_tid_from_gtid( global_tid );
+ ompt_callbacks.ompt_callback(ompt_event_master_begin)(
+ team->t.ompt_team_info.parallel_id,
+ team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+ }
+ }
+#endif
+
+ if ( __kmp_env_consistency_check ) {
+#if KMP_USE_DYNAMIC_LOCK
+ if (status)
+ __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
+ else
+ __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
+#else
+ if (status)
+ __kmp_push_sync( global_tid, ct_master, loc, NULL );
+ else
+ __kmp_check_sync( global_tid, ct_master, loc, NULL );
+#endif
+ }
+
+ return status;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param global_tid global thread number.
+
+Mark the end of a <tt>master</tt> region. This should only be called by the thread
+that executes the <tt>master</tt> region.
+*/
+void
+__kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
+{
+ KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
+
+ KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
+ KMP_POP_PARTITIONED_TIMER();
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+ kmp_team_t *team = this_thr -> th.th_team;
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_master_end)) {
+ int tid = __kmp_tid_from_gtid( global_tid );
+ ompt_callbacks.ompt_callback(ompt_event_master_end)(
+ team->t.ompt_team_info.parallel_id,
+ team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+ }
+#endif
+
+ if ( __kmp_env_consistency_check ) {
+ if( global_tid < 0 )
+ KMP_WARNING( ThreadIdentInvalid );
+
+ if( KMP_MASTER_GTID( global_tid ))
+ __kmp_pop_sync( global_tid, ct_master, loc );
+ }
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param gtid global thread number.
+
+Start execution of an <tt>ordered</tt> construct.
+*/
+void
+__kmpc_ordered( ident_t * loc, kmp_int32 gtid )
+{
+ int cid = 0;
+ kmp_info_t *th;
+ KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+ KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
+
+ if (! TCR_4(__kmp_init_parallel))
+ __kmp_parallel_initialize();
+
+#if USE_ITT_BUILD
+ __kmp_itt_ordered_prep( gtid );
+ // TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+ th = __kmp_threads[ gtid ];
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled) {
+ /* OMPT state update */
+ th->th.ompt_thread_info.wait_id = (uint64_t) loc;
+ th->th.ompt_thread_info.state = ompt_state_wait_ordered;
+
+ /* OMPT event callback */
+ if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
+ ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
+ th->th.ompt_thread_info.wait_id);
+ }
+ }
+#endif
+
+ if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
+ (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
+ else
+ __kmp_parallel_deo( & gtid, & cid, loc );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled) {
+ /* OMPT state update */
+ th->th.ompt_thread_info.state = ompt_state_work_parallel;
+ th->th.ompt_thread_info.wait_id = 0;
+
+ /* OMPT event callback */
+ if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
+ ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
+ th->th.ompt_thread_info.wait_id);
+ }
+ }
+#endif
+
+#if USE_ITT_BUILD
+ __kmp_itt_ordered_start( gtid );
+#endif /* USE_ITT_BUILD */
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param gtid global thread number.
+
+End execution of an <tt>ordered</tt> construct.
+*/
+void
+__kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
+{
+ int cid = 0;
+ kmp_info_t *th;
+
+ KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
+
+#if USE_ITT_BUILD
+ __kmp_itt_ordered_end( gtid );
+ // TODO: ordered_wait_id
+#endif /* USE_ITT_BUILD */
+
+ th = __kmp_threads[ gtid ];
+
+ if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
+ (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
+ else
+ __kmp_parallel_dxo( & gtid, & cid, loc );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
+ ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
+ th->th.ompt_thread_info.wait_id);
+ }
+#endif
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+static __forceinline void
+__kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag)
+{
+ // A pointer to the allocated indirect lock is written to crit, while the lock index is ignored.
+ void *idx;
+ kmp_indirect_lock_t **lck;
+ lck = (kmp_indirect_lock_t **)crit;
+ kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
+ KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
+ KMP_SET_I_LOCK_LOCATION(ilk, loc);
+ KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
+ KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
+#if USE_ITT_BUILD
+ __kmp_itt_critical_creating(ilk->lock, loc);
+#endif
+ int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
+ if (status == 0) {
+#if USE_ITT_BUILD
+ __kmp_itt_critical_destroyed(ilk->lock);
+#endif
+ // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit.
+ //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
+ }
+ KMP_DEBUG_ASSERT(*lck != NULL);
+}
+
+// Fast-path acquire tas lock
+#define KMP_ACQUIRE_TAS_LOCK(lock, gtid) { \
+ kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
+ if (l->lk.poll != KMP_LOCK_FREE(tas) || \
+ ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \
+ kmp_uint32 spins; \
+ KMP_FSYNC_PREPARE(l); \
+ KMP_INIT_YIELD(spins); \
+ if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
+ KMP_YIELD(TRUE); \
+ } else { \
+ KMP_YIELD_SPIN(spins); \
+ } \
+ kmp_backoff_t backoff = __kmp_spin_backoff_params; \
+ while (l->lk.poll != KMP_LOCK_FREE(tas) || \
+ ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \
+ __kmp_spin_backoff(&backoff); \
+ if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
+ KMP_YIELD(TRUE); \
+ } else { \
+ KMP_YIELD_SPIN(spins); \
+ } \
+ } \
+ } \
+ KMP_FSYNC_ACQUIRED(l); \
+}
+
+// Fast-path test tas lock
+#define KMP_TEST_TAS_LOCK(lock, gtid, rc) { \
+ kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
+ rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
+ KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas)); \
+}
+
+// Fast-path release tas lock
+#define KMP_RELEASE_TAS_LOCK(lock, gtid) { \
+ TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
+ KMP_MB(); \
+}
+
+#if KMP_USE_FUTEX
+
+# include <unistd.h>
+# include <sys/syscall.h>
+# ifndef FUTEX_WAIT
+# define FUTEX_WAIT 0
+# endif
+# ifndef FUTEX_WAKE
+# define FUTEX_WAKE 1
+# endif
+
+// Fast-path acquire futex lock
+#define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) { \
+ kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
+ kmp_int32 gtid_code = (gtid+1) << 1; \
+ KMP_MB(); \
+ KMP_FSYNC_PREPARE(ftx); \
+ kmp_int32 poll_val; \
+ while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
+ KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
+ kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
+ if (!cond) { \
+ if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) { \
+ continue; \
+ } \
+ poll_val |= KMP_LOCK_BUSY(1, futex); \
+ } \
+ kmp_int32 rc; \
+ if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \
+ continue; \
+ } \
+ gtid_code |= 1; \
+ } \
+ KMP_FSYNC_ACQUIRED(ftx); \
+}
+
+// Fast-path test futex lock
+#define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) { \
+ kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
+ if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY((gtid+1) << 1, futex))) { \
+ KMP_FSYNC_ACQUIRED(ftx); \
+ rc = TRUE; \
+ } else { \
+ rc = FALSE; \
+ } \
+}
+
+// Fast-path release futex lock
+#define KMP_RELEASE_FUTEX_LOCK(lock, gtid) { \
+ kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
+ KMP_MB(); \
+ KMP_FSYNC_RELEASING(ftx); \
+ kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
+ if (KMP_LOCK_STRIP(poll_val) & 1) { \
+ syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
+ } \
+ KMP_MB(); \
+ KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
+}
+
+#endif // KMP_USE_FUTEX
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+static kmp_user_lock_p
+__kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
+{
+ kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
+
+ //
+ // Because of the double-check, the following load
+ // doesn't need to be volatile.
+ //
+ kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
+
+ if ( lck == NULL ) {
+ void * idx;
+
+ // Allocate & initialize the lock.
+ // Remember allocated locks in table in order to free them in __kmp_cleanup()
+ lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
+ __kmp_init_user_lock_with_checks( lck );
+ __kmp_set_user_lock_location( lck, loc );
+#if USE_ITT_BUILD
+ __kmp_itt_critical_creating( lck );
+ // __kmp_itt_critical_creating() should be called *before* the first usage of the underlying
+ // lock. It is the only place where we can guarantee it. There is a chance the lock will be
+ // destroyed with no usage, but that is not a problem, because this is not a real event seen
+ // by the user but rather a way of setting a name for the object (lock). See more details in kmp_itt.h.
+#endif /* USE_ITT_BUILD */
+
+ //
+ // Use a cmpxchg instruction to slam the start of the critical
+ // section with the lock pointer. If another thread beat us
+ // to it, deallocate the lock, and use the lock that the other
+ // thread allocated.
+ //
+ int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
+
+ if ( status == 0 ) {
+ // Deallocate the lock and reload the value.
+#if USE_ITT_BUILD
+ __kmp_itt_critical_destroyed( lck );
+ // Let ITT know the lock is destroyed and the same memory location may be reused for
+ // another purpose.
+#endif /* USE_ITT_BUILD */
+ __kmp_destroy_user_lock_with_checks( lck );
+ __kmp_user_lock_free( &idx, gtid, lck );
+ lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
+ KMP_DEBUG_ASSERT( lck != NULL );
+ }
+ }
+ return lck;
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param global_tid global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
+some other suitably unique value.
+
+Enter code protected by a `critical` construct.
+This function blocks until the executing thread can enter the critical section.
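+
+A sketch of the generated pattern (the zero-initialized lock name is illustrative):
+@code
+static kmp_critical_name lck_name = { 0 };
+__kmpc_critical( &loc, gtid, &lck_name );
+// ... protected code ...
+__kmpc_end_critical( &loc, gtid, &lck_name );
+@endcode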
+*/
+void
+__kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit )
+{
+#if KMP_USE_DYNAMIC_LOCK
+ __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
+#else
+ KMP_COUNT_BLOCK(OMP_CRITICAL);
+ KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */
+ kmp_user_lock_p lck;
+
+ KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
+
+ //TODO: add THR_OVHD_STATE
+
+ KMP_CHECK_USER_LOCK_INIT();
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+ lck = (kmp_user_lock_p)crit;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+ lck = (kmp_user_lock_p)crit;
+ }
+#endif
+ else { // ticket, queuing or drdpa
+ lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
+ }
+
+ if ( __kmp_env_consistency_check )
+ __kmp_push_sync( global_tid, ct_critical, loc, lck );
+
+ /* since the critical directive binds to all threads, not just
+ * the current team we have to check this even if we are in a
+ * serialized team */
+ /* also, even if we are the uber thread, we still have to conduct the lock,
+ * as we have to contend with sibling threads */
+
+#if USE_ITT_BUILD
+ __kmp_itt_critical_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+ // The value of 'crit' is suitable for use as the critical_id of the critical section directive.
+ __kmp_acquire_user_lock_with_checks( lck, global_tid );
+
+#if USE_ITT_BUILD
+ __kmp_itt_critical_acquired( lck );
+#endif /* USE_ITT_BUILD */
+
+ KMP_START_EXPLICIT_TIMER(OMP_critical);
+ KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+#if KMP_USE_DYNAMIC_LOCK
+
+// Converts the given hint to an internal lock implementation
+static __forceinline kmp_dyna_lockseq_t
+__kmp_map_hint_to_lock(uintptr_t hint)
+{
+#if KMP_USE_TSX
+# define KMP_TSX_LOCK(seq) lockseq_##seq
+#else
+# define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
+#endif
+
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+# define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
+#else
+# define KMP_CPUINFO_RTM 0
+#endif
+
+ // Hints that do not require further logic
+ if (hint & kmp_lock_hint_hle)
+ return KMP_TSX_LOCK(hle);
+ if (hint & kmp_lock_hint_rtm)
+ return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq;
+ if (hint & kmp_lock_hint_adaptive)
+ return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq;
+
+ // Rule out conflicting hints first by returning the default lock
+ if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
+ return __kmp_user_lock_seq;
+ if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative))
+ return __kmp_user_lock_seq;
+
+ // Do not even consider speculation when it appears to be contended
+ if (hint & omp_lock_hint_contended)
+ return lockseq_queuing;
+
+ // Uncontended lock without speculation
+ if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
+ return lockseq_tas;
+
+ // HLE lock for speculation
+ if (hint & omp_lock_hint_speculative)
+ return KMP_TSX_LOCK(hle);
+
+ return __kmp_user_lock_seq;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param global_tid global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock associated with the critical section,
+or some other suitably unique value.
+@param hint the lock hint.
+
+Enter code protected by a `critical` construct with a hint. The hint value is used to suggest a lock implementation.
+This function blocks until the executing thread can enter the critical section unless the hint suggests use of
+speculative execution and the hardware supports it.
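+
+A sketch (assuming the OpenMP 4.5 lock hint constants):
+@code
+// for: #pragma omp critical (name) hint(omp_lock_hint_speculative)
+__kmpc_critical_with_hint( &loc, gtid, &lck_name, omp_lock_hint_speculative );
+// ... protected code ...
+__kmpc_end_critical( &loc, gtid, &lck_name );
+@endcode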
+*/
+void
+__kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint )
+{
+ KMP_COUNT_BLOCK(OMP_CRITICAL);
+ kmp_user_lock_p lck;
+
+ KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
+
+ kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
+ // Check if it is initialized.
+ if (*lk == 0) {
+ kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
+ if (KMP_IS_D_LOCK(lckseq)) {
+ KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq));
+ } else {
+ __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
+ }
+ }
+ // Branch for accessing the actual lock object and set operation. This branching is inevitable since
+ // this lock initialization does not follow the normal dispatch path (lock table is not used).
+ if (KMP_EXTRACT_D_TAG(lk) != 0) {
+ lck = (kmp_user_lock_p)lk;
+ if (__kmp_env_consistency_check) {
+ __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
+ }
+# if USE_ITT_BUILD
+ __kmp_itt_critical_acquiring(lck);
+# endif
+# if KMP_USE_INLINED_TAS
+ if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
+ KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
+ } else
+# elif KMP_USE_INLINED_FUTEX
+ if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
+ KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
+ } else
+# endif
+ {
+ KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
+ }
+ } else {
+ kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
+ lck = ilk->lock;
+ if (__kmp_env_consistency_check) {
+ __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
+ }
+# if USE_ITT_BUILD
+ __kmp_itt_critical_acquiring(lck);
+# endif
+ KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_critical_acquired( lck );
+#endif /* USE_ITT_BUILD */
+
+ KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
+ KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
+} // __kmpc_critical_with_hint
+
+#endif // KMP_USE_DYNAMIC_LOCK
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param global_tid global thread number.
+@param crit identity of the critical section. This could be a pointer to a lock associated with the critical section, or
+some other suitably unique value.
+
+Leave a critical section, releasing any lock that was held during its execution.
+*/
+void
+__kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
+{
+ kmp_user_lock_p lck;
+
+ KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
+
+#if KMP_USE_DYNAMIC_LOCK
+ if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
+ lck = (kmp_user_lock_p)crit;
+ KMP_ASSERT(lck != NULL);
+ if (__kmp_env_consistency_check) {
+ __kmp_pop_sync(global_tid, ct_critical, loc);
+ }
+# if USE_ITT_BUILD
+ __kmp_itt_critical_releasing( lck );
+# endif
+# if KMP_USE_INLINED_TAS
+ if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
+ KMP_RELEASE_TAS_LOCK(lck, global_tid);
+ } else
+# elif KMP_USE_INLINED_FUTEX
+ if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
+ KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
+ } else
+# endif
+ {
+ KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
+ }
+ } else {
+ kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
+ KMP_ASSERT(ilk != NULL);
+ lck = ilk->lock;
+ if (__kmp_env_consistency_check) {
+ __kmp_pop_sync(global_tid, ct_critical, loc);
+ }
+# if USE_ITT_BUILD
+ __kmp_itt_critical_releasing( lck );
+# endif
+ KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
+ }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+ lck = (kmp_user_lock_p)crit;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
+ lck = (kmp_user_lock_p)crit;
+ }
+#endif
+ else { // ticket, queuing or drdpa
+ lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
+ }
+
+ KMP_ASSERT(lck != NULL);
+
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_sync( global_tid, ct_critical, loc );
+
+#if USE_ITT_BUILD
+ __kmp_itt_critical_releasing( lck );
+#endif /* USE_ITT_BUILD */
+ // The value of 'crit' is suitable for use as the critical_id of the critical section directive.
+ __kmp_release_user_lock_with_checks( lck, global_tid );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
+ ompt_callbacks.ompt_callback(ompt_event_release_critical)(
+ (uint64_t) lck);
+ }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+ KMP_POP_PARTITIONED_TIMER();
+ KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+@return one if the thread should execute the master block, zero otherwise
+
+Start execution of a combined barrier and master. The barrier is executed inside this function.
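+
+Expected usage is of the form:
+@code
+if (__kmpc_barrier_master( &loc, gtid )) {
+    // ... master block ...
+    __kmpc_end_barrier_master( &loc, gtid );
+}
+@endcode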
+*/
+kmp_int32
+__kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
+{
+ int status;
+
+ KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
+
+ if (! TCR_4(__kmp_init_parallel))
+ __kmp_parallel_initialize();
+
+ if ( __kmp_env_consistency_check )
+ __kmp_check_barrier( global_tid, ct_barrier, loc );
+
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+ status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
+
+ return (status != 0) ? 0 : 1;
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+
+Complete the execution of a combined barrier and master. This function should
+only be called at the completion of the <tt>master</tt> code. Other threads will
+still be waiting at the barrier and this call releases them.
+*/
+void
+__kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
+{
+ KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
+
+ __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
+}
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid thread id.
+@return one if the thread should execute the master block, zero otherwise
+
+Start execution of a combined barrier and master(nowait) construct.
+The barrier is executed inside this function.
+There is no equivalent "end" function: there is no __kmpc_end_master call for
+this construct, so the actions it would normally perform are done inside this
+function (see the consistency-check code below).
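+
+Expected usage is of the form:
+@code
+if (__kmpc_barrier_master_nowait( &loc, gtid )) {
+    // ... master block; no "end" call follows ...
+}
+@endcode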
+*/
+kmp_int32
+__kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
+{
+ kmp_int32 ret;
+
+ KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
+
+ if (! TCR_4(__kmp_init_parallel))
+ __kmp_parallel_initialize();
+
+ if ( __kmp_env_consistency_check ) {
+ if ( loc == 0 ) {
+ KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
+ }
+ __kmp_check_barrier( global_tid, ct_barrier, loc );
+ }
+
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+ __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+ ret = __kmpc_master (loc, global_tid);
+
+ if ( __kmp_env_consistency_check ) {
+ /* there's no __kmpc_end_master called; so the (stats) */
+ /* actions of __kmpc_end_master are done here */
+
+ if ( global_tid < 0 ) {
+ KMP_WARNING( ThreadIdentInvalid );
+ }
+ if (ret) {
+ /* only one thread should do the pop since only */
+ /* one did the push (see __kmpc_master()) */
+
+ __kmp_pop_sync( global_tid, ct_master, loc );
+ }
+ }
+
+ return (ret);
+}
+
+/* The BARRIER for a SINGLE process section is always explicit */
+/*!
+@ingroup WORK_SHARING
+@param loc source location information
+@param global_tid global thread number
+@return One if this thread should execute the single construct, zero otherwise.
+
+Test whether to execute a <tt>single</tt> construct.
+There are no implicit barriers in the two "single" calls; rather, the compiler
+should introduce an explicit barrier if one is required.
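+
+A sketch of the generated pattern:
+@code
+if (__kmpc_single( &loc, gtid )) {
+    // ... single block ...
+    __kmpc_end_single( &loc, gtid );
+}
+__kmpc_barrier( &loc, gtid ); // emitted by the compiler unless nowait was specified
+@endcode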
+*/
+
+kmp_int32
+__kmpc_single(ident_t *loc, kmp_int32 global_tid)
+{
+ kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
+
+ if (rc) {
+ // We are going to execute the single statement, so we should count it.
+ KMP_COUNT_BLOCK(OMP_SINGLE);
+ KMP_PUSH_PARTITIONED_TIMER(OMP_single);
+ }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+ kmp_team_t *team = this_thr -> th.th_team;
+ int tid = __kmp_tid_from_gtid( global_tid );
+
+ if (ompt_enabled) {
+ if (rc) {
+ if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
+ ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
+ team->t.ompt_team_info.parallel_id,
+ team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
+ team->t.ompt_team_info.microtask);
+ }
+ } else {
+ if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
+ ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
+ team->t.ompt_team_info.parallel_id,
+ team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+ }
+ this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
+ }
+ }
+#endif
+
+ return rc;
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc source location information
+@param global_tid global thread number
+
+Mark the end of a <tt>single</tt> construct. This function should
+only be called by the thread that executed the block of code protected
+by the `single` construct.
+*/
+void
+__kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
+{
+ __kmp_exit_single( global_tid );
+ KMP_POP_PARTITIONED_TIMER();
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ kmp_info_t *this_thr = __kmp_threads[ global_tid ];
+ kmp_team_t *team = this_thr -> th.th_team;
+ int tid = __kmp_tid_from_gtid( global_tid );
+
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
+ ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
+ team->t.ompt_team_info.parallel_id,
+ team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
+ }
+#endif
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc Source location
+@param global_tid Global thread id
+
+Mark the end of a statically scheduled loop.
+*/
+void
+__kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
+{
+ KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
+ ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
+ ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
+ ompt_callbacks.ompt_callback(ompt_event_loop_end)(
+ team_info->parallel_id, task_info->task_id);
+ }
+#endif
+
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_workshare( global_tid, ct_pdo, loc );
+}
+
+/*
+ * User routines which take C-style arguments (call by value),
+ * unlike the equivalent Fortran routines
+ */
+
+void
+ompc_set_num_threads( int arg )
+{
+// !!!!! TODO: check the per-task binding
+ __kmp_set_num_threads( arg, __kmp_entry_gtid() );
+}
+
+void
+ompc_set_dynamic( int flag )
+{
+ kmp_info_t *thread;
+
+ /* For the thread-private implementation of the internal controls */
+ thread = __kmp_entry_thread();
+
+ __kmp_save_internal_controls( thread );
+
+ set__dynamic( thread, flag ? TRUE : FALSE );
+}
+
+void
+ompc_set_nested( int flag )
+{
+ kmp_info_t *thread;
+
+ /* For the thread-private internal controls implementation */
+ thread = __kmp_entry_thread();
+
+ __kmp_save_internal_controls( thread );
+
+ set__nested( thread, flag ? TRUE : FALSE );
+}
+
+void
+ompc_set_max_active_levels( int max_active_levels )
+{
+ /* TO DO */
+ /* we want per-task implementation of this internal control */
+
+ /* For the per-thread internal controls implementation */
+ __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
+}
+
+void
+ompc_set_schedule( omp_sched_t kind, int modifier )
+{
+// !!!!! TODO: check the per-task binding
+ __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
+}
+
+int
+ompc_get_ancestor_thread_num( int level )
+{
+ return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
+}
+
+int
+ompc_get_team_size( int level )
+{
+ return __kmp_get_team_size( __kmp_entry_gtid(), level );
+}
+
+void
+kmpc_set_stacksize( int arg )
+{
+ // __kmp_aux_set_stacksize initializes the library if needed
+ __kmp_aux_set_stacksize( arg );
+}
+
+void
+kmpc_set_stacksize_s( size_t arg )
+{
+ // __kmp_aux_set_stacksize initializes the library if needed
+ __kmp_aux_set_stacksize( arg );
+}
+
+void
+kmpc_set_blocktime( int arg )
+{
+ int gtid, tid;
+ kmp_info_t *thread;
+
+ gtid = __kmp_entry_gtid();
+ tid = __kmp_tid_from_gtid(gtid);
+ thread = __kmp_thread_from_gtid(gtid);
+
+ __kmp_aux_set_blocktime( arg, thread, tid );
+}
+
+void
+kmpc_set_library( int arg )
+{
+ // __kmp_user_set_library initializes the library if needed
+ __kmp_user_set_library( (enum library_type)arg );
+}
+
+void
+kmpc_set_defaults( char const * str )
+{
+ // __kmp_aux_set_defaults initializes the library if needed
+ __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
+}
+
+void
+kmpc_set_disp_num_buffers( int arg )
+{
+ // ignore after initialization because some teams have already
+ // allocated dispatch buffers
+ if( __kmp_init_serial == 0 && arg > 0 )
+ __kmp_dispatch_num_buffers = arg;
+}
+
+int
+kmpc_set_affinity_mask_proc( int proc, void **mask )
+{
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+ return -1;
+#else
+ if ( ! TCR_4(__kmp_init_middle) ) {
+ __kmp_middle_initialize();
+ }
+ return __kmp_aux_set_affinity_mask_proc( proc, mask );
+#endif
+}
+
+int
+kmpc_unset_affinity_mask_proc( int proc, void **mask )
+{
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+ return -1;
+#else
+ if ( ! TCR_4(__kmp_init_middle) ) {
+ __kmp_middle_initialize();
+ }
+ return __kmp_aux_unset_affinity_mask_proc( proc, mask );
+#endif
+}
+
+int
+kmpc_get_affinity_mask_proc( int proc, void **mask )
+{
+#if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
+ return -1;
+#else
+ if ( ! TCR_4(__kmp_init_middle) ) {
+ __kmp_middle_initialize();
+ }
+ return __kmp_aux_get_affinity_mask_proc( proc, mask );
+#endif
+}
+
+
+/* -------------------------------------------------------------------------- */
+/*!
+@ingroup THREADPRIVATE
+@param loc source location information
+@param gtid global thread number
+@param cpy_size size of the cpy_data buffer
+@param cpy_data pointer to data to be copied
+@param cpy_func helper function to call for copying data
+@param didit flag variable: 1=single thread; 0=not single thread
+
+__kmpc_copyprivate implements the interface for the private data broadcast needed for
+the copyprivate clause associated with a single region in an OpenMP<sup>*</sup> program (both C and Fortran).
+All threads participating in the parallel region call this routine.
+One of the threads (called the single thread) should have the <tt>didit</tt> variable set to 1
+and all other threads should have that variable set to 0.
+All threads pass a pointer to a data buffer (cpy_data) that they have built.
+
+The OpenMP specification forbids the use of nowait on the single region when a copyprivate
+clause is present. However, @ref __kmpc_copyprivate implements a barrier internally to avoid
+race conditions, so the code generation for the single region should avoid generating a barrier
+after the call to @ref __kmpc_copyprivate.
+
+The <tt>gtid</tt> parameter is the global thread id for the current thread.
+The <tt>loc</tt> parameter is a pointer to source location information.
+
+Internal implementation: The single thread will first copy its descriptor address (cpy_data)
+to a team-private location, then the other threads will each call the function pointed to by
+the parameter cpy_func, which carries out the copy by copying the data using the cpy_data buffer.
+
+The cpy_func routine used for the copy and the contents of the data area defined by cpy_data
+and cpy_size may be built in any fashion that will allow the copy to be done. For instance,
+the cpy_data buffer can hold the actual data to be copied or it may hold a list of pointers
+to the data. The cpy_func routine must interpret the cpy_data buffer appropriately.
+
+The interface to cpy_func is as follows:
+@code
+void cpy_func( void *destination, void *source )
+@endcode
+where void *destination is the cpy_data pointer for the thread being copied to
+and void *source is the cpy_data pointer for the thread being copied from.
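+
+A minimal cpy_func sketch, assuming cpy_data points directly at an int:
+@code
+static void copy_int( void *dst, void *src )
+{
+    *(int *)dst = *(int *)src; // copy the single thread's value into this thread's datum
+}
+@endcode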
+*/
+void
+__kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
+{
+ void **data_ptr;
+
+ KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
+
+ KMP_MB();
+
+ data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
+
+ if ( __kmp_env_consistency_check ) {
+ if ( loc == 0 ) {
+ KMP_WARNING( ConstructIdentInvalid );
+ }
+ }
+
+ /* ToDo: Optimize the following two barriers into some kind of split barrier */
+
+ if (didit) *data_ptr = cpy_data;
+
+ /* This barrier is not a barrier region boundary */
+#if USE_ITT_NOTIFY
+ __kmp_threads[gtid]->th.th_ident = loc;
+#endif
+ __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
+
+ if (! didit) (*cpy_func)( cpy_data, *data_ptr );
+
+ /* Consider next barrier the user-visible barrier for barrier region boundaries */
+ /* Nesting checks are already handled by the single construct checks */
+
+#if USE_ITT_NOTIFY
+ __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
+#endif
+ __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
+}
+
+/* -------------------------------------------------------------------------- */
+
+#define INIT_LOCK __kmp_init_user_lock_with_checks
+#define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
+#define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
+#define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
+#define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
+#define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
+#define RELEASE_LOCK __kmp_release_user_lock_with_checks
+#define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
+#define TEST_LOCK __kmp_test_user_lock_with_checks
+#define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
+#define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
+#define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
+
+
+/*
+ * TODO: Make check abort messages use location info & pass it
+ * into with_checks routines
+ */
+
+#if KMP_USE_DYNAMIC_LOCK
+
+// internal lock initializer
+static __forceinline void
+__kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
+{
+ if (KMP_IS_D_LOCK(seq)) {
+ KMP_INIT_D_LOCK(lock, seq);
+#if USE_ITT_BUILD
+ __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
+#endif
+ } else {
+ KMP_INIT_I_LOCK(lock, seq);
+#if USE_ITT_BUILD
+ kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
+ __kmp_itt_lock_creating(ilk->lock, loc);
+#endif
+ }
+}
+
+// internal nest lock initializer
+static __forceinline void
+__kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
+{
+#if KMP_USE_TSX
+ // Don't have nested lock implementation for speculative locks
+ if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
+ seq = __kmp_user_lock_seq;
+#endif
+ switch (seq) {
+ case lockseq_tas:
+ seq = lockseq_nested_tas;
+ break;
+#if KMP_USE_FUTEX
+ case lockseq_futex:
+ seq = lockseq_nested_futex;
+ break;
+#endif
+ case lockseq_ticket:
+ seq = lockseq_nested_ticket;
+ break;
+ case lockseq_queuing:
+ seq = lockseq_nested_queuing;
+ break;
+ case lockseq_drdpa:
+ seq = lockseq_nested_drdpa;
+ break;
+ default:
+ seq = lockseq_nested_queuing;
+ }
+ KMP_INIT_I_LOCK(lock, seq);
+#if USE_ITT_BUILD
+ kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
+ __kmp_itt_lock_creating(ilk->lock, loc);
+#endif
+}
+
+/* initialize the lock with a hint */
+void
+__kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
+{
+ KMP_DEBUG_ASSERT(__kmp_init_serial);
+ if (__kmp_env_consistency_check && user_lock == NULL) {
+ KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
+ }
+
+ __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
+}
+
+/* initialize the lock with a hint */
+void
+__kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
+{
+ KMP_DEBUG_ASSERT(__kmp_init_serial);
+ if (__kmp_env_consistency_check && user_lock == NULL) {
+ KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
+ }
+
+ __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
+}
+
+#endif // KMP_USE_DYNAMIC_LOCK
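+
+// Illustrative sketch (not part of the runtime): the two entry points above back
+// the OpenMP 4.5 hinted lock API, so user code along the following lines is
+// assumed to reach __kmpc_init_lock_with_hint() through the user-facing entry
+// layer (the variable name is an assumption for this example only):
+//
+//     omp_lock_t lk;
+//     omp_init_lock_with_hint( &lk, omp_lock_hint_speculative );
+//     omp_set_lock( &lk );
+//     ...                      // critical work
+//     omp_unset_lock( &lk );
+//     omp_destroy_lock( &lk );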
+
+/* initialize the lock */
+void
+__kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+ KMP_DEBUG_ASSERT(__kmp_init_serial);
+ if (__kmp_env_consistency_check && user_lock == NULL) {
+ KMP_FATAL(LockIsUninitialized, "omp_init_lock");
+ }
+ __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ static char const * const func = "omp_init_lock";
+ kmp_user_lock_p lck;
+ KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+ if ( __kmp_env_consistency_check ) {
+ if ( user_lock == NULL ) {
+ KMP_FATAL( LockIsUninitialized, func );
+ }
+ }
+
+ KMP_CHECK_USER_LOCK_INIT();
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
+ }
+ INIT_LOCK( lck );
+ __kmp_set_user_lock_location( lck, loc );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
+ ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck);
+ }
+#endif
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_creating( lck );
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_init_lock
+
+/* initialize the lock */
+void
+__kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+ KMP_DEBUG_ASSERT(__kmp_init_serial);
+ if (__kmp_env_consistency_check && user_lock == NULL) {
+ KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
+ }
+ __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ static char const * const func = "omp_init_nest_lock";
+ kmp_user_lock_p lck;
+ KMP_DEBUG_ASSERT( __kmp_init_serial );
+
+ if ( __kmp_env_consistency_check ) {
+ if ( user_lock == NULL ) {
+ KMP_FATAL( LockIsUninitialized, func );
+ }
+ }
+
+ KMP_CHECK_USER_LOCK_INIT();
+
+ if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+ + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+ <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
+ }
+
+ INIT_NESTED_LOCK( lck );
+ __kmp_set_user_lock_location( lck, loc );
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
+ ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck);
+ }
+#endif
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_creating( lck );
+#endif /* USE_ITT_BUILD */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_init_nest_lock
+
+void
+__kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+ kmp_user_lock_p lck;
+ if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
+ lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
+ } else {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+ __kmp_itt_lock_destroyed(lck);
+# endif
+ KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
+#else
+ kmp_user_lock_p lck;
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
+ }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
+ ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck);
+ }
+#endif
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_destroyed( lck );
+#endif /* USE_ITT_BUILD */
+ DESTROY_LOCK( lck );
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ ;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ ;
+ }
+#endif
+ else {
+ __kmp_user_lock_free( user_lock, gtid, lck );
+ }
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_destroy_lock
+
+/* destroy the lock */
+void
+__kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+ kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
+ __kmp_itt_lock_destroyed(ilk->lock);
+# endif
+ KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ kmp_user_lock_p lck;
+
+ if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+ + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+ <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
+ }
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
+ ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck);
+ }
+#endif
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_destroyed( lck );
+#endif /* USE_ITT_BUILD */
+
+ DESTROY_NESTED_LOCK( lck );
+
+ if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+ + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+ ;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+ <= OMP_NEST_LOCK_T_SIZE ) ) {
+ ;
+ }
+#endif
+ else {
+ __kmp_user_lock_free( user_lock, gtid, lck );
+ }
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmpc_destroy_nest_lock
+
+void
+__kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+ KMP_COUNT_BLOCK(OMP_set_lock);
+#if KMP_USE_DYNAMIC_LOCK
+ int tag = KMP_EXTRACT_D_TAG(user_lock);
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
+# endif
+# if KMP_USE_INLINED_TAS
+ if (tag == locktag_tas && !__kmp_env_consistency_check) {
+ KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
+ } else
+# elif KMP_USE_INLINED_FUTEX
+ if (tag == locktag_futex && !__kmp_env_consistency_check) {
+ KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
+ } else
+# endif
+ {
+ __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+ }
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+# endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ kmp_user_lock_p lck;
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+ ACQUIRE_LOCK( lck, gtid );
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_acquired( lck );
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
+ ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck);
+ }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+void
+__kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+# endif
+ KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+# endif
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled) {
+ // missing support here: need to know whether acquired first or not
+ }
+#endif
+
+#else // KMP_USE_DYNAMIC_LOCK
+ int acquire_status;
+ kmp_user_lock_p lck;
+
+ if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+ + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+ <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+ ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status );
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_acquired( lck );
+#endif /* USE_ITT_BUILD */
+
+#if OMPT_SUPPORT && OMPT_TRACE
+ if (ompt_enabled) {
+ if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
+ if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
+ ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck);
+ } else {
+ if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
+ ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck);
+ }
+ }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+void
+__kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+#if KMP_USE_DYNAMIC_LOCK
+
+ int tag = KMP_EXTRACT_D_TAG(user_lock);
+# if USE_ITT_BUILD
+ __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+# endif
+# if KMP_USE_INLINED_TAS
+ if (tag == locktag_tas && !__kmp_env_consistency_check) {
+ KMP_RELEASE_TAS_LOCK(user_lock, gtid);
+ } else
+# elif KMP_USE_INLINED_FUTEX
+ if (tag == locktag_futex && !__kmp_env_consistency_check) {
+ KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
+ } else
+# endif
+ {
+ __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+ }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ kmp_user_lock_p lck;
+
+ /* Can't use serial interval since not block structured */
+ /* release the lock */
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+ // "fast" path implemented to fix customer performance issue
+#if USE_ITT_BUILD
+ __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
+#endif /* USE_ITT_BUILD */
+ TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
+ KMP_MB();
+ return;
+#else
+ lck = (kmp_user_lock_p)user_lock;
+#endif
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_releasing( lck );
+#endif /* USE_ITT_BUILD */
+
+ RELEASE_LOCK( lck, gtid );
+
+#if OMPT_SUPPORT && OMPT_BLAME
+ if (ompt_enabled &&
+ ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
+ ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
+ }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+/* release the lock */
+void
+__kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+#if KMP_USE_DYNAMIC_LOCK
+
+# if USE_ITT_BUILD
+ __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
+# endif
+ KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ kmp_user_lock_p lck;
+
+ /* Can't use serial interval since not block structured */
+
+ if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+ + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
+ // "fast" path implemented to fix customer performance issue
+ kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
+#if USE_ITT_BUILD
+ __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
+#endif /* USE_ITT_BUILD */
+ if ( --(tl->lk.depth_locked) == 0 ) {
+ TCW_4(tl->lk.poll, 0);
+ }
+ KMP_MB();
+ return;
+#else
+ lck = (kmp_user_lock_p)user_lock;
+#endif
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+ <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_releasing( lck );
+#endif /* USE_ITT_BUILD */
+
+ int release_status;
+ release_status = RELEASE_NESTED_LOCK( lck, gtid );
+#if OMPT_SUPPORT && OMPT_BLAME
+ if (ompt_enabled) {
+ if (release_status == KMP_LOCK_RELEASED) {
+ if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
+ ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
+ (uint64_t) lck);
+ }
+ } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
+ ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
+ (uint64_t) lck);
+ }
+ }
+#endif
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+/* try to acquire the lock */
+int
+__kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+ KMP_COUNT_BLOCK(OMP_test_lock);
+
+#if KMP_USE_DYNAMIC_LOCK
+ int rc;
+ int tag = KMP_EXTRACT_D_TAG(user_lock);
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+# endif
+# if KMP_USE_INLINED_TAS
+ if (tag == locktag_tas && !__kmp_env_consistency_check) {
+ KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
+ } else
+# elif KMP_USE_INLINED_FUTEX
+ if (tag == locktag_futex && !__kmp_env_consistency_check) {
+ KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
+ } else
+# endif
+ {
+ rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
+ }
+ if (rc) {
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+# endif
+ return FTN_TRUE;
+ } else {
+# if USE_ITT_BUILD
+ __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
+# endif
+ return FTN_FALSE;
+ }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ kmp_user_lock_p lck;
+ int rc;
+
+ if ( ( __kmp_user_lock_kind == lk_tas )
+ && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+ rc = TEST_LOCK( lck, gtid );
+#if USE_ITT_BUILD
+ if ( rc ) {
+ __kmp_itt_lock_acquired( lck );
+ } else {
+ __kmp_itt_lock_cancelled( lck );
+ }
+#endif /* USE_ITT_BUILD */
+ return ( rc ? FTN_TRUE : FTN_FALSE );
+
+ /* Can't use serial interval since not block structured */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+/* try to acquire the lock */
+int
+__kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
+{
+#if KMP_USE_DYNAMIC_LOCK
+ int rc;
+# if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
+# endif
+ rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
+# if USE_ITT_BUILD
+ if (rc) {
+ __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
+ } else {
+ __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
+ }
+# endif
+ return rc;
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+ kmp_user_lock_p lck;
+ int rc;
+
+ if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
+ + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#if KMP_USE_FUTEX
+ else if ( ( __kmp_user_lock_kind == lk_futex )
+ && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
+ <= OMP_NEST_LOCK_T_SIZE ) ) {
+ lck = (kmp_user_lock_p)user_lock;
+ }
+#endif
+ else {
+ lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
+ }
+
+#if USE_ITT_BUILD
+ __kmp_itt_lock_acquiring( lck );
+#endif /* USE_ITT_BUILD */
+
+ rc = TEST_NESTED_LOCK( lck, gtid );
+#if USE_ITT_BUILD
+ if ( rc ) {
+ __kmp_itt_lock_acquired( lck );
+ } else {
+ __kmp_itt_lock_cancelled( lck );
+ }
+#endif /* USE_ITT_BUILD */
+ return rc;
+
+ /* Can't use serial interval since not block structured */
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
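+
+// Illustrative sketch (not part of the runtime): the __kmpc_*_lock entry points
+// above implement the user-visible OpenMP lock API, so a typical translation of
+// user code is assumed to look roughly like this:
+//
+//     omp_lock_t lk;
+//     omp_init_lock( &lk );          // -> __kmpc_init_lock( loc, gtid, &lk )
+//     if ( omp_test_lock( &lk ) ) {  // -> __kmpc_test_lock( loc, gtid, &lk )
+//         ...                        // critical work
+//         omp_unset_lock( &lk );     // -> __kmpc_unset_lock( loc, gtid, &lk )
+//     }
+//     omp_destroy_lock( &lk );       // -> __kmpc_destroy_lock( loc, gtid, &lk )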
+
+
+/*--------------------------------------------------------------------------------------------------------------------*/
+
+/*
+ * Interface to fast scalable reduce methods routines
+ */
+
+// Keep the selected method in a thread-local structure for cross-function use:
+// it will be read again by the __kmpc_end_reduce* functions. An alternative would be
+// to re-determine the method one more time in the __kmpc_end_reduce* functions
+// (a new prototype would be required then).
+// AT: which solution is better?
+#define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
+ ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
+
+#define __KMP_GET_REDUCTION_METHOD(gtid) \
+ ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
+
+// For a description of the packed_reduction_method variable, see the macros in kmp.h.
+
+
+// used in a critical section reduce block
+static __forceinline void
+__kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
+
+    // This lock was visible to a customer and to the threading profile tool as a
+    // serial overhead span (although it is used for an internal purpose only).
+    // Why was it visible in the previous implementation?
+    // Should we keep it visible in the new reduce block?
+ kmp_user_lock_p lck;
+
+#if KMP_USE_DYNAMIC_LOCK
+
+ kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
+ // Check if it is initialized.
+ if (*lk == 0) {
+ if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
+ KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq));
+ } else {
+ __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq));
+ }
+ }
+ // Branch for accessing the actual lock object and set operation. This branching is inevitable since
+ // this lock initialization does not follow the normal dispatch path (lock table is not used).
+ if (KMP_EXTRACT_D_TAG(lk) != 0) {
+ lck = (kmp_user_lock_p)lk;
+ KMP_DEBUG_ASSERT(lck != NULL);
+ if (__kmp_env_consistency_check) {
+ __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
+ }
+ KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
+ } else {
+ kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
+ lck = ilk->lock;
+ KMP_DEBUG_ASSERT(lck != NULL);
+ if (__kmp_env_consistency_check) {
+ __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
+ }
+ KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
+ }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    // We know that the fast reduction code is only emitted by Intel compilers
+    // with 32-byte critical sections. If there isn't enough space, then we
+    // have to use a pointer.
+ if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
+ lck = (kmp_user_lock_p)crit;
+ }
+ else {
+ lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
+ }
+ KMP_DEBUG_ASSERT( lck != NULL );
+
+ if ( __kmp_env_consistency_check )
+ __kmp_push_sync( global_tid, ct_critical, loc, lck );
+
+ __kmp_acquire_user_lock_with_checks( lck, global_tid );
+
+#endif // KMP_USE_DYNAMIC_LOCK
+}
+
+// used in a critical section reduce block
+static __forceinline void
+__kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
+
+ kmp_user_lock_p lck;
+
+#if KMP_USE_DYNAMIC_LOCK
+
+ if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
+ lck = (kmp_user_lock_p)crit;
+ if (__kmp_env_consistency_check)
+ __kmp_pop_sync(global_tid, ct_critical, loc);
+ KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
+ } else {
+ kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
+ if (__kmp_env_consistency_check)
+ __kmp_pop_sync(global_tid, ct_critical, loc);
+ KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
+ }
+
+#else // KMP_USE_DYNAMIC_LOCK
+
+    // We know that the fast reduction code is only emitted by Intel compilers with 32-byte critical
+    // sections. If there isn't enough space, then we have to use a pointer.
+ if ( __kmp_base_user_lock_size > 32 ) {
+ lck = *( (kmp_user_lock_p *) crit );
+ KMP_ASSERT( lck != NULL );
+ } else {
+ lck = (kmp_user_lock_p) crit;
+ }
+
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_sync( global_tid, ct_critical, loc );
+
+ __kmp_release_user_lock_with_checks( lck, global_tid );
+
+#endif // KMP_USE_DYNAMIC_LOCK
+} // __kmp_end_critical_section_reduce_block
+
+
+/* 2.a.i. Reduce Block without a terminating barrier */
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number
+@param num_vars number of items (variables) to be reduced
+@param reduce_size size of data in bytes to be reduced
+@param reduce_data pointer to data to be reduced
+@param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data
+@param lck pointer to the unique lock data structure
+@result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction is needed
+
+The nowait version is used for a reduce clause with the nowait argument.
+*/
+kmp_int32
+__kmpc_reduce_nowait(
+ ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
+ kmp_critical_name *lck ) {
+
+ KMP_COUNT_BLOCK(REDUCE_nowait);
+ int retval = 0;
+ PACKED_REDUCTION_METHOD_T packed_reduction_method;
+#if OMP_40_ENABLED
+ kmp_team_t *team;
+ kmp_info_t *th;
+ int teams_swapped = 0, task_state;
+#endif
+ KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
+
+    // Why do we need this initialization here at all?
+    // A reduction clause cannot be used as a stand-alone directive.
+
+    // Do not call __kmp_serial_initialize(); it will be called by __kmp_parallel_initialize() if needed.
+    // Possible detection of a false-positive race by the thread checker???
+ if( ! TCR_4( __kmp_init_parallel ) )
+ __kmp_parallel_initialize();
+
+ // check correctness of reduce block nesting
+#if KMP_USE_DYNAMIC_LOCK
+ if ( __kmp_env_consistency_check )
+ __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
+#else
+ if ( __kmp_env_consistency_check )
+ __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
+#endif
+
+#if OMP_40_ENABLED
+ th = __kmp_thread_from_gtid(global_tid);
+ if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
+ team = th->th.th_team;
+ if( team->t.t_level == th->th.th_teams_level ) {
+ // this is reduction at teams construct
+ KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
+ // Let's swap teams temporarily for the reduction barrier
+ teams_swapped = 1;
+ th->th.th_info.ds.ds_tid = team->t.t_master_tid;
+ th->th.th_team = team->t.t_parent;
+ th->th.th_team_nproc = th->th.th_team->t.t_nproc;
+ th->th.th_task_team = th->th.th_team->t.t_task_team[0];
+ task_state = th->th.th_task_state;
+ th->th.th_task_state = 0;
+ }
+ }
+#endif // OMP_40_ENABLED
+
+    // The packed_reduction_method value will be reused by the __kmp_end_reduce* function,
+    // so the value should be kept in a variable.
+    // The variable should be either a construct-specific or a thread-specific property,
+    // not a team-specific property (a thread can reach the next reduce block on the next
+    // construct, and the reduce method may differ on the next construct).
+    // An ident_t "loc" parameter could be used as a construct-specific property
+    // (but what if loc == 0?).
+    // (If both construct-specific and team-specific variables were shared,
+    // unnecessary extra syncs would be needed.)
+    // A thread-specific variable is better with regard to the two issues above
+    // (next construct and extra syncs); a thread-specific "th_local.reduction_method"
+    // variable is used currently.
+    // Each thread executes the 'determine' and 'set' lines (there is no need to have
+    // only one thread execute them, which avoids unnecessary extra syncs).
+
+ packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
+ __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
+
+ if( packed_reduction_method == critical_reduce_block ) {
+
+ __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
+ retval = 1;
+
+ } else if( packed_reduction_method == empty_reduce_block ) {
+
+ // usage: if team size == 1, no synchronization is required ( Intel platforms only )
+ retval = 1;
+
+ } else if( packed_reduction_method == atomic_reduce_block ) {
+
+ retval = 2;
+
+        // All threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen).
+        // (This is not quite good: the checking block has been closed by this 'pop', but the
+        // atomic operation has not been executed yet; it will be executed slightly later, literally on the next instruction.)
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_sync( global_tid, ct_reduce, loc );
+
+ } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+        // AT: performance issue: a real barrier here.
+        // AT: (If the master goes slowly, other threads are blocked here waiting for the
+        //      master to come and release them.)
+        // AT: (This is not what a customer might expect when specifying the NOWAIT clause.)
+        // AT: (Specifying NOWAIT won't result in improved performance; it will be confusing to a customer.)
+        // AT: Another implementation of *barrier_gather*nowait() (or some other design)
+        //     might go faster and be more in line with the sense of NOWAIT.
+        // AT: TODO: run the EPCC test and compare times.
+
+        // This barrier should be invisible to a customer and to the threading profile tool
+        // (it is neither a terminating barrier nor customer code; it is used for an internal purpose).
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+ retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
+ retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
+
+        // All workers except the master should do this pop here
+        // (none of the other workers will get to __kmpc_end_reduce_nowait()).
+ if ( __kmp_env_consistency_check ) {
+ if( retval == 0 ) {
+ __kmp_pop_sync( global_tid, ct_reduce, loc );
+ }
+ }
+
+ } else {
+
+ // should never reach this block
+ KMP_ASSERT( 0 ); // "unexpected method"
+
+ }
+#if OMP_40_ENABLED
+ if( teams_swapped ) {
+ // Restore thread structure
+ th->th.th_info.ds.ds_tid = 0;
+ th->th.th_team = team;
+ th->th.th_team_nproc = team->t.t_nproc;
+ th->th.th_task_team = team->t.t_task_team[task_state];
+ th->th.th_task_state = task_state;
+ }
+#endif
+ KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
+
+ return retval;
+}
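+
+// Illustrative sketch (not part of the runtime): given the return values
+// documented above, the code a compiler is assumed to emit for
+//     #pragma omp for reduction(+:sum) nowait
+// with an int reduction variable looks roughly like this (the local names and
+// the reduce callback __sum_reduce_func are assumptions for this example only):
+//
+//     switch ( __kmpc_reduce_nowait( loc, gtid, 1, sizeof( sum ), &sum_priv,
+//                                    __sum_reduce_func, &crit ) ) {
+//     case 1:  // this thread combines the values and must close the region
+//         sum += sum_priv;
+//         __kmpc_end_reduce_nowait( loc, gtid, &crit );
+//         break;
+//     case 2:  // atomic method; no __kmpc_end_reduce_nowait() call is emitted
+//         __kmpc_atomic_fixed4_add( loc, gtid, &sum, sum_priv );
+//         break;
+//     case 0:  // another thread did (or will do) the combining
+//         break;
+//     }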
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread id.
+@param lck pointer to the unique lock data structure
+
+Finish the execution of a reduce nowait.
+*/
+void
+__kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
+
+ PACKED_REDUCTION_METHOD_T packed_reduction_method;
+
+ KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
+
+ packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
+
+ if( packed_reduction_method == critical_reduce_block ) {
+
+ __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
+
+ } else if( packed_reduction_method == empty_reduce_block ) {
+
+ // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
+
+ } else if( packed_reduction_method == atomic_reduce_block ) {
+
+        // Neither the master nor the other workers should get here
+        // (the code gen does not generate this call in case 2: atomic reduce block).
+        // Actually it would be better to remove this else-if entirely;
+        // after removal this value would be checked by the 'else' and would assert.
+
+ } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+ // only master gets here
+
+ } else {
+
+ // should never reach this block
+ KMP_ASSERT( 0 ); // "unexpected method"
+
+ }
+
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_sync( global_tid, ct_reduce, loc );
+
+ KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
+
+ return;
+}
+
+/* 2.a.ii. Reduce Block with a terminating barrier */
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread number
+@param num_vars number of items (variables) to be reduced
+@param reduce_size size of data in bytes to be reduced
+@param reduce_data pointer to data to be reduced
+@param reduce_func callback function providing reduction operation on two operands and returning result of reduction in lhs_data
+@param lck pointer to the unique lock data structure
+@result 1 for the master thread, 0 for all other team threads, 2 for all team threads if atomic reduction is needed
+
+A blocking reduce that includes an implicit barrier.
+*/
+kmp_int32
+__kmpc_reduce(
+ ident_t *loc, kmp_int32 global_tid,
+ kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
+ void (*reduce_func)(void *lhs_data, void *rhs_data),
+ kmp_critical_name *lck )
+{
+ KMP_COUNT_BLOCK(REDUCE_wait);
+ int retval = 0;
+ PACKED_REDUCTION_METHOD_T packed_reduction_method;
+
+ KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
+
+    // Why do we need this initialization here at all?
+    // A reduction clause cannot be a stand-alone directive.
+
+    // Do not call __kmp_serial_initialize(); it will be called by __kmp_parallel_initialize() if needed.
+    // Possible detection of a false-positive race by the thread checker???
+ if( ! TCR_4( __kmp_init_parallel ) )
+ __kmp_parallel_initialize();
+
+ // check correctness of reduce block nesting
+#if KMP_USE_DYNAMIC_LOCK
+ if ( __kmp_env_consistency_check )
+ __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
+#else
+ if ( __kmp_env_consistency_check )
+ __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
+#endif
+
+ packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
+ __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
+
+ if( packed_reduction_method == critical_reduce_block ) {
+
+ __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
+ retval = 1;
+
+ } else if( packed_reduction_method == empty_reduce_block ) {
+
+ // usage: if team size == 1, no synchronization is required ( Intel platforms only )
+ retval = 1;
+
+ } else if( packed_reduction_method == atomic_reduce_block ) {
+
+ retval = 2;
+
+ } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+ //case tree_reduce_block:
+ // this barrier should be visible to a customer and to the threading profile tool
+ // (it's a terminating barrier on constructs if NOWAIT not specified)
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
+#endif
+ retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
+ retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
+
+        // All workers except the master should do this pop here
+        // (none of the workers other than the master will enter __kmpc_end_reduce()).
+ if ( __kmp_env_consistency_check ) {
+ if( retval == 0 ) { // 0: all other workers; 1: master
+ __kmp_pop_sync( global_tid, ct_reduce, loc );
+ }
+ }
+
+ } else {
+
+ // should never reach this block
+ KMP_ASSERT( 0 ); // "unexpected method"
+
+ }
+
+ KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
+
+ return retval;
+}
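+
+// Illustrative sketch (not part of the runtime): the blocking form is assumed to
+// be used like __kmpc_reduce_nowait() above, except that the matching
+// __kmpc_end_reduce() provides the terminating barrier, e.g.:
+//
+//     switch ( __kmpc_reduce( loc, gtid, 1, sizeof( sum ), &sum_priv,
+//                             __sum_reduce_func, &crit ) ) {
+//     case 1:
+//         sum += sum_priv;
+//         __kmpc_end_reduce( loc, gtid, &crit );  // releases the other threads
+//         break;
+//     case 2:
+//         __kmpc_atomic_fixed4_add( loc, gtid, &sum, sum_priv );
+//         __kmpc_end_reduce( loc, gtid, &crit );  // barrier for the atomic method
+//         break;
+//     case 0:
+//         break;
+//     }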
+
+/*!
+@ingroup SYNCHRONIZATION
+@param loc source location information
+@param global_tid global thread id.
+@param lck pointer to the unique lock data structure
+
+Finish the execution of a blocking reduce.
+The <tt>lck</tt> pointer must be the same as that used in the corresponding start function.
+*/
+void
+__kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
+
+ PACKED_REDUCTION_METHOD_T packed_reduction_method;
+
+ KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
+
+ packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
+
+ // this barrier should be visible to a customer and to the threading profile tool
+ // (it's a terminating barrier on constructs if NOWAIT not specified)
+
+ if( packed_reduction_method == critical_reduce_block ) {
+
+ __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
+
+ // TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+ __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+ } else if( packed_reduction_method == empty_reduce_block ) {
+
+ // usage: if team size == 1, no synchronization is required ( Intel platforms only )
+
+ // TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+ __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+ } else if( packed_reduction_method == atomic_reduce_block ) {
+
+ // TODO: implicit barrier: should be exposed
+#if USE_ITT_NOTIFY
+ __kmp_threads[global_tid]->th.th_ident = loc;
+#endif
+ __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
+
+ } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
+
+ // only master executes here (master releases all other workers)
+ __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
+
+ } else {
+
+ // should never reach this block
+ KMP_ASSERT( 0 ); // "unexpected method"
+
+ }
+
+ if ( __kmp_env_consistency_check )
+ __kmp_pop_sync( global_tid, ct_reduce, loc );
+
+ KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
+
+ return;
+}
+
+#undef __KMP_GET_REDUCTION_METHOD
+#undef __KMP_SET_REDUCTION_METHOD
+
+/*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
+
+kmp_uint64
+__kmpc_get_taskid() {
+
+ kmp_int32 gtid;
+ kmp_info_t * thread;
+
+ gtid = __kmp_get_gtid();
+ if ( gtid < 0 ) {
+ return 0;
+ }; // if
+ thread = __kmp_thread_from_gtid( gtid );
+ return thread->th.th_current_task->td_task_id;
+
+} // __kmpc_get_taskid
+
+
+kmp_uint64
+__kmpc_get_parent_taskid() {
+
+ kmp_int32 gtid;
+ kmp_info_t * thread;
+ kmp_taskdata_t * parent_task;
+
+ gtid = __kmp_get_gtid();
+ if ( gtid < 0 ) {
+ return 0;
+ }; // if
+ thread = __kmp_thread_from_gtid( gtid );
+ parent_task = thread->th.th_current_task->td_parent;
+ return ( parent_task == NULL ? 0 : parent_task->td_task_id );
+
+} // __kmpc_get_parent_taskid
+
+void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
+{
+ if ( ! __kmp_init_serial ) {
+ __kmp_serial_initialize();
+ }
+ __kmp_place_num_sockets = nS;
+ __kmp_place_socket_offset = sO;
+ __kmp_place_num_cores = nC;
+ __kmp_place_core_offset = cO;
+ __kmp_place_num_threads_per_core = nT;
+}
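+
+// Illustrative sketch (not part of the runtime): this entry point backs the
+// KMP_PLACE_THREADS environment variable. Assuming the "<n>s,<n>c,<n>t"
+// syntax (sockets, cores per socket, threads per core), a setting such as
+//     KMP_PLACE_THREADS=2s,4c,2t
+// would be equivalent to
+//     __kmpc_place_threads( 2 /*nS*/, 0 /*sO*/, 4 /*nC*/, 0 /*cO*/, 2 /*nT*/ );
+// i.e. 2 sockets, 4 cores per socket, 2 threads per core, with no offsets.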
+
+#if OMP_45_ENABLED
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param gtid global thread number.
+@param num_dims number of associated doacross loops.
+@param dims info on loops bounds.
+
+Initialize doacross loop information.
+Expect the compiler to send us inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+*/
+void
+__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
+{
+ int j, idx;
+ kmp_int64 last, trace_count;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_uint32 *flags;
+ kmp_disp_t *pr_buf = th->th.th_dispatch;
+ dispatch_shared_info_t *sh_buf;
+
+ KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+ gtid, num_dims, !team->t.t_serialized));
+ KMP_DEBUG_ASSERT(dims != NULL);
+ KMP_DEBUG_ASSERT(num_dims > 0);
+
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+ KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+ idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
+ sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
+
+ // Save bounds info into allocated private buffer
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+ pr_buf->th_doacross_info =
+ (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
+    // Also save the address of num_done in order to access it later without knowing the buffer index
+ pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+ pr_buf->th_doacross_info[2] = dims[0].lo;
+ pr_buf->th_doacross_info[3] = dims[0].up;
+ pr_buf->th_doacross_info[4] = dims[0].st;
+ last = 5;
+ for( j = 1; j < num_dims; ++j ) {
+ kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
+ if( dims[j].st == 1 ) { // most common case
+            // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
+ range_length = dims[j].up - dims[j].lo + 1;
+ } else {
+ if( dims[j].st > 0 ) {
+ KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
+ range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+ } else { // negative increment
+ KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
+ range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+ }
+ }
+ pr_buf->th_doacross_info[last++] = range_length;
+ pr_buf->th_doacross_info[last++] = dims[j].lo;
+ pr_buf->th_doacross_info[last++] = dims[j].up;
+ pr_buf->th_doacross_info[last++] = dims[j].st;
+ }
+
+ // Compute total trip count.
+ // Start with range of dims[0] which we don't need to keep in the buffer.
+ if( dims[0].st == 1 ) { // most common case
+ trace_count = dims[0].up - dims[0].lo + 1;
+ } else if( dims[0].st > 0 ) {
+ KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
+ trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+ } else { // negative increment
+ KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
+ trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+ }
+ for( j = 1; j < num_dims; ++j ) {
+ trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+ }
+ KMP_DEBUG_ASSERT(trace_count > 0);
+
+    // Check that the shared buffer is not occupied by another loop (idx - __kmp_dispatch_num_buffers)
+ if( idx != sh_buf->doacross_buf_idx ) {
+ // Shared buffer is occupied, wait for it to be free
+ __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
+ }
+ // Check if we are the first thread. After the CAS the first thread gets 0,
+ // others get 1 if initialization is in progress, allocated pointer otherwise.
+ flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
+ (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
+ if( flags == NULL ) {
+ // we are the first thread, allocate the array of flags
+ kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+ sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
+ } else if( (kmp_int64)flags == 1 ) {
+ // initialization is still in progress, need to wait
+ while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
+ KMP_YIELD(TRUE);
+ }
+ }
+ KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
+    pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save a private copy in order not to
+                                                        // touch the shared buffer on each iteration
+ KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+void
+__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
+{
+ kmp_int32 shft, num_dims, i;
+ kmp_uint32 flag;
+ kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf;
+ kmp_int64 lo, up, st;
+
+ KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+
+ // calculate sequential iteration number and check out-of-bounds condition
+ pr_buf = th->th.th_dispatch;
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ num_dims = pr_buf->th_doacross_info[0];
+ lo = pr_buf->th_doacross_info[2];
+ up = pr_buf->th_doacross_info[3];
+ st = pr_buf->th_doacross_info[4];
+ if( st == 1 ) { // most common case
+ if( vec[0] < lo || vec[0] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = vec[0] - lo;
+ } else if( st > 0 ) {
+ if( vec[0] < lo || vec[0] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = (kmp_uint64)(vec[0] - lo) / st;
+ } else { // negative increment
+ if( vec[0] > lo || vec[0] < up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+ }
+ for( i = 1; i < num_dims; ++i ) {
+ kmp_int64 iter, ln;
+ kmp_int32 j = i * 4;
+ ln = pr_buf->th_doacross_info[j + 1];
+ lo = pr_buf->th_doacross_info[j + 2];
+ up = pr_buf->th_doacross_info[j + 3];
+ st = pr_buf->th_doacross_info[j + 4];
+ if( st == 1 ) {
+ if( vec[i] < lo || vec[i] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = vec[i] - lo;
+ } else if( st > 0 ) {
+ if( vec[i] < lo || vec[i] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = (kmp_uint64)(vec[i] - lo) / st;
+ } else { // st < 0
+ if( vec[i] > lo || vec[i] < up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = (kmp_uint64)(lo - vec[i]) / (-st);
+ }
+ iter_number = iter + ln * iter_number;
+ }
+ shft = iter_number % 32; // use 32-bit granularity
+ iter_number >>= 5; // divided by 32
+ flag = 1 << shft;
+ while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
+ KMP_YIELD(TRUE);
+ }
+ KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+ gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
+{
+ kmp_int32 shft, num_dims, i;
+ kmp_uint32 flag;
+ kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf;
+ kmp_int64 lo, st;
+
+ KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+
+ // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
+ pr_buf = th->th.th_dispatch;
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ num_dims = pr_buf->th_doacross_info[0];
+ lo = pr_buf->th_doacross_info[2];
+ st = pr_buf->th_doacross_info[4];
+ if( st == 1 ) { // most common case
+ iter_number = vec[0] - lo;
+ } else if( st > 0 ) {
+ iter_number = (kmp_uint64)(vec[0] - lo) / st;
+ } else { // negative increment
+ iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+ }
+ for( i = 1; i < num_dims; ++i ) {
+ kmp_int64 iter, ln;
+ kmp_int32 j = i * 4;
+ ln = pr_buf->th_doacross_info[j + 1];
+ lo = pr_buf->th_doacross_info[j + 2];
+ st = pr_buf->th_doacross_info[j + 4];
+ if( st == 1 ) {
+ iter = vec[i] - lo;
+ } else if( st > 0 ) {
+ iter = (kmp_uint64)(vec[i] - lo) / st;
+ } else { // st < 0
+ iter = (kmp_uint64)(lo - vec[i]) / (-st);
+ }
+ iter_number = iter + ln * iter_number;
+ }
+ shft = iter_number % 32; // use 32-bit granularity
+ iter_number >>= 5; // divided by 32
+ flag = 1 << shft;
+ if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
+ KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
+ KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
+ gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_fini(ident_t *loc, int gtid)
+{
+ kmp_int64 num_done;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+ KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+ return; // nothing to do
+ }
+ num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
+ if( num_done == th->th.th_team_nproc ) {
+ // we are the last thread, need to free shared resources
+ int idx = pr_buf->th_doacross_buf_idx - 1;
+ dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
+ KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
+ KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+ __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
+ sh_buf->doacross_flags = NULL;
+ sh_buf->doacross_num_done = 0;
+ sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use
+ }
+ // free private resources (need to keep buffer index forever)
+ __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
+ pr_buf->th_doacross_info = NULL;
+ KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
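+
+// Illustrative sketch (not part of the runtime): the four entry points above
+// together implement OpenMP 4.5 doacross loops. For a loop such as
+//     #pragma omp for ordered(1)
+//     for ( i = 2; i < 9; i += 2 ) {
+//         #pragma omp ordered depend(sink: i - 2)
+//         ...
+//         #pragma omp ordered depend(source)
+//     }
+// each thread is assumed to make roughly the following calls (the local names
+// are for this example only):
+//
+//     struct kmp_dim dim = { /*lo*/ 2, /*up*/ 8, /*st*/ 2 };
+//     __kmpc_doacross_init( loc, gtid, 1, &dim );
+//     // for each iteration i assigned to this thread:
+//         long long vec = i - 2;
+//         __kmpc_doacross_wait( loc, gtid, &vec );  // wait for the sink dependence
+//         ...                                       // loop body
+//         vec = i;
+//         __kmpc_doacross_post( loc, gtid, &vec );  // mark this iteration done
+//     __kmpc_doacross_fini( loc, gtid );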
+#endif
+
+// end of file //
+