Diffstat (limited to 'openmp/runtime/src/kmp.h')
| Mode | File | Lines changed |
| --- | --- | --- |
| -rw-r--r-- | openmp/runtime/src/kmp.h | 640 |
1 file changed, 294 insertions(+), 346 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index f5dd10f8baa..6daf9735601 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1,8 +1,8 @@ /*! \file */ /* * kmp.h -- KPTS runtime header file. - * $Revision: 42816 $ - * $Date: 2013-11-11 15:33:37 -0600 (Mon, 11 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -28,8 +28,6 @@ /* Defines for OpenMP 3.0 tasking and auto scheduling */ -#if OMP_30_ENABLED - # ifndef KMP_STATIC_STEAL_ENABLED # define KMP_STATIC_STEAL_ENABLED 1 # endif @@ -56,8 +54,6 @@ #define TASK_EXPLICIT 1 #define TASK_IMPLICIT 0 -#endif // OMP_30_ENABLED - #define KMP_CANCEL_THREADS #define KMP_THREAD_ATTR @@ -79,6 +75,10 @@ #include "kmp_os.h" +#if KMP_STATS_ENABLED +class kmp_stats_list; +#endif + #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #include <xmmintrin.h> #endif @@ -125,6 +125,24 @@ #define USE_FAST_MEMORY 3 #endif +#ifndef KMP_NESTED_HOT_TEAMS +# define KMP_NESTED_HOT_TEAMS 0 +# define USE_NESTED_HOT_ARG(x) +#else +# if KMP_NESTED_HOT_TEAMS +# if OMP_40_ENABLED +# define USE_NESTED_HOT_ARG(x) ,x +# else +// Nested hot teams feature depends on omp 4.0, disable it for earlier versions +# undef KMP_NESTED_HOT_TEAMS +# define KMP_NESTED_HOT_TEAMS 0 +# define USE_NESTED_HOT_ARG(x) +# endif +# else +# define USE_NESTED_HOT_ARG(x) +# endif +#endif + // Assume using BGET compare_exchange instruction instead of lock by default. #ifndef USE_CMP_XCHG_FOR_BGET #define USE_CMP_XCHG_FOR_BGET 1 @@ -459,15 +477,6 @@ typedef int PACKED_REDUCTION_METHOD_T; /* * Only Linux* OS and Windows* OS support thread affinity. */ -#if (KMP_OS_LINUX || KMP_OS_WINDOWS) && !KMP_OS_CNK && !KMP_ARCH_PPC64 -# define KMP_AFFINITY_SUPPORTED 1 -#elif KMP_OS_DARWIN || KMP_OS_FREEBSD || KMP_OS_CNK || KMP_ARCH_PPC64 -// affinity not supported -# define KMP_AFFINITY_SUPPORTED 0 -#else -# error "Unknown or unsupported OS" -#endif - #if KMP_AFFINITY_SUPPORTED extern size_t __kmp_affin_mask_size; @@ -540,11 +549,14 @@ typedef unsigned char kmp_affin_mask_t; # if KMP_ARCH_X86_64 +// GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later). +# if _MSC_VER < 1600 typedef struct GROUP_AFFINITY { - KAFFINITY mask; - WORD group; - WORD reserved[3]; + KAFFINITY Mask; + WORD Group; + WORD Reserved[3]; } GROUP_AFFINITY; +# endif typedef DWORD_PTR kmp_affin_mask_t; @@ -798,7 +810,6 @@ extern unsigned int __kmp_place_core_offset; #define __kmp_entry_gtid() __kmp_get_global_thread_id_reg() #define __kmp_tid_from_gtid(gtid) ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), \ - /*(__kmp_threads[ (gtid) ]->th.th_team_serialized) ? 
0 : */ /* TODO remove this check, it is redundant */ \ __kmp_threads[ (gtid) ]->th.th_info.ds.ds_tid ) #define __kmp_get_tid() ( __kmp_tid_from_gtid( __kmp_get_gtid() ) ) @@ -865,6 +876,9 @@ extern unsigned int __kmp_place_core_offset; #define KMP_MAX_STKOFFSET KMP_MAX_STKSIZE #define KMP_DEFAULT_STKOFFSET KMP_MIN_STKOFFSET +#define KMP_MIN_STKPADDING (0) +#define KMP_MAX_STKPADDING (2 * 1024 * 1024) + #define KMP_MIN_MONITOR_WAKEUPS (1) /* min number of times monitor wakes up per second */ #define KMP_MAX_MONITOR_WAKEUPS (1000) /* maximum number of times monitor can wake up per second */ #define KMP_BLOCKTIME_MULTIPLIER (1000) /* number of blocktime units per second */ @@ -952,12 +966,14 @@ extern unsigned int __kmp_place_core_offset; #elif KMP_OS_LINUX # define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ # define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ -#elif KMP_OS_DARWIN || KMP_OS_FREEBSD -/* TODO: tune for OS */ +#elif KMP_OS_DARWIN +/* TODO: tune for KMP_OS_DARWIN */ +# define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +# define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_FREEBSD +/* TODO: tune for KMP_OS_FREEBSD */ # define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ # define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ -#else -# error "Unknown or unsupported OS" #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 @@ -968,12 +984,14 @@ struct kmp_cpuid { kmp_uint32 edx; }; extern void __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p ); -# if KMP_MIC +# if KMP_ARCH_X86 + extern void __kmp_x86_pause( void ); +# elif KMP_MIC static void __kmp_x86_pause( void ) { _mm_delay_32( 100 ); }; # else - extern void __kmp_x86_pause( void ); + static void __kmp_x86_pause( void ) { _mm_pause(); }; # endif -# define KMP_CPU_PAUSE() __kmp_x86_pause() +# define KMP_CPU_PAUSE() __kmp_x86_pause() #elif KMP_ARCH_PPC64 # define KMP_PPC64_PRI_LOW() __asm__ volatile ("or 1, 1, 1") # define KMP_PPC64_PRI_MED() __asm__ volatile ("or 2, 2, 2") @@ -985,7 +1003,7 @@ extern void __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p ); #define KMP_INIT_YIELD(count) { (count) = __kmp_yield_init; } -#define KMP_YIELD(cond) { KMP_CPU_PAUSE(); __kmp_static_yield( (cond) ); } +#define KMP_YIELD(cond) { KMP_CPU_PAUSE(); __kmp_yield( (cond) ); } // Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround, // there should be no yielding since the starting value from KMP_INIT_YIELD() is odd. 
@@ -1533,6 +1551,9 @@ typedef struct kmp_disp { dispatch_private_info_t *th_disp_buffer; kmp_int32 th_disp_index; void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64 +#if KMP_USE_INTERNODE_ALIGNMENT + char more_padding[INTERNODE_CACHE_LINE]; +#endif } kmp_disp_t; /* ------------------------------------------------------------------------ */ @@ -1557,6 +1578,12 @@ typedef struct kmp_disp { # error "Barrier unused bit must be smaller than barrier bump bit" #endif +// Constants for release barrier wait state: currently, hierarchical only +#define KMP_BARRIER_NOT_WAITING 0 // Normal state; worker not in wait_sleep +#define KMP_BARRIER_OWN_FLAG 1 // Normal state; worker waiting on own b_go flag in release +#define KMP_BARRIER_PARENT_FLAG 2 // Special state; worker waiting on parent's b_go flag in release +#define KMP_BARRIER_SWITCH_TO_OWN_FLAG 3 // Special state; tells worker to shift from parent to own b_go +#define KMP_BARRIER_SWITCHING 4 // Special state; worker resets appropriate flag on wake-up enum barrier_type { bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction barriers if enabled) */ @@ -1576,16 +1603,58 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */ bp_linear_bar = 0, /* Single level (degenerate) tree */ bp_tree_bar = 1, /* Balanced tree with branching factor 2^n */ bp_hyper_bar = 2, /* Hypercube-embedded tree with min branching factor 2^n */ - bp_last_bar = 3 /* Placeholder to mark the end */ + bp_hierarchical_bar = 3, /* Machine hierarchy tree */ + bp_last_bar = 4 /* Placeholder to mark the end */ } kmp_bar_pat_e; +# define KMP_BARRIER_ICV_PUSH 1 + +/* Record for holding the values of the internal controls stack records */ +typedef struct kmp_internal_control { + int serial_nesting_level; /* corresponds to the value of the th_team_serialized field */ + kmp_int8 nested; /* internal control for nested parallelism (per thread) */ + kmp_int8 dynamic; /* internal control for dynamic adjustment of threads (per thread) */ + kmp_int8 bt_set; /* internal control for whether blocktime is explicitly set */ + int blocktime; /* internal control for blocktime */ + int bt_intervals; /* internal control for blocktime intervals */ + int nproc; /* internal control for #threads for next parallel region (per thread) */ + int max_active_levels; /* internal control for max_active_levels */ + kmp_r_sched_t sched; /* internal control for runtime schedule {sched,chunk} pair */ +#if OMP_40_ENABLED + kmp_proc_bind_t proc_bind; /* internal control for affinity */ +#endif // OMP_40_ENABLED + struct kmp_internal_control *next; +} kmp_internal_control_t; + +static inline void +copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src ) { + *dst = *src; +} + /* Thread barrier needs volatile barrier fields */ typedef struct KMP_ALIGN_CACHE kmp_bstate { - volatile kmp_uint b_arrived; /* STATE => task reached synch point. */ - #if (KMP_PERF_V19 == KMP_ON) - KMP_ALIGN_CACHE - #endif - volatile kmp_uint b_go; /* STATE => task should proceed. */ + // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all uses of it). + // It is not explicitly aligned below, because we *don't* want it to be padded -- instead, + // we fit b_go into the same cache line with th_fixed_icvs, enabling NGO cache lines + // stores in the hierarchical barrier. 
+ kmp_internal_control_t th_fixed_icvs; // Initial ICVs for the thread + // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with same NGO store + volatile kmp_uint64 b_go; // STATE => task should proceed (hierarchical) + KMP_ALIGN_CACHE volatile kmp_uint64 b_arrived; // STATE => task reached synch point. + kmp_uint32 *skip_per_level; + kmp_uint32 my_level; + kmp_int32 parent_tid; + kmp_uint32 old_tid; + kmp_uint32 depth; + struct kmp_bstate *parent_bar; + kmp_team_t *team; + kmp_uint64 leaf_state; + kmp_uint32 nproc; + kmp_uint8 base_leaf_kids; + kmp_uint8 leaf_kids; + kmp_uint8 offset; + kmp_uint8 wait_flag; + kmp_uint8 use_oncore_barrier; } kmp_bstate_t; union KMP_ALIGN_CACHE kmp_barrier_union { @@ -1698,7 +1767,6 @@ typedef union KMP_ALIGN_CACHE kmp_desc { typedef struct kmp_local { volatile int this_construct; /* count of single's encountered by thread */ - volatile int last_construct; /* cache for team's count used by old algorithm */ void *reduce_data; #if KMP_USE_BGET void *bget_data; @@ -1721,151 +1789,54 @@ typedef struct kmp_local { } kmp_local_t; -/* Record for holding the values of the internal controls stack records */ -typedef struct KMP_ALIGN_CACHE kmp_internal_control { - int serial_nesting_level; /* corresponds to the value of the th_team_serialized field */ - int nested; /* internal control for nested parallelism (per thread) */ - int dynamic; /* internal control for dynamic adjustment of threads (per thread) */ - int nproc; /* internal control for # of threads for next parallel region (per thread) */ - int blocktime; /* internal control for blocktime */ - int bt_intervals; /* internal control for blocktime intervals */ - int bt_set; /* internal control for whether blocktime is explicitly set */ -#if OMP_30_ENABLED - int max_active_levels; /* internal control for max_active_levels */ - kmp_r_sched_t sched; /* internal control for runtime schedule {sched,chunk} pair */ -#endif // OMP_30_ENABLED -#if OMP_40_ENABLED - kmp_proc_bind_t proc_bind; /* internal control for affinity */ -#endif // OMP_40_ENABLED - struct kmp_internal_control *next; - -} kmp_internal_control_t; - -#if OMP_30_ENABLED -static inline void -copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src ) { - *dst = *src; -} -#endif // OMP_30_ENABLED - -#if OMP_30_ENABLED - - #define get__blocktime( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) - #define get__bt_set( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) - #define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) +#define get__blocktime( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) +#define get__bt_set( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) +#define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) - #define get__nested_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested) - #define get__dynamic_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic) - #define get__nproc_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc) - #define get__sched_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched) +#define get__nested_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested) +#define get__dynamic_2(xteam,xtid) 
((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic) +#define get__nproc_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc) +#define get__sched_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched) - #define set__blocktime_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime ) = (xval) ) +#define set__blocktime_team( xteam, xtid, xval ) \ + ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime ) = (xval) ) - #define set__bt_intervals_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals ) = (xval) ) +#define set__bt_intervals_team( xteam, xtid, xval ) \ + ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals ) = (xval) ) - #define set__bt_set_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set ) = (xval) ) +#define set__bt_set_team( xteam, xtid, xval ) \ + ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set ) = (xval) ) +#define set__nested( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.nested ) = (xval) ) +#define get__nested( xthread ) \ + ( ( (xthread)->th.th_current_task->td_icvs.nested ) ? (FTN_TRUE) : (FTN_FALSE) ) - #define set__nested( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.nested ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nested ) = \ - (xval) ) - #define get__nested( xthread ) \ - ( ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nested ) \ - ? (FTN_TRUE) : (FTN_FALSE) ) +#define set__dynamic( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) = (xval) ) +#define get__dynamic( xthread ) \ + ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) ? (FTN_TRUE) : (FTN_FALSE) ) - #define set__dynamic( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.dynamic ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.dynamic ) = \ - (xval) ) - #define get__dynamic( xthread ) \ - ( ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.dynamic ) \ - ? 
(FTN_TRUE) : (FTN_FALSE) ) +#define set__nproc( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.nproc ) = (xval) ) - #define set__nproc( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.nproc ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nproc ) = \ - (xval) ) +#define set__max_active_levels( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.max_active_levels ) = (xval) ) - #define set__nproc_p( xthread, xval ) \ - ( \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nproc ) = \ - (xval) ) - - #define set__max_active_levels( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.max_active_levels ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.max_active_levels ) = \ - (xval) ) - - #define set__sched( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.sched ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.sched ) = \ - (xval) ) +#define set__sched( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.sched ) = (xval) ) #if OMP_40_ENABLED - #define set__proc_bind( xthread, xval ) \ - ( \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.proc_bind ) = \ - (xval) ) - - #define get__proc_bind( xthread ) \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.proc_bind ) +#define set__proc_bind( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.proc_bind ) = (xval) ) +#define get__proc_bind( xthread ) \ + ( (xthread)->th.th_current_task->td_icvs.proc_bind ) #endif /* OMP_40_ENABLED */ -#else - - #define get__blocktime( xteam, xtid ) ((xteam)->t.t_set_blocktime[ (xtid)]) - #define get__bt_set( xteam, xtid ) ((xteam)->t.t_set_bt_set[ (xtid)]) - #define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_set_bt_intervals[(xtid)]) - - #define set__nested( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_set_nested[0] ) = \ - ( (xthread)->th.th_team->t.t_set_nested[((xthread)->th.th_info.ds.ds_tid)] ) = \ - (xval) ) - #define get__nested( xthread ) \ - ( ( (xthread)->th.th_team->t.t_set_nested[((xthread)->th.th_info.ds.ds_tid)] ) \ - ? (FTN_TRUE) : (FTN_FALSE) ) - - #define set__dynamic( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_set_dynamic[0] ) = \ - ( (xthread)->th.th_team->t.t_set_dynamic[((xthread)->th.th_info.ds.ds_tid)] ) = \ - (xval) ) - #define get__dynamic( xthread ) \ - ( ( (xthread)->th.th_team->t.t_set_dynamic[((xthread)->th.th_info.ds.ds_tid)] ) \ - ? 
(FTN_TRUE) : (FTN_FALSE) ) - - #define set__nproc( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_set_nproc[0] ) = \ - ( (xthread)->th.th_team->t.t_set_nproc[((xthread)->th.th_info.ds.ds_tid)] ) = \ - (xval) ) - - #define set__nproc_p( xthread, xval ) \ - ( ( (xthread)->th.th_team->t.t_set_nproc[((xthread)->th.th_info.ds.ds_tid)] ) = (xval) ) - - #define set__blocktime_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_set_blocktime[(xtid)] ) = (xval) ) - #define set__bt_intervals_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_set_bt_intervals[(xtid)] ) = (xval) ) - - #define set__bt_set_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_set_bt_set[(xtid)] ) = (xval) ) - - #define get__nested_2(xteam,xtid) ( (xteam)->t.t_set_nested[(xtid)] ) - #define get__dynamic_2(xteam,xtid) ( (xteam)->t.t_set_dynamic[(xtid)] ) - #define get__nproc_2(xteam,xtid) ( (xteam)->t.t_set_nproc[(xtid)] ) - #define get__sched_2(xteam,xtid) ( (xteam)->t.t_set_sched[(xtid)] ) - - -#endif - -#if OMP_30_ENABLED /* ------------------------------------------------------------------------ */ // OpenMP tasking data structures // @@ -1931,7 +1902,7 @@ typedef struct kmp_dephash_entry kmp_dephash_entry_t; typedef struct kmp_depend_info { kmp_intptr_t base_addr; - size_t len; + size_t len; struct { bool in:1; bool out:1; @@ -1947,13 +1918,13 @@ typedef struct kmp_base_depnode { kmp_depnode_list_t * successors; kmp_task_t * task; - kmp_lock_t lock; + kmp_lock_t lock; #if KMP_SUPPORT_GRAPH_OUTPUT kmp_uint32 id; #endif - volatile kmp_int32 npredecessors; + volatile kmp_int32 npredecessors; volatile kmp_int32 nrefs; } kmp_base_depnode_t; @@ -1965,8 +1936,8 @@ union KMP_ALIGN_CACHE kmp_depnode { struct kmp_dephash_entry { kmp_intptr_t addr; - kmp_depnode_t * last_out; - kmp_depnode_list_t * last_ins; + kmp_depnode_t * last_out; + kmp_depnode_list_t * last_ins; kmp_dephash_entry_t * next_in_bucket; }; @@ -2039,7 +2010,7 @@ struct kmp_taskdata { /* aligned during dynamic ident_t * td_taskwait_ident; kmp_uint32 td_taskwait_counter; kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */ - kmp_internal_control_t td_icvs; /* Internal control variables for the task */ + KMP_ALIGN_CACHE kmp_internal_control_t td_icvs; /* Internal control variables for the task */ volatile kmp_uint32 td_allocated_child_tasks; /* Child tasks (+ current task) not yet deallocated */ volatile kmp_uint32 td_incomplete_child_tasks; /* Child tasks not yet complete */ #if OMP_40_ENABLED @@ -2060,7 +2031,7 @@ KMP_BUILD_ASSERT( sizeof(kmp_taskdata_t) % sizeof(void *) == 0 ); // Data for task team but per thread typedef struct kmp_base_thread_data { kmp_info_p * td_thr; // Pointer back to thread info - // Used only in __kmp_execute_tasks, maybe not avail until task is queued? + // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued? kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque kmp_taskdata_t ** td_deque; // Deque of tasks encountered by td_thr, dynamically allocated kmp_uint32 td_deque_head; // Head of deque (will wrap) @@ -2099,6 +2070,10 @@ typedef struct kmp_base_task_team { volatile kmp_uint32 tt_active; /* is the team still actively executing tasks */ KMP_ALIGN_CACHE +#if KMP_USE_INTERNODE_ALIGNMENT + kmp_int32 tt_padme[INTERNODE_CACHE_LINE/sizeof(kmp_int32)]; +#endif + volatile kmp_uint32 tt_ref_ct; /* #threads accessing struct */ /* (not incl. 
master) */ kmp_int32 tt_state; /* alternating 0/1 for task team identification */ @@ -2111,8 +2086,6 @@ union KMP_ALIGN_CACHE kmp_task_team { char tt_pad[ KMP_PAD(kmp_base_task_team_t, CACHE_LINE) ]; }; -#endif // OMP_30_ENABLED - #if ( USE_FAST_MEMORY == 3 ) || ( USE_FAST_MEMORY == 5 ) // Free lists keep same-size free memory slots for fast memory allocation routines typedef struct kmp_free_list { @@ -2121,6 +2094,20 @@ typedef struct kmp_free_list { void *th_free_list_other; // Non-self free list (to be returned to owner's sync list) } kmp_free_list_t; #endif +#if KMP_NESTED_HOT_TEAMS +// Hot teams array keeps hot teams and their sizes for given thread. +// Hot teams are not put in teams pool, and they don't put threads in threads pool. +typedef struct kmp_hot_team_ptr { + kmp_team_p *hot_team; // pointer to hot_team of given nesting level + kmp_int32 hot_team_nth; // number of threads allocated for the hot_team +} kmp_hot_team_ptr_t; +#endif +#if OMP_40_ENABLED +typedef struct kmp_teams_size { + kmp_int32 nteams; // number of teams in a league + kmp_int32 nth; // number of threads in each team of the league +} kmp_teams_size_t; +#endif /* ------------------------------------------------------------------------ */ // OpenMP thread data structures @@ -2146,7 +2133,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { kmp_info_p *th_team_master; /* the team's master thread */ int th_team_serialized; /* team is serialized */ #if OMP_40_ENABLED - microtask_t th_team_microtask; /* save entry address for teams construct */ + microtask_t th_teams_microtask; /* save entry address for teams construct */ int th_teams_level; /* save initial level of teams construct */ /* it is 0 on device but may be any on host */ #endif @@ -2158,21 +2145,21 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { int th_team_bt_intervals; int th_team_bt_set; - kmp_internal_control_t th_fixed_icvs; /* Initial ICVs for the thread */ - #if KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */ #endif - /* * The data set by the master at reinit, then R/W by the worker */ KMP_ALIGN_CACHE int th_set_nproc; /* if > 0, then only use this request for the next fork */ +#if KMP_NESTED_HOT_TEAMS + kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */ +#endif #if OMP_40_ENABLED - int th_set_nth_teams; /* number of threads in parallel nested in teams construct */ kmp_proc_bind_t th_set_proc_bind; /* if != proc_bind_default, use request for next fork */ + kmp_teams_size_t th_teams_size; /* number of teams/threads in teams construct */ # if KMP_AFFINITY_SUPPORTED int th_current_place; /* place currently bound to */ int th_new_place; /* place to bind to in par reg */ @@ -2182,6 +2169,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { #endif #if USE_ITT_BUILD kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */ + kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */ kmp_uint64 th_frame_time; /* frame timestamp */ kmp_uint64 th_frame_time_serialized; /* frame timestamp in serialized parallel */ #endif /* USE_ITT_BUILD */ @@ -2200,24 +2188,18 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */ /* while awaiting queuing lock acquire */ - volatile kmp_uint32 *th_sleep_loc; + volatile void *th_sleep_loc; // this points at a kmp_flag<T> -/* - * Two variables used for consistency check - struct cons_header *th_cons and inte th_first moved below - * from here in order to avoid performance regression -*/ 
ident_t *th_ident; unsigned th_x; // Random number generator data unsigned th_a; // Random number generator data -#if OMP_30_ENABLED /* * Tasking-related data for the thread */ kmp_task_team_t * th_task_team; // Task team struct kmp_taskdata_t * th_current_task; // Innermost Task being executed kmp_uint8 th_task_state; // alternating 0/1 for task team identification -#endif // OMP_30_ENABLED /* * More stuff for keeping track of active/sleeping threads @@ -2229,8 +2211,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { // 32 bits for TCR/TCW - struct cons_header * th_cons; - int th_first; + struct cons_header * th_cons; // used for consistency check /* * Add the syncronizing data which is cache aligned and padded. @@ -2259,6 +2240,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { kmp_itt_mark_t th_itt_mark_single; // alignment ??? #endif /* USE_ITT_BUILD */ +#if KMP_STATS_ENABLED + kmp_stats_list* th_stats; +#endif } kmp_base_info_t; typedef union KMP_ALIGN_CACHE kmp_info { @@ -2291,154 +2275,89 @@ typedef int (*launch_t)( int gtid ); /* Minimum number of ARGV entries to malloc if necessary */ #define KMP_MIN_MALLOC_ARGV_ENTRIES 100 -#if KMP_MIC && OMP_30_ENABLED -# define KMP_BARRIER_ICV_PULL 1 +// Set up how many argv pointers will fit in cache lines containing t_inline_argv. Historically, we +// have supported at least 96 bytes. Using a larger value for more space between the master write/worker +// read section and read/write by all section seems to buy more performance on EPCC PARALLEL. +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# define KMP_INLINE_ARGV_BYTES ( 4 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + sizeof(kmp_int16) + sizeof(kmp_uint32) ) % CACHE_LINE ) ) #else -# define KMP_BARRIER_ICV_PULL 0 -#endif - -#if (KMP_PERF_V106 == KMP_ON) -// -// Set up how many argv pointers will fit in cache lines containing -// *t_inline_argv. Historically, we have supported at least 96 bytes. -// -// Using a larger value for more space between the master write/worker read -// section and read/write by all section seems to buy more performance -// on EPCC PARALLEL. 
-// -//# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE ) -# if KMP_BARRIER_ICV_PULL -# define KMP_INLINE_ARGV_BYTES 192 -//# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE - ( ( 5 * KMP_PTR_SKIP + 10 * sizeof(int) + sizeof(kmp_int64) ) % CACHE_LINE ) ) -# elif KMP_ARCH_X86 || KMP_ARCH_X86_64 -# define KMP_INLINE_ARGV_BYTES ( 4 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + sizeof(kmp_int16) + sizeof(kmp_uint32) ) % CACHE_LINE ) ) -# else -# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) ) % CACHE_LINE ) ) -# endif -# define KMP_INLINE_ARGV_ENTRIES (int)( KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP ) +# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) ) % CACHE_LINE ) ) #endif +#define KMP_INLINE_ARGV_ENTRIES (int)( KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP ) typedef struct KMP_ALIGN_CACHE kmp_base_team { -/* - * Synchronization Data - */ - KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; + // Synchronization Data --------------------------------------------------------------------------------- + KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; kmp_balign_team_t t_bar[ bs_last_barrier ]; - - /* count of single directive encountered by team */ - volatile int t_construct; - kmp_lock_t t_single_lock; /* team specific lock */ - -/* - * Master only - */ - KMP_ALIGN_CACHE int t_master_tid; /* tid of master in parent team */ - int t_master_this_cons; /* "this_construct" single counter of master in parent team */ - int t_master_last_cons; /* "last_construct" single counter of master in parent team */ - ident_t *t_ident; /* if volatile, have to change too much other crud to volatile too */ - kmp_team_p *t_parent; /* parent team */ - kmp_team_p *t_next_pool; /* next free team in the team pool */ - kmp_disp_t *t_dispatch; /* thread's dispatch data */ -#if OMP_30_ENABLED - kmp_task_team_t *t_task_team; /* Task team struct */ -#endif /* OMP_30_ENABLED */ + volatile int t_construct; // count of single directive encountered by team + kmp_lock_t t_single_lock; // team specific lock + + // Master only ----------------------------------------------------------------------------------------- + KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team + int t_master_this_cons; // "this_construct" single counter of master in parent team + ident_t *t_ident; // if volatile, have to change too much other crud to volatile too + kmp_team_p *t_parent; // parent team + kmp_team_p *t_next_pool; // next free team in the team pool + kmp_disp_t *t_dispatch; // thread's dispatch data + kmp_task_team_t *t_task_team; // Task team struct #if OMP_40_ENABLED - kmp_proc_bind_t t_proc_bind; /* bind type for par region */ + kmp_proc_bind_t t_proc_bind; // bind type for par region #endif // OMP_40_ENABLED +#if USE_ITT_BUILD + kmp_uint64 t_region_time; // region begin timestamp +#endif /* USE_ITT_BUILD */ -/* - * Master write, workers read - */ - KMP_ALIGN_CACHE - void **t_argv; + // Master write, workers read -------------------------------------------------------------------------- + KMP_ALIGN_CACHE void **t_argv; int t_argc; -#if (KMP_PERF_V106 == KMP_ON) - /* swap cache lines for t_nproc and t_max_argc */ - int t_nproc; /* number of threads in team */ -#else - int t_max_argc; -#endif + int t_nproc; // number of threads in team microtask_t t_pkfn; - launch_t t_invoke; /* procedure to launch the microtask */ - + launch_t t_invoke; // procedure to launch the microtask #if KMP_ARCH_X86 || KMP_ARCH_X86_64 kmp_int8 t_fp_control_saved; 
kmp_int8 t_pad2b; - kmp_int16 t_x87_fpu_control_word; /* FP control regs */ + kmp_int16 t_x87_fpu_control_word; // FP control regs kmp_uint32 t_mxcsr; #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if (KMP_PERF_V106 == KMP_ON) void *t_inline_argv[ KMP_INLINE_ARGV_ENTRIES ]; -#endif -#if (KMP_PERF_V19 == KMP_ON) - KMP_ALIGN_CACHE -#endif - kmp_info_t **t_threads; -#if (KMP_PERF_V106 == KMP_ON) - /* swap cache lines for t_nproc and t_max_argc */ + KMP_ALIGN_CACHE kmp_info_t **t_threads; int t_max_argc; -#else - int t_nproc; /* number of threads in team */ -#endif - int t_max_nproc; /* maximum threads this team can handle (this is dynamicly expandable) */ - int t_serialized; /* levels deep of serialized teams */ - dispatch_shared_info_t *t_disp_buffer; /* buffers for dispatch system */ + int t_max_nproc; // maximum threads this team can handle (dynamicly expandable) + int t_serialized; // levels deep of serialized teams + dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system int t_id; // team's id, assigned by debugger. -#if OMP_30_ENABLED - int t_level; /* nested parallel level */ - int t_active_level; /* nested active parallel level */ - kmp_r_sched_t t_sched; /* run-time schedule for the team */ -#endif // OMP_30_ENABLED + int t_level; // nested parallel level + int t_active_level; // nested active parallel level + kmp_r_sched_t t_sched; // run-time schedule for the team #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - int t_first_place; /* first & last place in */ - int t_last_place; /* parent thread's partition. */ - /* Restore these values to */ - /* master after par region. */ + int t_first_place; // first & last place in parent thread's partition. + int t_last_place; // Restore these values to master after par region. #endif // OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED #if KMP_MIC - int t_size_changed; /* team size was changed?: 0 - no, 1 - yes, -1 - changed via omp_set_num_threads() call */ + int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call #endif -/* - * Read/write by workers as well - */ + // Read/write by workers as well ----------------------------------------------------------------------- #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - // Using CACHE_LINE=64 reduces memory footprint, - // but causes a big perf regression of epcc 'parallel' and 'barrier' on fxe256lin01. - // This extra padding serves to fix the performance of epcc 'parallel' and 'barrier' when CACHE_LINE=64. - // TODO: investigate more and get rid if this padding. + // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel' + // and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel' + // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid if this padding. 
char dummy_padding[1024]; #endif - KMP_ALIGN_CACHE -#if OMP_30_ENABLED - kmp_taskdata_t *t_implicit_task_taskdata; // Taskdata for the thread's implicit task -#else - // Internal control variables for current thread team - // TODO Convert these fields to an array of kmp_internal_control_t which simplifies parameter passing - // and also prevents performance degradation due to false sharing when all threads set a control var - int *t_set_nproc; /* internal control for # of threads for next - parallel region (per thread) */ - int *t_set_nested; /* internal control for nested parallelism (per thread) */ - int *t_set_dynamic; /* internal control for dynamic adjustment of threads (per thread) */ - int *t_set_blocktime; /* internal control for blocktime */ - int *t_set_bt_intervals; /* internal control for blocktime intervals */ - int *t_set_bt_set; /* internal control for whether blocktime is explicitly set */ -#endif // OMP_30_ENABLED - - kmp_internal_control_t *t_control_stack_top; /* internal control stack for additional nested teams. - for SERIALIZED teams nested 2 or more levels deep */ + KMP_ALIGN_CACHE kmp_taskdata_t *t_implicit_task_taskdata; // Taskdata for the thread's implicit task + kmp_internal_control_t *t_control_stack_top; // internal control stack for additional nested teams. + // for SERIALIZED teams nested 2 or more levels deep #if OMP_40_ENABLED - kmp_int32 t_cancel_request; /* typed flag to store request state of cancellation */ + kmp_int32 t_cancel_request; // typed flag to store request state of cancellation #endif - - int t_master_active;/* save on fork, restore on join */ - kmp_taskq_t t_taskq; /* this team's task queue */ - void *t_copypriv_data; /* team specific pointer to copyprivate data array */ + int t_master_active; // save on fork, restore on join + kmp_taskq_t t_taskq; // this team's task queue + void *t_copypriv_data; // team specific pointer to copyprivate data array kmp_uint32 t_copyin_counter; #if USE_ITT_BUILD - void *t_stack_id; /* team specific stack stitching id (for ittnotify) */ + void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ } kmp_base_team_t; @@ -2543,6 +2462,7 @@ extern int __kmp_debug_count; /* Counter for number of lines printe extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase recommended in warnings */ /* end rotating debug buffer */ +#ifdef KMP_DEBUG extern int __kmp_par_range; /* +1 => only go par for constructs in range */ #define KMP_PAR_RANGE_ROUTINE_LEN 1024 @@ -2551,6 +2471,7 @@ extern char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN]; extern char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN]; extern int __kmp_par_range_lb; extern int __kmp_par_range_ub; +#endif /* For printing out dynamic storage map for threads and teams */ extern int __kmp_storage_map; /* True means print storage map for threads and teams */ @@ -2607,14 +2528,13 @@ extern enum library_type __kmp_library; extern enum sched_type __kmp_sched; /* default runtime scheduling */ extern enum sched_type __kmp_static; /* default static scheduling method */ extern enum sched_type __kmp_guided; /* default guided scheduling method */ -#if OMP_30_ENABLED extern enum sched_type __kmp_auto; /* default auto scheduling method */ -#endif // OMP_30_ENABLED extern int __kmp_chunk; /* default runtime chunk size */ extern size_t __kmp_stksize; /* stack size per thread */ extern size_t __kmp_monitor_stksize;/* stack size for monitor thread */ extern size_t __kmp_stkoffset; /* stack offset per thread */ +extern 
int __kmp_stkpadding; /* Should we pad root thread(s) stack */ extern size_t __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */ extern int __kmp_env_chunk; /* was KMP_CHUNK specified? */ @@ -2629,7 +2549,7 @@ extern int __kmp_generate_warnings; /* should we issue warnings? */ extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? */ #ifdef DEBUG_SUSPEND -extern int __kmp_suspend_count; /* count inside __kmp_suspend() */ +extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */ #endif extern kmp_uint32 __kmp_yield_init; @@ -2693,9 +2613,11 @@ extern kmp_int16 __kmp_init_x87_fpu_control_word; /* init thread's FP control r extern kmp_uint32 __kmp_init_mxcsr; /* init thread's mxscr */ #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if OMP_30_ENABLED extern int __kmp_dflt_max_active_levels; /* max_active_levels for nested parallelism enabled by default a la OMP_MAX_ACTIVE_LEVELS */ -#endif // OMP_30_ENABLED +#if KMP_NESTED_HOT_TEAMS +extern int __kmp_hot_teams_mode; +extern int __kmp_hot_teams_max_level; +#endif # if KMP_OS_LINUX extern enum clock_function_type __kmp_clock_function; @@ -2833,8 +2755,6 @@ static inline kmp_info_t * __kmp_entry_thread() return __kmp_threads[gtid]; } -#if OMP_30_ENABLED - extern void __kmp_set_max_active_levels( int gtid, int new_max_active_levels ); extern int __kmp_get_max_active_levels( int gtid ); extern int __kmp_get_ancestor_thread_num( int gtid, int level ); @@ -2842,8 +2762,6 @@ extern int __kmp_get_team_size( int gtid, int level ); extern void __kmp_set_schedule( int gtid, kmp_sched_t new_sched, int chunk ); extern void __kmp_get_schedule( int gtid, kmp_sched_t * sched, int * chunk ); -#endif // OMP_30_ENABLED - extern unsigned short __kmp_get_random( kmp_info_t * thread ); extern void __kmp_init_random( kmp_info_t * thread ); @@ -2888,8 +2806,6 @@ extern void __kmp_push_num_teams( ident_t *loc, int gtid, int num_teams, int num #endif extern void __kmp_yield( int cond ); -extern void __kmp_release( kmp_info_t *target_thr, volatile kmp_uint *spin, - enum kmp_mem_fence_type fetchadd_fence ); extern void __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, @@ -2956,11 +2872,28 @@ extern kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker ); extern kmp_uint32 __kmp_wait_yield_4( kmp_uint32 volatile * spinner, kmp_uint32 checker, kmp_uint32 (*pred) (kmp_uint32, kmp_uint32), void * obj ); extern kmp_uint64 __kmp_wait_yield_8( kmp_uint64 volatile * spinner, kmp_uint64 checker, kmp_uint32 (*pred) (kmp_uint64, kmp_uint64), void * obj ); -extern void __kmp_wait_sleep( kmp_info_t *this_thr, volatile kmp_uint *spinner, kmp_uint checker, kmp_int final_spin +class kmp_flag_32; +class kmp_flag_64; +class kmp_flag_oncore; +extern void __kmp_wait_32(kmp_info_t *this_thr, kmp_flag_32 *flag, int final_spin #if USE_ITT_BUILD - , void * itt_sync_obj -#endif /* USE_ITT_BUILD */ -); + , void * itt_sync_obj +#endif + ); +extern void __kmp_release_32(kmp_flag_32 *flag); +extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin +#if USE_ITT_BUILD + , void * itt_sync_obj +#endif + ); +extern void __kmp_release_64(kmp_flag_64 *flag); +extern void __kmp_wait_oncore(kmp_info_t *this_thr, kmp_flag_oncore *flag, int final_spin +#if USE_ITT_BUILD + , void * itt_sync_obj +#endif + ); +extern void __kmp_release_oncore(kmp_flag_oncore *flag); + extern void __kmp_infinite_loop( void ); extern void __kmp_cleanup( void ); @@ -3003,9 
+2936,10 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask); extern void __kmp_balanced_affinity( int tid, int team_size ); - #endif /* KMP_AFFINITY_SUPPORTED */ +extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar); + #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM) extern int __kmp_futex_determine_capable( void ); @@ -3035,8 +2969,12 @@ extern void __kmp_reap_monitor( kmp_info_t *th ); extern void __kmp_reap_worker( kmp_info_t *th ); extern void __kmp_terminate_thread( int gtid ); -extern void __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ); -extern void __kmp_resume( int target_gtid, volatile kmp_uint *spinner ); +extern void __kmp_suspend_32( int th_gtid, kmp_flag_32 *flag ); +extern void __kmp_suspend_64( int th_gtid, kmp_flag_64 *flag ); +extern void __kmp_suspend_oncore( int th_gtid, kmp_flag_oncore *flag ); +extern void __kmp_resume_32( int target_gtid, kmp_flag_32 *flag ); +extern void __kmp_resume_64( int target_gtid, kmp_flag_64 *flag ); +extern void __kmp_resume_oncore( int target_gtid, kmp_flag_oncore *flag ); extern void __kmp_elapsed( double * ); extern void __kmp_elapsed_tick( double * ); @@ -3062,19 +3000,14 @@ extern kmp_info_t * __kmp_allocate_thread( kmp_root_t *root, extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs, - int argc ); -#elif OMP_30_ENABLED -extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, - kmp_internal_control_t *new_icvs, - int argc ); + int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) ); #else extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set, - int argc ); -#endif // OMP_30_ENABLED + kmp_internal_control_t *new_icvs, + int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) ); +#endif // OMP_40_ENABLED extern void __kmp_free_thread( kmp_info_t * ); -extern void __kmp_free_team( kmp_root_t *, kmp_team_t * ); +extern void __kmp_free_team( kmp_root_t *, kmp_team_t * USE_NESTED_HOT_ARG(kmp_info_t *) ); extern kmp_team_t * __kmp_reap_team( kmp_team_t * ); /* ------------------------------------------------------------------------ */ @@ -3094,7 +3027,16 @@ extern int __kmp_barrier( enum barrier_type bt, int gtid, int is_split, size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) ); extern void __kmp_end_split_barrier ( enum barrier_type bt, int gtid ); -extern int __kmp_fork_call( ident_t *loc, int gtid, int exec_master, +/*! + * Tell the fork call which compiler generated the fork call, and therefore how to deal with the call. + */ +enum fork_context_e +{ + fork_context_gnu, /**< Called from GNU generated code, so must not invoke the microtask internally. */ + fork_context_intel, /**< Called from Intel generated code. 
*/ + fork_context_last +}; +extern int __kmp_fork_call( ident_t *loc, int gtid, enum fork_context_e fork_context, kmp_int32 argc, microtask_t microtask, launch_t invoker, /* TODO: revert workaround for Intel(R) 64 tracker #96 */ #if (KMP_ARCH_ARM || KMP_ARCH_X86_64) && KMP_OS_LINUX @@ -3110,6 +3052,7 @@ extern void __kmp_join_call( ident_t *loc, int gtid #endif ); +extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid); extern void __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team ); extern void __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team ); extern int __kmp_invoke_task_func( int gtid ); @@ -3120,7 +3063,7 @@ extern void __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_th KMP_EXPORT int __kmpc_invoke_task_func( int gtid ); #if OMP_40_ENABLED extern int __kmp_invoke_teams_master( int gtid ); -extern void __kmp_teams_master( microtask_t microtask, int gtid ); +extern void __kmp_teams_master( int gtid ); #endif extern void __kmp_save_internal_controls( kmp_info_t * thread ); extern void __kmp_user_set_library (enum library_type arg); @@ -3135,7 +3078,6 @@ void ompc_set_nested( int flag ); void ompc_set_dynamic( int flag ); void ompc_set_num_threads( int arg ); -#if OMP_30_ENABLED extern void __kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid ); extern void __kmp_pop_current_task_from_thread( kmp_info_t *this_thr ); @@ -3145,12 +3087,25 @@ extern kmp_task_t* __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, extern void __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task ); -extern int __kmp_execute_tasks( kmp_info_t *thread, kmp_int32 gtid, volatile kmp_uint *spinner, - kmp_uint checker, int final_spin, int *thread_finished, +int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + int *thread_finished, #if USE_ITT_BUILD - void * itt_sync_obj, + void * itt_sync_obj, #endif /* USE_ITT_BUILD */ - int c ); + kmp_int32 is_constrained); +int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void * itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void * itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); + extern void __kmp_reap_task_teams( void ); extern void __kmp_unref_task_team( kmp_task_team_t *task_team, kmp_info_t *thread ); extern void __kmp_wait_to_unref_task_teams( void ); @@ -3163,8 +3118,6 @@ extern void __kmp_task_team_wait ( kmp_info_t *this_thr, kmp_team_t *team ); extern void __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid ); -#endif // OMP_30_ENABLED - extern int __kmp_is_address_mapped( void *addr ); extern kmp_uint64 __kmp_hardware_timestamp(void); @@ -3259,7 +3212,6 @@ KMP_EXPORT kmpc_thunk_t * __kmpc_task_buffer (ident_t *loc, kmp_int32 global_tid /* ------------------------------------------------------------------------ */ -#if OMP_30_ENABLED /* * OMP 3.0 tasking interface routines */ @@ -3288,9 +3240,9 @@ void __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *tas #endif // TASK_UNUSED /* ------------------------------------------------------------------------ */ -#endif // OMP_30_ENABLED #if OMP_40_ENABLED + KMP_EXPORT void __kmpc_taskgroup( ident_t * loc, int gtid 
); KMP_EXPORT void __kmpc_end_taskgroup( ident_t * loc, int gtid ); @@ -3301,13 +3253,13 @@ KMP_EXPORT void __kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ); extern void __kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task ); -#endif +extern kmp_int32 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate ); -#if OMP_40_ENABLED KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind); KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind); KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t* loc_ref, kmp_int32 gtid); KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind); + #endif /* @@ -3404,8 +3356,6 @@ kmp_threadprivate_insert( int gtid, void *pc_addr, void *data_addr, size_t pc_si # define KMPC_CONVENTION #endif -#if OMP_30_ENABLED - #ifndef __OMP_H typedef enum omp_sched_t { omp_sched_static = 1, @@ -3424,8 +3374,6 @@ KMP_EXPORT int KMPC_CONVENTION kmpc_set_affinity_mask_proc(int, kmp_affinity_ma KMP_EXPORT int KMPC_CONVENTION kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *); KMP_EXPORT int KMPC_CONVENTION kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *); -#endif // OMP_30_ENABLED - KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int); KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t); KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int); |
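
The change threads an optional trailing parameter through `__kmp_allocate_team` and `__kmp_free_team` with the new `USE_NESTED_HOT_ARG(x)` macro, which expands to `,x` only when `KMP_NESTED_HOT_TEAMS` (together with OMP 4.0) is enabled and to nothing otherwise. Below is a minimal standalone sketch of that idiom; it is not part of kmp.h, and `alloc_team`/`hot_info` are made-up names used only for illustration.

```c
#include <stdio.h>

/* Stand-in for the real configuration macro; set to 0 to drop the extra
 * parameter everywhere. */
#define KMP_NESTED_HOT_TEAMS 1

#if KMP_NESTED_HOT_TEAMS
# define USE_NESTED_HOT_ARG(x) , x   /* appends a trailing parameter/argument */
#else
# define USE_NESTED_HOT_ARG(x)       /* expands to nothing */
#endif

/* The hot-team parameter exists only when nested hot teams are compiled in. */
static void alloc_team(int new_nproc, int argc USE_NESTED_HOT_ARG(const char *hot_info))
{
#if KMP_NESTED_HOT_TEAMS
    printf("nproc=%d argc=%d hot_info=%s\n", new_nproc, argc, hot_info);
#else
    printf("nproc=%d argc=%d\n", new_nproc, argc);
#endif
}

int main(void)
{
    /* Call sites use the same macro, so they compile unchanged either way. */
    alloc_team(4, 0 USE_NESTED_HOT_ARG("master thread"));
    return 0;
}
```

Because both the prototype and every call site go through the same macro, the extra `kmp_info_t *` argument disappears entirely when `KMP_NESTED_HOT_TEAMS` is 0, with no per-call-site `#if` blocks.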