 openmp/runtime/src/kmp.h            |   12
 openmp/runtime/src/kmp_dispatch.cpp | 2244
 openmp/runtime/src/kmp_dispatch.h   |  507
 3 files changed, 1434 insertions(+), 1329 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index ff0311b9adb..1c0279bc718 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1508,6 +1508,15 @@ struct shared_table {
/* ------------------------------------------------------------------------ */
+typedef struct kmp_sched_flags {
+ unsigned ordered : 1;
+ unsigned nomerge : 1;
+ unsigned contains_last : 1;
+ unsigned unused : 29;
+} kmp_sched_flags_t;
+
+KMP_BUILD_ASSERT(sizeof(kmp_sched_flags_t) == 4);
+
#if KMP_STATIC_STEAL_ENABLED
typedef struct KMP_ALIGN_CACHE dispatch_private_info32 {
kmp_int32 count;
@@ -1625,13 +1634,12 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info {
dispatch_private_info64_t p64;
} u;
enum sched_type schedule; /* scheduling algorithm */
- kmp_int32 ordered; /* ordered clause specified */
+ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge) */
kmp_int32 ordered_bumped;
// To retain the structure size after making ordered_iteration scalar
kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
// Stack of buffers for nest of serial regions
struct dispatch_private_info *next;
- kmp_int32 nomerge; /* don't merge iters if serialized */
kmp_int32 type_size; /* the size of types in private_info */
enum cons_type pushed_ws;
} dispatch_private_info_t;
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index a821ec192f3..462e0fa7767 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -34,275 +34,17 @@
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif
+#include "kmp_lock.h"
+#include "kmp_dispatch.h"
#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
/* ------------------------------------------------------------------------ */
-
-#if KMP_STATIC_STEAL_ENABLED
-
-// replaces dispatch_private_info{32,64} structures and
-// dispatch_private_info{32,64}_t types
-template <typename T> struct dispatch_private_infoXX_template {
- typedef typename traits_t<T>::unsigned_t UT;
- typedef typename traits_t<T>::signed_t ST;
- UT count; // unsigned
- T ub;
- /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
- T lb;
- ST st; // signed
- UT tc; // unsigned
- T static_steal_counter; // for static_steal only; maybe better to put after ub
-
- /* parm[1-4] are used in different ways by different scheduling algorithms */
-
- // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
- // a) parm3 is properly aligned and
- // b) all parm1-4 are in the same cache line.
- // Because of parm1-4 are used together, performance seems to be better
- // if they are in the same line (not measured though).
-
- struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
- T parm1;
- T parm2;
- T parm3;
- T parm4;
- };
-
- UT ordered_lower; // unsigned
- UT ordered_upper; // unsigned
-#if KMP_OS_WINDOWS
- T last_upper;
-#endif /* KMP_OS_WINDOWS */
-};
-
-#else /* KMP_STATIC_STEAL_ENABLED */
-
-// replaces dispatch_private_info{32,64} structures and
-// dispatch_private_info{32,64}_t types
-template <typename T> struct dispatch_private_infoXX_template {
- typedef typename traits_t<T>::unsigned_t UT;
- typedef typename traits_t<T>::signed_t ST;
- T lb;
- T ub;
- ST st; // signed
- UT tc; // unsigned
-
- T parm1;
- T parm2;
- T parm3;
- T parm4;
-
- UT count; // unsigned
-
- UT ordered_lower; // unsigned
- UT ordered_upper; // unsigned
-#if KMP_OS_WINDOWS
- T last_upper;
-#endif /* KMP_OS_WINDOWS */
-};
-
-#endif /* KMP_STATIC_STEAL_ENABLED */
-
-// replaces dispatch_private_info structure and dispatch_private_info_t type
-template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
- // duplicate alignment here, otherwise size of structure is not correct in our
- // compiler
- union KMP_ALIGN_CACHE private_info_tmpl {
- dispatch_private_infoXX_template<T> p;
- dispatch_private_info64_t p64;
- } u;
- enum sched_type schedule; /* scheduling algorithm */
- kmp_uint32 ordered; /* ordered clause specified */
- kmp_uint32 ordered_bumped;
- // To retain the structure size after making ordered_iteration scalar
- kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
- dispatch_private_info *next; /* stack of buffers for nest of serial regions */
- kmp_uint32 nomerge; /* don't merge iters if serialized */
- kmp_uint32 type_size;
- enum cons_type pushed_ws;
-};
-
-// replaces dispatch_shared_info{32,64} structures and
-// dispatch_shared_info{32,64}_t types
-template <typename UT> struct dispatch_shared_infoXX_template {
- /* chunk index under dynamic, number of idle threads under static-steal;
- iteration index otherwise */
- volatile UT iteration;
- volatile UT num_done;
- volatile UT ordered_iteration;
- // to retain the structure size making ordered_iteration scalar
- UT ordered_dummy[KMP_MAX_ORDERED - 3];
-};
-
-// replaces dispatch_shared_info structure and dispatch_shared_info_t type
-template <typename UT> struct dispatch_shared_info_template {
- // we need union here to keep the structure size
- union shared_info_tmpl {
- dispatch_shared_infoXX_template<UT> s;
- dispatch_shared_info64_t s64;
- } u;
- volatile kmp_uint32 buffer_index;
-#if OMP_45_ENABLED
- volatile kmp_int32 doacross_buf_idx; // teamwise index
- kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
- kmp_int32 doacross_num_done; // count finished threads
-#endif
-#if KMP_USE_HWLOC
- // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
- // machines (> 48 cores). Performance analysis showed that a cache thrash
- // was occurring and this padding helps alleviate the problem.
- char padding[64];
-#endif
-};
-
-/* ------------------------------------------------------------------------ */
-
-#undef USE_TEST_LOCKS
-
-// test_then_add template (general template should NOT be used)
-template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
-
-template <>
-__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
- kmp_int32 d) {
- kmp_int32 r;
- r = KMP_TEST_THEN_ADD32(p, d);
- return r;
-}
-
-template <>
-__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
- kmp_int64 d) {
- kmp_int64 r;
- r = KMP_TEST_THEN_ADD64(p, d);
- return r;
-}
-
-// test_then_inc_acq template (general template should NOT be used)
-template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
-
-template <>
-__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
- kmp_int32 r;
- r = KMP_TEST_THEN_INC_ACQ32(p);
- return r;
-}
-
-template <>
-__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
- kmp_int64 r;
- r = KMP_TEST_THEN_INC_ACQ64(p);
- return r;
-}
-
-// test_then_inc template (general template should NOT be used)
-template <typename T> static __forceinline T test_then_inc(volatile T *p);
-
-template <>
-__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
- kmp_int32 r;
- r = KMP_TEST_THEN_INC32(p);
- return r;
-}
-
-template <>
-__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
- kmp_int64 r;
- r = KMP_TEST_THEN_INC64(p);
- return r;
-}
-
-// compare_and_swap template (general template should NOT be used)
-template <typename T>
-static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
-
-template <>
-__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
- kmp_int32 c, kmp_int32 s) {
- return KMP_COMPARE_AND_STORE_REL32(p, c, s);
-}
-
-template <>
-__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
- kmp_int64 c, kmp_int64 s) {
- return KMP_COMPARE_AND_STORE_REL64(p, c, s);
-}
-
-/* Spin wait loop that first does pause, then yield.
- Waits until function returns non-zero when called with *spinner and check.
- Does NOT put threads to sleep.
- Arguments:
- UT is unsigned 4- or 8-byte type
- spinner - memory location to check value
- checker - value which spinner is >, <, ==, etc.
- pred - predicate function to perform binary comparison of some sort
-#if USE_ITT_BUILD
- obj -- is higher-level synchronization object to report to ittnotify.
- It is used to report locks consistently. For example, if lock is
- acquired immediately, its address is reported to ittnotify via
- KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
- and lock routine calls to KMP_WAIT_YIELD(), the later should report the
- same address, not an address of low-level spinner.
-#endif // USE_ITT_BUILD
- TODO: make inline function (move to header file for icl)
-*/
-template <typename UT>
-static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
- kmp_uint32 (*pred)(UT, UT)
- USE_ITT_BUILD_ARG(void *obj)) {
- // note: we may not belong to a team at this point
- volatile UT *spin = spinner;
- UT check = checker;
- kmp_uint32 spins;
- kmp_uint32 (*f)(UT, UT) = pred;
- UT r;
-
- KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
- KMP_INIT_YIELD(spins);
- // main wait spin loop
- while (!f(r = *spin, check)) {
- KMP_FSYNC_SPIN_PREPARE(obj);
- /* GEH - remove this since it was accidentally introduced when kmp_wait was
- split. It causes problems with infinite recursion because of exit lock */
- /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
- __kmp_abort_thread(); */
-
- // if we are oversubscribed, or have waited a bit (and
- // KMP_LIBRARY=throughput, then yield. pause is in the following code
- KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
- KMP_YIELD_SPIN(spins);
- }
- KMP_FSYNC_SPIN_ACQUIRED(obj);
- return r;
-}
-
-template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
- return value == checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
- return value != checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
- return value < checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
- return value >= checker;
-}
-
-template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
- return value <= checker;
-}
-
/* ------------------------------------------------------------------------ */
-static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
- ident_t *loc_ref) {
+void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
kmp_info_t *th;
KMP_DEBUG_ASSERT(gtid_ref);
@@ -320,85 +62,7 @@ static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
}
}
-template <typename UT>
-static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
- typedef typename traits_t<UT>::signed_t ST;
- dispatch_private_info_template<UT> *pr;
-
- int gtid = *gtid_ref;
- // int cid = *cid_ref;
- kmp_info_t *th = __kmp_threads[gtid];
- KMP_DEBUG_ASSERT(th->th.th_dispatch);
-
- KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
- if (__kmp_env_consistency_check) {
- pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- if (pr->pushed_ws != ct_none) {
-#if KMP_USE_DYNAMIC_LOCK
- __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
-#else
- __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
-#endif
- }
- }
-
- if (!th->th.th_team->t.t_serialized) {
- dispatch_shared_info_template<UT> *sh =
- reinterpret_cast<dispatch_shared_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_sh_current);
- UT lower;
-
- if (!__kmp_env_consistency_check) {
- pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- }
- lower = pr->u.p.ordered_lower;
-
-#if !defined(KMP_GOMP_COMPAT)
- if (__kmp_env_consistency_check) {
- if (pr->ordered_bumped) {
- struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
- __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
- ct_ordered_in_pdo, loc_ref,
- &p->stack_data[p->w_top]);
- }
- }
-#endif /* !defined(KMP_GOMP_COMPAT) */
-
- KMP_MB();
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
- "ordered_iter:%%%s lower:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
- __kmp_str_free(&buff);
- }
-#endif
-
- __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
- __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
- KMP_MB(); /* is this necessary? */
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
- "ordered_iter:%%%s lower:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
- __kmp_str_free(&buff);
- }
-#endif
- }
- KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
-}
-
-static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
- ident_t *loc_ref) {
+void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
kmp_info_t *th;
if (__kmp_env_consistency_check) {
@@ -409,121 +73,26 @@ static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
}
}
-template <typename UT>
-static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
- typedef typename traits_t<UT>::signed_t ST;
- dispatch_private_info_template<UT> *pr;
-
- int gtid = *gtid_ref;
- // int cid = *cid_ref;
- kmp_info_t *th = __kmp_threads[gtid];
- KMP_DEBUG_ASSERT(th->th.th_dispatch);
-
- KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
- if (__kmp_env_consistency_check) {
- pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- if (pr->pushed_ws != ct_none) {
- __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
- }
- }
-
- if (!th->th.th_team->t.t_serialized) {
- dispatch_shared_info_template<UT> *sh =
- reinterpret_cast<dispatch_shared_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_sh_current);
-
- if (!__kmp_env_consistency_check) {
- pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
- th->th.th_dispatch->th_dispatch_pr_current);
- }
-
- KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
-#if !defined(KMP_GOMP_COMPAT)
- if (__kmp_env_consistency_check) {
- if (pr->ordered_bumped != 0) {
- struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
- /* How to test it? - OM */
- __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
- ct_ordered_in_pdo, loc_ref,
- &p->stack_data[p->w_top]);
- }
- }
-#endif /* !defined(KMP_GOMP_COMPAT) */
-
- KMP_MB(); /* Flush all pending memory write invalidates. */
-
- pr->ordered_bumped += 1;
-
- KD_TRACE(1000,
- ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
- gtid, pr->ordered_bumped));
-
- KMP_MB(); /* Flush all pending memory write invalidates. */
-
- /* TODO use general release procedure? */
- test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
-
- KMP_MB(); /* Flush all pending memory write invalidates. */
- }
- KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
-}
-
-// Computes and returns x to the power of y, where y must a non-negative integer
-template <typename UT>
-static __forceinline long double __kmp_pow(long double x, UT y) {
- long double s = 1.0L;
-
- KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
- // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
- while (y) {
- if (y & 1)
- s *= x;
- x *= x;
- y >>= 1;
- }
- return s;
-}
-
-/* Computes and returns the number of unassigned iterations after idx chunks
- have been assigned (the total number of unassigned iterations in chunks with
- index greater than or equal to idx). __forceinline seems to be broken so that
- if we __forceinline this function, the behavior is wrong
- (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
-template <typename T>
-static __inline typename traits_t<T>::unsigned_t
-__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
- typename traits_t<T>::unsigned_t idx) {
- /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
- ICL 8.1, long double arithmetic may not really have long double precision,
- even with /Qlong_double. Currently, we workaround that in the caller code,
- by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
- of precision is not expected to be a correctness issue, though. */
- typedef typename traits_t<T>::unsigned_t UT;
-
- long double x = tc * __kmp_pow<UT>(base, idx);
- UT r = (UT)x;
- if (x == r)
- return r;
- return r + 1;
-}
-
-// Parameters of the guided-iterative algorithm:
-// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
-// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
-// by default n = 2. For example with n = 3 the chunks distribution will be more
-// flat.
-// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
-static int guided_int_param = 2;
-static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
-
-// UT - unsigned flavor of T, ST - signed flavor of T,
-// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
+// Initialize a dispatch_private_info_template<T> buffer for a particular
+// type of schedule and chunk. The loop description is found in lb (lower bound),
+// ub (upper bound), and st (stride). nproc is the number of threads relevant
+// to the scheduling (often the number of threads in a team, but not always if
+// hierarchical scheduling is used). tid is the id of the thread calling
+// the function within the group of nproc threads. It will have a value
+// between 0 and nproc - 1. This is often just the thread id within a team, but
+// is not necessarily the case when using hierarchical scheduling.
+// loc is the source file location of the corresponding loop
+// gtid is the global thread id
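+//
+// An illustrative call sketch only -- the schedule, bounds, and chunk shown
+// here are assumptions, not part of the interface. A plain (non-hierarchical)
+// caller such as __kmp_dispatch_init() below forwards its team size and
+// thread id:
+//   __kmp_dispatch_init_algorithm(loc, gtid, pr, kmp_sch_dynamic_chunked,
+//                                 /*lb=*/0, /*ub=*/N - 1, /*st=*/1,
+// #if USE_ITT_BUILD
+//                                 &cur_chunk,
+// #endif
+//                                 /*chunk=*/8, (T)th->th.th_team_nproc,
+//                                 (T)th->th.th_info.ds.ds_tid);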
template <typename T>
-static void
-__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
- T ub, typename traits_t<T>::signed_t st,
- typename traits_t<T>::signed_t chunk, int push_ws) {
+void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
+ dispatch_private_info_template<T> *pr,
+ enum sched_type schedule, T lb, T ub,
+ typename traits_t<T>::signed_t st,
+#if USE_ITT_BUILD
+ kmp_uint64 *cur_chunk,
+#endif
+ typename traits_t<T>::signed_t chunk,
+ T nproc, T tid) {
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
typedef typename traits_t<T>::floating_t DBL;
@@ -532,30 +101,18 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
T tc;
kmp_info_t *th;
kmp_team_t *team;
- kmp_uint32 my_buffer_index;
- dispatch_private_info_template<T> *pr;
- dispatch_shared_info_template<UT> volatile *sh;
- KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
- sizeof(dispatch_private_info));
- KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
- sizeof(dispatch_shared_info));
-
- if (!TCR_4(__kmp_init_parallel))
- __kmp_parallel_initialize();
-
-#if INCLUDE_SSC_MARKS
- SSC_MARK_DISPATCH_INIT();
-#endif
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
- "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
- traits_t<ST>::spec, traits_t<T>::spec,
- traits_t<T>::spec, traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
+ buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
+ "pr:%%p lb:%%%s ub:%%%s st:%%%s "
+ "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
+ traits_t<T>::spec, traits_t<T>::spec,
+ traits_t<ST>::spec, traits_t<ST>::spec,
+ traits_t<T>::spec, traits_t<T>::spec);
+ KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
__kmp_str_free(&buff);
}
#endif
@@ -563,10 +120,8 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
th = __kmp_threads[gtid];
team = th->th.th_team;
active = !team->t.t_serialized;
- th->th.th_ident = loc;
#if USE_ITT_BUILD
- kmp_uint64 cur_chunk = chunk;
int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
__kmp_forkjoin_frames_mode == 3 &&
KMP_MASTER_GTID(gtid) &&
@@ -575,23 +130,6 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
#endif
team->t.t_active_level == 1;
#endif
- if (!active) {
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- th->th.th_dispatch->th_disp_buffer); /* top of the stack */
- } else {
- KMP_DEBUG_ASSERT(th->th.th_dispatch ==
- &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
-
- my_buffer_index = th->th.th_dispatch->th_disp_index++;
-
- /* What happens when number of threads changes, need to resize buffer? */
- pr = reinterpret_cast<dispatch_private_info_template<T> *>(
- &th->th.th_dispatch
- ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
- sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
- &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
- }
-
#if (KMP_STATIC_STEAL_ENABLED)
if (SCHEDULE_HAS_NONMONOTONIC(schedule))
// AC: we now have only one implementation of stealing, so use it
@@ -602,19 +140,19 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
/* Pick up the nomerge/ordered bits from the scheduling type */
if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
- pr->nomerge = TRUE;
+ pr->flags.nomerge = TRUE;
schedule =
(enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
} else {
- pr->nomerge = FALSE;
+ pr->flags.nomerge = FALSE;
}
pr->type_size = traits_t<T>::type_size; // remember the size of variables
if (kmp_ord_lower & schedule) {
- pr->ordered = TRUE;
+ pr->flags.ordered = TRUE;
schedule =
(enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
} else {
- pr->ordered = FALSE;
+ pr->flags.ordered = FALSE;
}
if (schedule == kmp_sch_static) {
@@ -635,15 +173,16 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
// specified)
chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
- cur_chunk = chunk;
+ if (cur_chunk)
+ *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format(
- "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
- traits_t<ST>::spec);
+ buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
+ "schedule:%%d chunk:%%%s\n",
+ traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, schedule, chunk));
__kmp_str_free(&buff);
}
@@ -664,9 +203,10 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
{
char *buff;
// create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
- "schedule:%%d chunk:%%%s\n",
- traits_t<ST>::spec);
+ buff = __kmp_str_format(
+ "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
+ "schedule:%%d chunk:%%%s\n",
+ traits_t<ST>::spec);
KD_TRACE(10, (buff, gtid, schedule, chunk));
__kmp_str_free(&buff);
}
@@ -674,11 +214,11 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
}
/* guided analytical not safe for too many threads */
- if (schedule == kmp_sch_guided_analytical_chunked &&
- th->th.th_team_nproc > 1 << 20) {
+ if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
schedule = kmp_sch_guided_iterative_chunked;
KMP_WARNING(DispatchManyThreads);
}
+#if OMP_45_ENABLED
if (schedule == kmp_sch_runtime_simd) {
// compiler provides simd_width in the chunk parameter
schedule = team->t.t_sched.r_sched_type;
@@ -694,7 +234,8 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
chunk = team->t.t_sched.chunk * chunk;
}
#if USE_ITT_BUILD
- cur_chunk = chunk;
+ if (cur_chunk)
+ *cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
{
@@ -708,6 +249,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
}
#endif
}
+#endif // OMP_45_ENABLED
pr->u.p.parm1 = chunk;
}
KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
@@ -718,7 +260,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
if (__kmp_env_consistency_check) {
if (st == 0) {
__kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
- (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
+ (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
}
}
// compute trip count
@@ -746,16 +288,6 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
}
}
- // Any half-decent optimizer will remove this test when the blocks are empty
- // since the macros expand to nothing when statistics are disabled.
- if (schedule == __kmp_static) {
- KMP_COUNT_BLOCK(OMP_FOR_static);
- KMP_COUNT_VALUE(FOR_static_iterations, tc);
- } else {
- KMP_COUNT_BLOCK(OMP_FOR_dynamic);
- KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
- }
-
pr->u.p.lb = lb;
pr->u.p.ub = ub;
pr->u.p.st = st;
@@ -768,44 +300,26 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
/* NOTE: only the active parallel region(s) has active ordered sections */
if (active) {
- if (pr->ordered == 0) {
- th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
- th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
- } else {
+ if (pr->flags.ordered) {
pr->ordered_bumped = 0;
-
pr->u.p.ordered_lower = 1;
pr->u.p.ordered_upper = 0;
-
- th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
- th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
- }
- }
-
- if (__kmp_env_consistency_check) {
- enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
- if (push_ws) {
- __kmp_push_workshare(gtid, ws, loc);
- pr->pushed_ws = ws;
- } else {
- __kmp_check_workshare(gtid, ws, loc);
- pr->pushed_ws = ct_none;
}
}
switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
case kmp_sch_static_steal: {
- T nproc = th->th.th_team_nproc;
T ntc, init;
KD_TRACE(100,
- ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));
+ ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
+ gtid));
ntc = (tc % chunk ? 1 : 0) + tc / chunk;
if (nproc > 1 && ntc >= nproc) {
KMP_COUNT_BLOCK(OMP_FOR_static_steal);
- T id = __kmp_tid_from_gtid(gtid);
+ T id = tid;
T small_chunk, extras;
small_chunk = ntc / nproc;
@@ -832,7 +346,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
}
break;
} else {
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
+ KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
"kmp_sch_static_balanced\n",
gtid));
schedule = kmp_sch_static_balanced;
@@ -842,14 +356,15 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
} // case
#endif
case kmp_sch_static_balanced: {
- T nproc = th->th.th_team_nproc;
T init, limit;
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
- gtid));
+ KD_TRACE(
+ 100,
+ ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
+ gtid));
if (nproc > 1) {
- T id = __kmp_tid_from_gtid(gtid);
+ T id = tid;
if (tc < nproc) {
if (id < tc) {
@@ -873,7 +388,8 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
init = 0;
limit = tc - 1;
pr->u.p.parm1 = TRUE;
- } else { // zero trip count
+ } else {
+ // zero trip count
pr->u.p.count = 1; /* means no more chunks to execute */
pr->u.p.parm1 = FALSE;
break;
@@ -882,7 +398,8 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
#if USE_ITT_BUILD
// Calculate chunk for metadata report
if (itt_need_metadata_reporting)
- cur_chunk = limit - init + 1;
+ if (cur_chunk)
+ *cur_chunk = limit - init + 1;
#endif
if (st == 1) {
pr->u.p.lb = lb + init;
@@ -899,16 +416,17 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
}
}
- if (pr->ordered) {
+ if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
}
break;
} // case
+#if OMP_45_ENABLED
case kmp_sch_static_balanced_chunked: {
// similar to balanced, but chunk adjusted to multiple of simd width
- T nth = th->th.th_team_nproc;
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
+ T nth = nproc;
+ KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
" -> falling-through to static_greedy\n",
gtid));
schedule = kmp_sch_static_greedy;
@@ -918,12 +436,14 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
pr->u.p.parm1 = tc;
break;
} // case
- case kmp_sch_guided_iterative_chunked:
- case kmp_sch_guided_simd: {
- T nproc = th->th.th_team_nproc;
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
- " case\n",
- gtid));
+ case kmp_sch_guided_simd:
+#endif // OMP_45_ENABLED
+ case kmp_sch_guided_iterative_chunked: {
+ KD_TRACE(
+ 100,
+ ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
+ " case\n",
+ gtid));
if (nproc > 1) {
if ((2L * chunk + 1) * nproc >= tc) {
@@ -936,22 +456,24 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
guided_flt_param / nproc; // may occupy parm3 and parm4
}
} else {
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
+ KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
"kmp_sch_static_greedy\n",
gtid));
schedule = kmp_sch_static_greedy;
/* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
- gtid));
+ KD_TRACE(
+ 100,
+ ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
+ gtid));
pr->u.p.parm1 = tc;
} // if
} // case
break;
case kmp_sch_guided_analytical_chunked: {
- T nproc = th->th.th_team_nproc;
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
- " case\n",
+ KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
+ "kmp_sch_guided_analytical_chunked case\n",
gtid));
+
if (nproc > 1) {
if ((2L * chunk + 1) * nproc >= tc) {
/* chunk size too large, switch to dynamic */
@@ -1061,7 +583,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
#endif
} // if
} else {
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
+ KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
"kmp_sch_static_greedy\n",
gtid));
schedule = kmp_sch_static_greedy;
@@ -1071,18 +593,18 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
} // case
break;
case kmp_sch_static_greedy:
- KD_TRACE(100,
- ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
- pr->u.p.parm1 = (th->th.th_team_nproc > 1)
- ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
- : tc;
+ KD_TRACE(
+ 100,
+ ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
+ gtid));
+ pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
break;
case kmp_sch_static_chunked:
case kmp_sch_dynamic_chunked:
if (pr->u.p.parm1 <= 0) {
pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
}
- KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
+ KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
"kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
gtid));
break;
@@ -1091,12 +613,13 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
T parm1, parm2, parm3, parm4;
KD_TRACE(100,
- ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));
+ ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
+ gtid));
parm1 = chunk;
/* F : size of the first cycle */
- parm2 = (tc / (2 * th->th.th_team_nproc));
+ parm2 = (tc / (2 * nproc));
if (parm2 < 1) {
parm2 = 1;
@@ -1142,6 +665,111 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
} break;
} // switch
pr->schedule = schedule;
+}
+
+// UT - unsigned flavor of T, ST - signed flavor of T,
+// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
+template <typename T>
+static void
+__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
+ T ub, typename traits_t<T>::signed_t st,
+ typename traits_t<T>::signed_t chunk, int push_ws) {
+ typedef typename traits_t<T>::unsigned_t UT;
+ typedef typename traits_t<T>::signed_t ST;
+ typedef typename traits_t<T>::floating_t DBL;
+
+ int active;
+ kmp_info_t *th;
+ kmp_team_t *team;
+ kmp_uint32 my_buffer_index;
+ dispatch_private_info_template<T> *pr;
+ dispatch_shared_info_template<T> volatile *sh;
+
+ KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
+ sizeof(dispatch_private_info));
+ KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
+ sizeof(dispatch_shared_info));
+
+ if (!TCR_4(__kmp_init_parallel))
+ __kmp_parallel_initialize();
+
+#if INCLUDE_SSC_MARKS
+ SSC_MARK_DISPATCH_INIT();
+#endif
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
+ "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
+ traits_t<ST>::spec, traits_t<T>::spec,
+ traits_t<T>::spec, traits_t<ST>::spec);
+ KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
+ __kmp_str_free(&buff);
+ }
+#endif
+ /* setup data */
+ th = __kmp_threads[gtid];
+ team = th->th.th_team;
+ active = !team->t.t_serialized;
+ th->th.th_ident = loc;
+
+#if USE_ITT_BUILD
+ kmp_uint64 cur_chunk = chunk;
+ int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
+ __kmp_forkjoin_frames_mode == 3 &&
+ KMP_MASTER_GTID(gtid) &&
+#if OMP_40_ENABLED
+ th->th.th_teams_microtask == NULL &&
+#endif
+ team->t.t_active_level == 1;
+#endif
+ if (!active) {
+ pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+ th->th.th_dispatch->th_disp_buffer); /* top of the stack */
+ } else {
+ KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+ &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+
+ my_buffer_index = th->th.th_dispatch->th_disp_index++;
+
+ /* What happens when number of threads changes, need to resize buffer? */
+ pr = reinterpret_cast<dispatch_private_info_template<T> *>(
+ &th->th.th_dispatch
+ ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+ sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
+ &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
+ KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
+ my_buffer_index));
+ }
+
+ __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
+#if USE_ITT_BUILD
+ &cur_chunk,
+#endif
+ chunk, (T)th->th.th_team_nproc,
+ (T)th->th.th_info.ds.ds_tid);
+ if (active) {
+ if (pr->flags.ordered == 0) {
+ th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
+ th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
+ } else {
+ th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
+ th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
+ }
+ }
+
+ // Any half-decent optimizer will remove this test when the blocks are empty
+ // since the macros expand to nothing
+ // when statistics are disabled.
+ if (schedule == __kmp_static) {
+ KMP_COUNT_BLOCK(OMP_FOR_static);
+ KMP_COUNT_VALUE(FOR_static_iterations, pr->u.p.tc);
+ } else {
+ KMP_COUNT_BLOCK(OMP_FOR_dynamic);
+ KMP_COUNT_VALUE(FOR_dynamic_iterations, pr->u.p.tc);
+ }
+
if (active) {
/* The name of this buffer should be my_buffer_index when it's free to use
* it */
@@ -1162,7 +790,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
th->th.th_dispatch->th_dispatch_sh_current =
CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
- if (pr->ordered) {
+ if (pr->flags.ordered) {
__kmp_itt_ordered_init(gtid);
}
// Report loop metadata
@@ -1181,7 +809,9 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
break;
case kmp_sch_guided_iterative_chunked:
case kmp_sch_guided_analytical_chunked:
+#if OMP_45_ENABLED
case kmp_sch_guided_simd:
+#endif
schedtype = 2;
break;
default:
@@ -1190,7 +820,7 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
schedtype = 3;
break;
}
- __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
+ __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
}
#endif /* USE_ITT_BUILD */
}
@@ -1208,10 +838,10 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
- KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
- pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
- pr->u.p.parm3, pr->u.p.parm4));
+ KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
+ pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
+ pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
+ pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
__kmp_str_free(&buff);
}
#endif
@@ -1234,10 +864,9 @@ __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
if (ompt_enabled.ompt_callback_work) {
ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
- kmp_info_t *thr = __kmp_threads[gtid];
ompt_callbacks.ompt_callback(ompt_callback_work)(
ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
- &(task_info->task_data), tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
+ &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
}
#endif
}
@@ -1389,6 +1018,699 @@ static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
#endif /* KMP_GOMP_COMPAT */
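+// Claim the next chunk of iterations for the calling thread according to the
+// schedule recorded in pr (the thread's private dispatch buffer); sh is the
+// matching shared buffer. On success the routine returns nonzero, fills *p_lb
+// and *p_ub (and *p_st, if p_st is non-NULL) with the chunk's bounds and
+// stride, and stores into *p_last (if non-NULL) whether this is the last
+// chunk; it returns 0 once no iterations remain. nproc and tid have the same
+// meaning as in __kmp_dispatch_init_algorithm() above.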
+template <typename T>
+int __kmp_dispatch_next_algorithm(int gtid,
+ dispatch_private_info_template<T> *pr,
+ dispatch_shared_info_template<T> volatile *sh,
+ kmp_int32 *p_last, T *p_lb, T *p_ub,
+ typename traits_t<T>::signed_t *p_st, T nproc,
+ T tid) {
+ typedef typename traits_t<T>::unsigned_t UT;
+ typedef typename traits_t<T>::signed_t ST;
+ typedef typename traits_t<T>::floating_t DBL;
+ int status = 0;
+ kmp_int32 last = 0;
+ T start;
+ ST incr;
+ UT limit, trip, init;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+
+ KMP_DEBUG_ASSERT(th->th.th_dispatch ==
+ &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
+ KMP_DEBUG_ASSERT(pr);
+ KMP_DEBUG_ASSERT(sh);
+ KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff =
+ __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
+ "sh:%%p nproc:%%%s tid:%%%s\n",
+ traits_t<T>::spec, traits_t<T>::spec);
+ KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
+ __kmp_str_free(&buff);
+ }
+#endif
+
+ // zero trip count
+ if (pr->u.p.tc == 0) {
+ KD_TRACE(10,
+ ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
+ "zero status:%d\n",
+ gtid, status));
+ return 0;
+ }
+
+ switch (pr->schedule) {
+#if (KMP_STATIC_STEAL_ENABLED)
+ case kmp_sch_static_steal: {
+ T chunk = pr->u.p.parm1;
+
+ KD_TRACE(100,
+ ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
+ gtid));
+
+ trip = pr->u.p.tc - 1;
+
+ if (traits_t<T>::type_size > 4) {
+ // use lock for 8-byte and CAS for 4-byte induction
+ // variable. TODO (optional): check and use 16-byte CAS
+ kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
+ KMP_DEBUG_ASSERT(lck != NULL);
+ if (pr->u.p.count < (UT)pr->u.p.ub) {
+ __kmp_acquire_lock(lck, gtid);
+ // try to get own chunk of iterations
+ init = (pr->u.p.count)++;
+ status = (init < (UT)pr->u.p.ub);
+ __kmp_release_lock(lck, gtid);
+ } else {
+ status = 0; // no own chunks
+ }
+ if (!status) { // try to steal
+ kmp_info_t **other_threads = team->t.t_threads;
+ int while_limit = nproc; // nproc attempts to find a victim
+ int while_index = 0;
+ // TODO: algorithm of searching for a victim
+ // should be cleaned up and measured
+ while ((!status) && (while_limit != ++while_index)) {
+ T remaining;
+ T victimIdx = pr->u.p.parm4;
+ T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+ dispatch_private_info_template<T> *victim =
+ reinterpret_cast<dispatch_private_info_template<T> *>(
+ other_threads[victimIdx]
+ ->th.th_dispatch->th_dispatch_pr_current);
+ while ((victim == NULL || victim == pr ||
+ (*(volatile T *)&victim->u.p.static_steal_counter !=
+ *(volatile T *)&pr->u.p.static_steal_counter)) &&
+ oldVictimIdx != victimIdx) {
+ victimIdx = (victimIdx + 1) % nproc;
+ victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+ other_threads[victimIdx]
+ ->th.th_dispatch->th_dispatch_pr_current);
+ }
+ if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
+ *(volatile T *)&pr->u.p.static_steal_counter)) {
+ continue; // try once more (nproc attempts in total)
+ // no victim is ready yet to participate in stealing
+ // because all victims are still in kmp_init_dispatch
+ }
+ if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
+ pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
+ continue; // not enough chunks to steal, goto next victim
+ }
+
+ lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
+ KMP_ASSERT(lck != NULL);
+ __kmp_acquire_lock(lck, gtid);
+ limit = victim->u.p.ub; // keep initial ub
+ if (victim->u.p.count >= limit ||
+ (remaining = limit - victim->u.p.count) < 2) {
+ __kmp_release_lock(lck, gtid);
+ pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
+ continue; // not enough chunks to steal
+ }
+ // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
+ // by 1
+ if (remaining > 3) {
+ // steal 1/4 of remaining
+ KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
+ init = (victim->u.p.ub -= (remaining >> 2));
+ } else {
+ // steal 1 chunk of 2 or 3 remaining
+ KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
+ init = (victim->u.p.ub -= 1);
+ }
+ __kmp_release_lock(lck, gtid);
+
+ KMP_DEBUG_ASSERT(init + 1 <= limit);
+ pr->u.p.parm4 = victimIdx; // remember victim to steal from
+ status = 1;
+ while_index = 0;
+ // now update own count and ub with stolen range but init chunk
+ __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
+ pr->u.p.count = init + 1;
+ pr->u.p.ub = limit;
+ __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
+ } // while (search for victim)
+ } // if (try to find victim and steal)
+ } else {
+ // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
+ typedef union {
+ struct {
+ UT count;
+ T ub;
+ } p;
+ kmp_int64 b;
+ } union_i4;
+ // All operations on 'count' or 'ub' must be combined atomically
+ // together.
+ {
+ union_i4 vold, vnew;
+ vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
+ vnew = vold;
+ vnew.p.count++;
+ while (!KMP_COMPARE_AND_STORE_ACQ64(
+ (volatile kmp_int64 *)&pr->u.p.count,
+ *VOLATILE_CAST(kmp_int64 *) & vold.b,
+ *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
+ KMP_CPU_PAUSE();
+ vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
+ vnew = vold;
+ vnew.p.count++;
+ }
+ vnew = vold;
+ init = vnew.p.count;
+ status = (init < (UT)vnew.p.ub);
+ }
+
+ if (!status) {
+ kmp_info_t **other_threads = team->t.t_threads;
+ int while_limit = nproc; // nproc attempts to find a victim
+ int while_index = 0;
+
+ // TODO: algorithm of searching for a victim
+ // should be cleaned up and measured
+ while ((!status) && (while_limit != ++while_index)) {
+ union_i4 vold, vnew;
+ kmp_int32 remaining;
+ T victimIdx = pr->u.p.parm4;
+ T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
+ dispatch_private_info_template<T> *victim =
+ reinterpret_cast<dispatch_private_info_template<T> *>(
+ other_threads[victimIdx]
+ ->th.th_dispatch->th_dispatch_pr_current);
+ while ((victim == NULL || victim == pr ||
+ (*(volatile T *)&victim->u.p.static_steal_counter !=
+ *(volatile T *)&pr->u.p.static_steal_counter)) &&
+ oldVictimIdx != victimIdx) {
+ victimIdx = (victimIdx + 1) % nproc;
+ victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+ other_threads[victimIdx]
+ ->th.th_dispatch->th_dispatch_pr_current);
+ }
+ if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
+ *(volatile T *)&pr->u.p.static_steal_counter)) {
+ continue; // try once more (nproc attempts in total)
+ // no victim is ready yet to participate in stealing
+ // because all victims are still in kmp_init_dispatch
+ }
+ pr->u.p.parm4 = victimIdx; // new victim found
+ while (1) { // CAS loop if victim has enough chunks to steal
+ vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
+ vnew = vold;
+
+ KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+ if (vnew.p.count >= (UT)vnew.p.ub ||
+ (remaining = vnew.p.ub - vnew.p.count) < 2) {
+ pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
+ break; // not enough chunks to steal, goto next victim
+ }
+ if (remaining > 3) {
+ vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
+ } else {
+ vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
+ }
+ KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
+ // TODO: Should this be acquire or release?
+ if (KMP_COMPARE_AND_STORE_ACQ64(
+ (volatile kmp_int64 *)&victim->u.p.count,
+ *VOLATILE_CAST(kmp_int64 *) & vold.b,
+ *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
+ // stealing succeeded
+ KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub);
+ status = 1;
+ while_index = 0;
+ // now update own count and ub
+ init = vnew.p.ub;
+ vold.p.count = init + 1;
+#if KMP_ARCH_X86
+ KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
+#else
+ *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
+#endif
+ break;
+ } // if (check CAS result)
+ KMP_CPU_PAUSE(); // CAS failed, repeat attempt
+ } // while (try to steal from particular victim)
+ } // while (search for victim)
+ } // if (try to find victim and steal)
+ } // if (4-byte induction variable)
+ if (!status) {
+ *p_lb = 0;
+ *p_ub = 0;
+ if (p_st != NULL)
+ *p_st = 0;
+ } else {
+ start = pr->u.p.parm2;
+ init *= chunk;
+ limit = chunk + init - 1;
+ incr = pr->u.p.st;
+ KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
+
+ KMP_DEBUG_ASSERT(init <= trip);
+ if ((last = (limit >= trip)) != 0)
+ limit = trip;
+ if (p_st != NULL)
+ *p_st = incr;
+
+ if (incr == 1) {
+ *p_lb = start + init;
+ *p_ub = start + limit;
+ } else {
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ }
+
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ } // if
+ } // if
+ break;
+ } // case
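+ // A worked example of the stealing arithmetic, illustration only (the
+ // numbers are assumptions): if a victim's buffer still holds chunk indices
+ // [count=10, ub=30), then remaining = 20 > 3, so the thief lowers the
+ // victim's ub by remaining >> 2 = 5 and takes chunk indices 25..29 for
+ // itself (25 is returned now, 26..29 are queued); the victim keeps 10..24.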
+#endif // ( KMP_STATIC_STEAL_ENABLED )
+ case kmp_sch_static_balanced: {
+ KD_TRACE(
+ 10,
+ ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
+ gtid));
+ /* check if thread has any iteration to do */
+ if ((status = !pr->u.p.count) != 0) {
+ pr->u.p.count = 1;
+ *p_lb = pr->u.p.lb;
+ *p_ub = pr->u.p.ub;
+ last = pr->u.p.parm1;
+ if (p_st != NULL)
+ *p_st = pr->u.p.st;
+ } else { /* no iterations to do */
+ pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
+ }
+ } // case
+ break;
+ case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
+ merged here */
+ case kmp_sch_static_chunked: {
+ T parm1;
+
+ KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
+ "kmp_sch_static_[affinity|chunked] case\n",
+ gtid));
+ parm1 = pr->u.p.parm1;
+
+ trip = pr->u.p.tc - 1;
+ init = parm1 * (pr->u.p.count + tid);
+
+ if ((status = (init <= trip)) != 0) {
+ start = pr->u.p.lb;
+ incr = pr->u.p.st;
+ limit = parm1 + init - 1;
+
+ if ((last = (limit >= trip)) != 0)
+ limit = trip;
+
+ if (p_st != NULL)
+ *p_st = incr;
+
+ pr->u.p.count += nproc;
+
+ if (incr == 1) {
+ *p_lb = start + init;
+ *p_ub = start + limit;
+ } else {
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ }
+
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ } // if
+ } // if
+ } // case
+ break;
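+ // Illustration only (the numbers are assumptions): with lb = 0, st = 1,
+ // parm1 = chunk = 4, nproc = 3 and tid = 1, successive calls see
+ // count = 0, 3, 6, ... and compute init = 4*(0+1), 4*(3+1), ..., so this
+ // thread is handed iterations 4..7, then 16..19, and so on -- a round-robin
+ // of chunk-sized blocks.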
+
+ case kmp_sch_dynamic_chunked: {
+ T chunk = pr->u.p.parm1;
+
+ KD_TRACE(
+ 100,
+ ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
+ gtid));
+
+ init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+ trip = pr->u.p.tc - 1;
+
+ if ((status = (init <= trip)) == 0) {
+ *p_lb = 0;
+ *p_ub = 0;
+ if (p_st != NULL)
+ *p_st = 0;
+ } else {
+ start = pr->u.p.lb;
+ limit = chunk + init - 1;
+ incr = pr->u.p.st;
+
+ if ((last = (limit >= trip)) != 0)
+ limit = trip;
+
+ if (p_st != NULL)
+ *p_st = incr;
+
+ if (incr == 1) {
+ *p_lb = start + init;
+ *p_ub = start + limit;
+ } else {
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ }
+
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ } // if
+ } // if
+ } // case
+ break;
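+ // Illustration only (the numbers are assumptions): with chunk = 8, lb = 0
+ // and st = 1, the k-th successful increment of the shared sh->u.s.iteration
+ // counter (k = 0, 1, 2, ...) hands the caller iterations 8*k .. 8*k + 7,
+ // clipped to the trip count.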
+
+ case kmp_sch_guided_iterative_chunked: {
+ T chunkspec = pr->u.p.parm1;
+ KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
+ "iterative case\n",
+ gtid));
+ trip = pr->u.p.tc;
+ // Start atomic part of calculations
+ while (1) {
+ ST remaining; // signed, because can be < 0
+ init = sh->u.s.iteration; // shared value
+ remaining = trip - init;
+ if (remaining <= 0) { // AC: need to compare with 0 first
+ // nothing to do, don't try atomic op
+ status = 0;
+ break;
+ }
+ if ((T)remaining <
+ pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
+ // use dynamic-style schedule
+ // atomically increment iterations, get old value
+ init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+ (ST)chunkspec);
+ remaining = trip - init;
+ if (remaining <= 0) {
+ status = 0; // all iterations got by other threads
+ } else {
+ // got some iterations to work on
+ status = 1;
+ if ((T)remaining > chunkspec) {
+ limit = init + chunkspec - 1;
+ } else {
+ last = 1; // the last chunk
+ limit = init + remaining - 1;
+ } // if
+ } // if
+ break;
+ } // if
+ limit = init +
+ (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
+ if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+ (ST)init, (ST)limit)) {
+ // CAS was successful, chunk obtained
+ status = 1;
+ --limit;
+ break;
+ } // if
+ } // while
+ if (status != 0) {
+ start = pr->u.p.lb;
+ incr = pr->u.p.st;
+ if (p_st != NULL)
+ *p_st = incr;
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ } // if
+ } else {
+ *p_lb = 0;
+ *p_ub = 0;
+ if (p_st != NULL)
+ *p_st = 0;
+ } // if
+ } // case
+ break;
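+ // Illustration only (the numbers are assumptions): parm3 holds
+ // guided_flt_param / nproc = 1 / (2 * nproc), so with tc = 1000 and
+ // nproc = 4 the first grab takes about 1000 / 8 = 125 iterations, the next
+ // sees remaining = 875 and takes about 109, and so on, until fewer than
+ // roughly 2 * nproc * (chunkspec + 1) iterations remain, after which fixed
+ // chunks of chunkspec iterations are handed out dynamically.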
+
+#if OMP_45_ENABLED
+ case kmp_sch_guided_simd: {
+ // same as iterative but curr-chunk adjusted to be multiple of given
+ // chunk
+ T chunk = pr->u.p.parm1;
+ KD_TRACE(100,
+ ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
+ gtid));
+ trip = pr->u.p.tc;
+ // Start atomic part of calculations
+ while (1) {
+ ST remaining; // signed, because can be < 0
+ init = sh->u.s.iteration; // shared value
+ remaining = trip - init;
+ if (remaining <= 0) { // AC: need to compare with 0 first
+ status = 0; // nothing to do, don't try atomic op
+ break;
+ }
+ KMP_DEBUG_ASSERT(init % chunk == 0);
+ // compare with K*nproc*(chunk+1), K=2 by default
+ if ((T)remaining < pr->u.p.parm2) {
+ // use dynamic-style schedule
+ // atomically increment iterations, get old value
+ init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+ (ST)chunk);
+ remaining = trip - init;
+ if (remaining <= 0) {
+ status = 0; // all iterations got by other threads
+ } else {
+ // got some iterations to work on
+ status = 1;
+ if ((T)remaining > chunk) {
+ limit = init + chunk - 1;
+ } else {
+ last = 1; // the last chunk
+ limit = init + remaining - 1;
+ } // if
+ } // if
+ break;
+ } // if
+ // divide by K*nproc
+ UT span = remaining * (*(double *)&pr->u.p.parm3);
+ UT rem = span % chunk;
+ if (rem) // adjust so that span%chunk == 0
+ span += chunk - rem;
+ limit = init + span;
+ if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
+ (ST)init, (ST)limit)) {
+ // CAS was successful, chunk obtained
+ status = 1;
+ --limit;
+ break;
+ } // if
+ } // while
+ if (status != 0) {
+ start = pr->u.p.lb;
+ incr = pr->u.p.st;
+ if (p_st != NULL)
+ *p_st = incr;
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ } // if
+ } else {
+ *p_lb = 0;
+ *p_ub = 0;
+ if (p_st != NULL)
+ *p_st = 0;
+ } // if
+ } // case
+ break;
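+ // Illustration only (the numbers are assumptions): same as the iterative
+ // case above, except the grabbed span is rounded up to a multiple of chunk,
+ // e.g. a raw span of 109 iterations with chunk = 8 becomes 112.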
+#endif // OMP_45_ENABLED
+
+ case kmp_sch_guided_analytical_chunked: {
+ T chunkspec = pr->u.p.parm1;
+ UT chunkIdx;
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+ /* for storing original FPCW value for Windows* OS on
+ IA-32 architecture 8-byte version */
+ unsigned int oldFpcw;
+ unsigned int fpcwSet = 0;
+#endif
+ KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
+ "kmp_sch_guided_analytical_chunked case\n",
+ gtid));
+
+ trip = pr->u.p.tc;
+
+ KMP_DEBUG_ASSERT(nproc > 1);
+ KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
+
+ while (1) { /* this while loop is a safeguard against unexpected zero
+ chunk sizes */
+ chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
+ if (chunkIdx >= (UT)pr->u.p.parm2) {
+ --trip;
+ /* use dynamic-style scheduling */
+ init = chunkIdx * chunkspec + pr->u.p.count;
+ /* need to verify init > 0 in case of overflow in the above
+ * calculation */
+ if ((status = (init > 0 && init <= trip)) != 0) {
+ limit = init + chunkspec - 1;
+
+ if ((last = (limit >= trip)) != 0)
+ limit = trip;
+ }
+ break;
+ } else {
+/* use exponential-style scheduling */
+/* The following check is to work around the lack of long double precision on
+ Windows* OS.
+ This check works around the possible effect that init != 0 for chunkIdx == 0.
+ */
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+ /* If we haven't already done so, save original
+ FPCW and set precision to 64-bit, as Windows* OS
+ on IA-32 architecture defaults to 53-bit */
+ if (!fpcwSet) {
+ oldFpcw = _control87(0, 0);
+ _control87(_PC_64, _MCW_PC);
+ fpcwSet = 0x30000;
+ }
+#endif
+ if (chunkIdx) {
+ init = __kmp_dispatch_guided_remaining<T>(
+ trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
+ KMP_DEBUG_ASSERT(init);
+ init = trip - init;
+ } else
+ init = 0;
+ limit = trip - __kmp_dispatch_guided_remaining<T>(
+ trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
+ KMP_ASSERT(init <= limit);
+ if (init < limit) {
+ KMP_DEBUG_ASSERT(limit <= trip);
+ --limit;
+ status = 1;
+ break;
+ } // if
+ } // if
+ } // while (1)
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+ /* restore FPCW if necessary
+ AC: check fpcwSet flag first because oldFpcw can be uninitialized here
+ */
+ if (fpcwSet && (oldFpcw & fpcwSet))
+ _control87(oldFpcw, _MCW_PC);
+#endif
+ if (status != 0) {
+ start = pr->u.p.lb;
+ incr = pr->u.p.st;
+ if (p_st != NULL)
+ *p_st = incr;
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ }
+ } else {
+ *p_lb = 0;
+ *p_ub = 0;
+ if (p_st != NULL)
+ *p_st = 0;
+ }
+ } // case
+ break;
+
+ case kmp_sch_trapezoidal: {
+ UT index;
+ T parm2 = pr->u.p.parm2;
+ T parm3 = pr->u.p.parm3;
+ T parm4 = pr->u.p.parm4;
+ KD_TRACE(100,
+ ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
+ gtid));
+
+ index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
+
+ init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
+ trip = pr->u.p.tc - 1;
+
+ if ((status = ((T)index < parm3 && init <= trip)) == 0) {
+ *p_lb = 0;
+ *p_ub = 0;
+ if (p_st != NULL)
+ *p_st = 0;
+ } else {
+ start = pr->u.p.lb;
+ limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
+ incr = pr->u.p.st;
+
+ if ((last = (limit >= trip)) != 0)
+ limit = trip;
+
+ if (p_st != NULL)
+ *p_st = incr;
+
+ if (incr == 1) {
+ *p_lb = start + init;
+ *p_ub = start + limit;
+ } else {
+ *p_lb = start + init * incr;
+ *p_ub = start + limit * incr;
+ }
+
+ if (pr->flags.ordered) {
+ pr->u.p.ordered_lower = init;
+ pr->u.p.ordered_upper = limit;
+ } // if
+ } // if
+ } // case
+ break;
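+ // Illustration only (the numbers are assumptions): the chunk sizes form a
+ // decreasing arithmetic series -- parm2 is the first chunk size and parm4
+ // the per-chunk decrement -- so init is the series sum of the first 'index'
+ // chunks. E.g. with parm2 = 10 and parm4 = 2 the sizes are 10, 8, 6, ...;
+ // for index = 2, init = 2*(20 - 2)/2 = 18 and
+ // limit = 3*(20 - 4)/2 - 1 = 23, i.e. the third chunk covers iterations
+ // 18..23.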
+ default: {
+ status = 0; // to avoid complaints on uninitialized variable use
+ __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
+ KMP_HNT(GetNewerLibrary), // Hint
+ __kmp_msg_null // Variadic argument list terminator
+ );
+ } break;
+ } // switch
+ if (p_last)
+ *p_last = last;
+#ifdef KMP_DEBUG
+ if (pr->flags.ordered) {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
+ "ordered_lower:%%%s ordered_upper:%%%s\n",
+ traits_t<UT>::spec, traits_t<UT>::spec);
+ KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
+ __kmp_str_free(&buff);
+ }
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format(
+ "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
+ "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
+ traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
+ KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
+ __kmp_str_free(&buff);
+ }
+#endif
+ return status;
+}
+
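
Editorial aside: the kmp_sch_trapezoidal case above derives its chunk bounds from an arithmetic series (each chunk shrinks by parm4). The standalone C++ sketch below, with purely illustrative values parm2 = 19, parm3 = 10, parm4 = 2 and tc = 100 (none of these come from the patch), prints the ranges those same formulas produce; it is not part of the runtime.

// Illustrative sketch of the trapezoidal chunk-bound arithmetic (not runtime code).
#include <cstdio>

int main() {
  const unsigned tc = 100;    // assumed trip count
  const unsigned parm2 = 19;  // size of the first (largest) chunk
  const unsigned parm3 = 10;  // number of chunks
  const unsigned parm4 = 2;   // linear decrease of the chunk size
  const unsigned trip = tc - 1;

  for (unsigned index = 0; index < parm3; ++index) {
    // For index == 0 the (index - 1) term underflows but is multiplied by
    // zero, exactly as in the runtime's unsigned arithmetic.
    unsigned init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
    unsigned limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
    if (init > trip)
      break;        // no iterations left for this chunk
    if (limit >= trip)
      limit = trip; // the last chunk is clipped to the trip count
    printf("chunk %2u: iterations [%3u, %3u]\n", index, init, limit);
  }
  return 0;
}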
/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
is not called. */
@@ -1421,7 +1743,6 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;
typedef typename traits_t<T>::floating_t DBL;
-
// This is potentially slightly misleading, schedule(runtime) will appear here
// even if the actual runtime schedule is static. (Which points out a
// disadvantage of schedule(runtime): even when static scheduling is used it
@@ -1434,18 +1755,10 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
kmp_team_t *team = th->th.th_team;
KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
- "p_ub:%%%s p_st:%%%s p_last: %%p\n",
- traits_t<T>::spec, traits_t<T>::spec,
- traits_t<ST>::spec);
- KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
- __kmp_str_free(&buff);
- }
-#endif
+ KD_TRACE(
+ 1000,
+ ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
+ gtid, p_lb, p_ub, p_st, p_last));
if (team->t.t_serialized) {
/* NOTE: serialize this dispatch because we are not at the active level */
@@ -1465,7 +1778,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
}
}
- } else if (pr->nomerge) {
+ } else if (pr->flags.nomerge) {
kmp_int32 last;
T start;
UT limit, trip, init;
@@ -1513,7 +1826,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
*p_ub = start + limit * incr;
}
- if (pr->ordered) {
+ if (pr->flags.ordered) {
pr->u.p.ordered_lower = init;
pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
@@ -1561,10 +1874,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
return status;
} else {
kmp_int32 last = 0;
- dispatch_shared_info_template<UT> *sh;
- T start;
- ST incr;
- UT limit, trip, init;
+ dispatch_shared_info_template<T> volatile *sh;
KMP_DEBUG_ASSERT(th->th.th_dispatch ==
&th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
@@ -1572,735 +1882,14 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
pr = reinterpret_cast<dispatch_private_info_template<T> *>(
th->th.th_dispatch->th_dispatch_pr_current);
KMP_DEBUG_ASSERT(pr);
- sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
+ sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
th->th.th_dispatch->th_dispatch_sh_current);
KMP_DEBUG_ASSERT(sh);
- if (pr->u.p.tc == 0) {
- // zero trip count
- status = 0;
- } else {
- switch (pr->schedule) {
-#if (KMP_STATIC_STEAL_ENABLED)
- case kmp_sch_static_steal: {
- T chunk = pr->u.p.parm1;
- int nproc = th->th.th_team_nproc;
-
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
- gtid));
-
- trip = pr->u.p.tc - 1;
-
- if (traits_t<T>::type_size > 4) {
- // use lock for 8-byte and CAS for 4-byte induction
- // variable. TODO (optional): check and use 16-byte CAS
- kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
- KMP_DEBUG_ASSERT(lck != NULL);
- if (pr->u.p.count < (UT)pr->u.p.ub) {
- __kmp_acquire_lock(lck, gtid);
- // try to get own chunk of iterations
- init = (pr->u.p.count)++;
- status = (init < (UT)pr->u.p.ub);
- __kmp_release_lock(lck, gtid);
- } else {
- status = 0; // no own chunks
- }
- if (!status) { // try to steal
- kmp_info_t **other_threads = team->t.t_threads;
- int while_limit = nproc; // nproc attempts to find a victim
- int while_index = 0;
- // TODO: algorithm of searching for a victim
- // should be cleaned up and measured
- while ((!status) && (while_limit != ++while_index)) {
- T remaining;
- T victimIdx = pr->u.p.parm4;
- T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
- dispatch_private_info_template<T> *victim =
- reinterpret_cast<dispatch_private_info_template<T> *>(
- other_threads[victimIdx]
- ->th.th_dispatch->th_dispatch_pr_current);
- while ((victim == NULL || victim == pr ||
- (*(volatile T *)&victim->u.p.static_steal_counter !=
- *(volatile T *)&pr->u.p.static_steal_counter)) &&
- oldVictimIdx != victimIdx) {
- victimIdx = (victimIdx + 1) % nproc;
- victim = reinterpret_cast<dispatch_private_info_template<T> *>(
- other_threads[victimIdx]
- ->th.th_dispatch->th_dispatch_pr_current);
- }
- if (!victim ||
- (*(volatile T *)&victim->u.p.static_steal_counter !=
- *(volatile T *)&pr->u.p.static_steal_counter)) {
- continue; // try once more (nproc attempts in total)
- // no victim is ready yet to participate in stealing
- // because all victims are still in kmp_init_dispatch
- }
- if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
- pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
- continue; // not enough chunks to steal, goto next victim
- }
-
- lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
- KMP_ASSERT(lck != NULL);
- __kmp_acquire_lock(lck, gtid);
- limit = victim->u.p.ub; // keep initial ub
- if (victim->u.p.count >= limit ||
- (remaining = limit - victim->u.p.count) < 2) {
- __kmp_release_lock(lck, gtid);
- pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
- continue; // not enough chunks to steal
- }
- // stealing succeded, reduce victim's ub by 1/4 of undone chunks
- // or by 1
- if (remaining > 3) {
- KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
- init = (victim->u.p.ub -=
- (remaining >> 2)); // steal 1/4 of remaining
- } else {
- KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
- init =
- (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
- }
- __kmp_release_lock(lck, gtid);
-
- KMP_DEBUG_ASSERT(init + 1 <= limit);
- pr->u.p.parm4 = victimIdx; // remember victim to steal from
- status = 1;
- while_index = 0;
- // now update own count and ub with stolen range but init chunk
- __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
- pr->u.p.count = init + 1;
- pr->u.p.ub = limit;
- __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
- } // while (search for victim)
- } // if (try to find victim and steal)
- } else {
- // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
- typedef union {
- struct {
- UT count;
- T ub;
- } p;
- kmp_int64 b;
- } union_i4;
- // All operations on 'count' or 'ub' must be combined atomically
- // together.
- {
- union_i4 vold, vnew;
- vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
- vnew = vold;
- vnew.p.count++;
- while (!KMP_COMPARE_AND_STORE_ACQ64(
- (volatile kmp_int64 *)&pr->u.p.count,
- *VOLATILE_CAST(kmp_int64 *) & vold.b,
- *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
- KMP_CPU_PAUSE();
- vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
- vnew = vold;
- vnew.p.count++;
- }
- vnew = vold;
- init = vnew.p.count;
- status = (init < (UT)vnew.p.ub);
- }
-
- if (!status) {
- kmp_info_t **other_threads = team->t.t_threads;
- int while_limit = nproc; // nproc attempts to find a victim
- int while_index = 0;
-
- // TODO: algorithm of searching for a victim
- // should be cleaned up and measured
- while ((!status) && (while_limit != ++while_index)) {
- union_i4 vold, vnew;
- kmp_int32 remaining;
- T victimIdx = pr->u.p.parm4;
- T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
- dispatch_private_info_template<T> *victim =
- reinterpret_cast<dispatch_private_info_template<T> *>(
- other_threads[victimIdx]
- ->th.th_dispatch->th_dispatch_pr_current);
- while ((victim == NULL || victim == pr ||
- (*(volatile T *)&victim->u.p.static_steal_counter !=
- *(volatile T *)&pr->u.p.static_steal_counter)) &&
- oldVictimIdx != victimIdx) {
- victimIdx = (victimIdx + 1) % nproc;
- victim = reinterpret_cast<dispatch_private_info_template<T> *>(
- other_threads[victimIdx]
- ->th.th_dispatch->th_dispatch_pr_current);
- }
- if (!victim ||
- (*(volatile T *)&victim->u.p.static_steal_counter !=
- *(volatile T *)&pr->u.p.static_steal_counter)) {
- continue; // try once more (nproc attempts in total)
- // no victim is ready yet to participate in stealing
- // because all victims are still in kmp_init_dispatch
- }
- pr->u.p.parm4 = victimIdx; // new victim found
- while (1) { // CAS loop if victim has enough chunks to steal
- vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
- vnew = vold;
-
- KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
- if (vnew.p.count >= (UT)vnew.p.ub ||
- (remaining = vnew.p.ub - vnew.p.count) < 2) {
- pr->u.p.parm4 =
- (victimIdx + 1) % nproc; // shift start victim id
- break; // not enough chunks to steal, goto next victim
- }
- if (remaining > 3) {
- vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
- } else {
- vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
- }
- KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
- // TODO: Should this be acquire or release?
- if (KMP_COMPARE_AND_STORE_ACQ64(
- (volatile kmp_int64 *)&victim->u.p.count,
- *VOLATILE_CAST(kmp_int64 *) & vold.b,
- *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
- // stealing succeeded
- KMP_COUNT_VALUE(FOR_static_steal_stolen,
- vold.p.ub - vnew.p.ub);
- status = 1;
- while_index = 0;
- // now update own count and ub
- init = vnew.p.ub;
- vold.p.count = init + 1;
-#if KMP_ARCH_X86
- KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
- vold.b);
-#else
- *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
-#endif
- break;
- } // if (check CAS result)
- KMP_CPU_PAUSE(); // CAS failed, repeat attempt
- } // while (try to steal from particular victim)
- } // while (search for victim)
- } // if (try to find victim and steal)
- } // if (4-byte induction variable)
- if (!status) {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } else {
- start = pr->u.p.parm2;
- init *= chunk;
- limit = chunk + init - 1;
- incr = pr->u.p.st;
- KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
-
- KMP_DEBUG_ASSERT(init <= trip);
- if ((last = (limit >= trip)) != 0)
- limit = trip;
- if (p_st != NULL)
- *p_st = incr;
-
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
-
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } // if
- break;
- } // case
-#endif // ( KMP_STATIC_STEAL_ENABLED )
- case kmp_sch_static_balanced: {
- KD_TRACE(
- 100,
- ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
- if ((status = !pr->u.p.count) !=
- 0) { /* check if thread has any iteration to do */
- pr->u.p.count = 1;
- *p_lb = pr->u.p.lb;
- *p_ub = pr->u.p.ub;
- last = pr->u.p.parm1;
- if (p_st != NULL)
- *p_st = pr->u.p.st;
- } else { /* no iterations to do */
- pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
- }
- if (pr->ordered) {
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } // case
- break;
- case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
- merged here */
- case kmp_sch_static_chunked: {
- T parm1;
-
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
- "kmp_sch_static_[affinity|chunked] case\n",
- gtid));
- parm1 = pr->u.p.parm1;
-
- trip = pr->u.p.tc - 1;
- init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
-
- if ((status = (init <= trip)) != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- limit = parm1 + init - 1;
-
- if ((last = (limit >= trip)) != 0)
- limit = trip;
-
- if (p_st != NULL)
- *p_st = incr;
-
- pr->u.p.count += th->th.th_team_nproc;
-
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
-
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } // if
- } // case
- break;
-
- case kmp_sch_dynamic_chunked: {
- T chunk = pr->u.p.parm1;
-
- KD_TRACE(
- 100,
- ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
-
- init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
- trip = pr->u.p.tc - 1;
-
- if ((status = (init <= trip)) == 0) {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } else {
- start = pr->u.p.lb;
- limit = chunk + init - 1;
- incr = pr->u.p.st;
-
- if ((last = (limit >= trip)) != 0)
- limit = trip;
-
- if (p_st != NULL)
- *p_st = incr;
-
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
-
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } // if
- } // case
- break;
-
- case kmp_sch_guided_iterative_chunked: {
- T chunkspec = pr->u.p.parm1;
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
- "iterative case\n",
- gtid));
- trip = pr->u.p.tc;
- // Start atomic part of calculations
- while (1) {
- ST remaining; // signed, because can be < 0
- init = sh->u.s.iteration; // shared value
- remaining = trip - init;
- if (remaining <= 0) { // AC: need to compare with 0 first
- // nothing to do, don't try atomic op
- status = 0;
- break;
- }
- if ((T)remaining <
- pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
- // use dynamic-style shcedule
- // atomically inrement iterations, get old value
- init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)chunkspec);
- remaining = trip - init;
- if (remaining <= 0) {
- status = 0; // all iterations got by other threads
- } else { // got some iterations to work on
- status = 1;
- if ((T)remaining > chunkspec) {
- limit = init + chunkspec - 1;
- } else {
- last = 1; // the last chunk
- limit = init + remaining - 1;
- } // if
- } // if
- break;
- } // if
- limit = init + (UT)(remaining *
- *(double *)&pr->u.p.parm3); // divide by K*nproc
- if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)init, (ST)limit)) {
- // CAS was successful, chunk obtained
- status = 1;
- --limit;
- break;
- } // if
- } // while
- if (status != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if (p_st != NULL)
- *p_st = incr;
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } else {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } // if
- } // case
- break;
-
- case kmp_sch_guided_simd: {
- // same as iterative but curr-chunk adjusted to be multiple of given
- // chunk
- T chunk = pr->u.p.parm1;
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
- gtid));
- trip = pr->u.p.tc;
- // Start atomic part of calculations
- while (1) {
- ST remaining; // signed, because can be < 0
- init = sh->u.s.iteration; // shared value
- remaining = trip - init;
- if (remaining <= 0) { // AC: need to compare with 0 first
- status = 0; // nothing to do, don't try atomic op
- break;
- }
- KMP_DEBUG_ASSERT(init % chunk == 0);
- // compare with K*nproc*(chunk+1), K=2 by default
- if ((T)remaining < pr->u.p.parm2) {
- // use dynamic-style shcedule
- // atomically inrement iterations, get old value
- init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)chunk);
- remaining = trip - init;
- if (remaining <= 0) {
- status = 0; // all iterations got by other threads
- } else {
- // got some iterations to work on
- status = 1;
- if ((T)remaining > chunk) {
- limit = init + chunk - 1;
- } else {
- last = 1; // the last chunk
- limit = init + remaining - 1;
- } // if
- } // if
- break;
- } // if
- // divide by K*nproc
- UT span = remaining * (*(double *)&pr->u.p.parm3);
- UT rem = span % chunk;
- if (rem) // adjust so that span%chunk == 0
- span += chunk - rem;
- limit = init + span;
- if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
- (ST)init, (ST)limit)) {
- // CAS was successful, chunk obtained
- status = 1;
- --limit;
- break;
- } // if
- } // while
- if (status != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if (p_st != NULL)
- *p_st = incr;
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } else {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } // if
- } // case
- break;
-
- case kmp_sch_guided_analytical_chunked: {
- T chunkspec = pr->u.p.parm1;
- UT chunkIdx;
-#if KMP_OS_WINDOWS && KMP_ARCH_X86
- /* for storing original FPCW value for Windows* OS on
- IA-32 architecture 8-byte version */
- unsigned int oldFpcw;
- unsigned int fpcwSet = 0;
-#endif
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
- "analytical case\n",
- gtid));
-
- trip = pr->u.p.tc;
-
- KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
- KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
- trip);
-
- while (1) { /* this while loop is a safeguard against unexpected zero
- chunk sizes */
- chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
- if (chunkIdx >= (UT)pr->u.p.parm2) {
- --trip;
- /* use dynamic-style scheduling */
- init = chunkIdx * chunkspec + pr->u.p.count;
- /* need to verify init > 0 in case of overflow in the above
- * calculation */
- if ((status = (init > 0 && init <= trip)) != 0) {
- limit = init + chunkspec - 1;
-
- if ((last = (limit >= trip)) != 0)
- limit = trip;
- }
- break;
- } else {
-/* use exponential-style scheduling */
-/* The following check is to workaround the lack of long double precision on
- Windows* OS.
- This check works around the possible effect that init != 0 for chunkIdx == 0.
- */
-#if KMP_OS_WINDOWS && KMP_ARCH_X86
- /* If we haven't already done so, save original FPCW and set
- precision to 64-bit, as Windows* OS on IA-32 architecture
- defaults to 53-bit */
- if (!fpcwSet) {
- oldFpcw = _control87(0, 0);
- _control87(_PC_64, _MCW_PC);
- fpcwSet = 0x30000;
- }
-#endif
- if (chunkIdx) {
- init = __kmp_dispatch_guided_remaining<T>(
- trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
- KMP_DEBUG_ASSERT(init);
- init = trip - init;
- } else
- init = 0;
- limit = trip - __kmp_dispatch_guided_remaining<T>(
- trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
- KMP_ASSERT(init <= limit);
- if (init < limit) {
- KMP_DEBUG_ASSERT(limit <= trip);
- --limit;
- status = 1;
- break;
- } // if
- } // if
- } // while (1)
-#if KMP_OS_WINDOWS && KMP_ARCH_X86
- /* restore FPCW if necessary
- AC: check fpcwSet flag first because oldFpcw can be uninitialized
- here */
- if (fpcwSet && (oldFpcw & fpcwSet))
- _control87(oldFpcw, _MCW_PC);
-#endif
- if (status != 0) {
- start = pr->u.p.lb;
- incr = pr->u.p.st;
- if (p_st != NULL)
- *p_st = incr;
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- }
- } else {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- }
- } // case
- break;
-
- case kmp_sch_trapezoidal: {
- UT index;
- T parm2 = pr->u.p.parm2;
- T parm3 = pr->u.p.parm3;
- T parm4 = pr->u.p.parm4;
- KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
- gtid));
-
- index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
-
- init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
- trip = pr->u.p.tc - 1;
-
- if ((status = ((T)index < parm3 && init <= trip)) == 0) {
- *p_lb = 0;
- *p_ub = 0;
- if (p_st != NULL)
- *p_st = 0;
- } else {
- start = pr->u.p.lb;
- limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
- incr = pr->u.p.st;
-
- if ((last = (limit >= trip)) != 0)
- limit = trip;
-
- if (p_st != NULL)
- *p_st = incr;
-
- if (incr == 1) {
- *p_lb = start + init;
- *p_ub = start + limit;
- } else {
- *p_lb = start + init * incr;
- *p_ub = start + limit * incr;
- }
-
- if (pr->ordered) {
- pr->u.p.ordered_lower = init;
- pr->u.p.ordered_upper = limit;
-#ifdef KMP_DEBUG
- {
- char *buff;
- // create format specifiers before the debug output
- buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
- "ordered_lower:%%%s ordered_upper:%%%s\n",
- traits_t<UT>::spec, traits_t<UT>::spec);
- KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
- pr->u.p.ordered_upper));
- __kmp_str_free(&buff);
- }
-#endif
- } // if
- } // if
- } // case
- break;
- default: {
- status = 0; // to avoid complaints on uninitialized variable use
- __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
- KMP_HNT(GetNewerLibrary), // Hint
- __kmp_msg_null // Variadic argument list terminator
- );
- } break;
- } // switch
- } // if tc == 0;
-
+ status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
+ p_st, th->th.th_team_nproc,
+ th->th.th_info.ds.ds_tid);
+ // status == 0: no more iterations to execute
if (status == 0) {
UT num_done;
@@ -2312,7 +1901,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
traits_t<UT>::spec);
- KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
+ KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
__kmp_str_free(&buff);
}
#endif
@@ -2341,7 +1930,7 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
sh->u.s.iteration = 0;
/* TODO replace with general release procedure? */
- if (pr->ordered) {
+ if (pr->flags.ordered) {
sh->u.s.ordered_iteration = 0;
}
@@ -2380,9 +1969,10 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
// create format specifiers before the debug output
buff = __kmp_str_format(
"__kmp_dispatch_next: T#%%d normal case: "
- "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
+ "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
- KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
+ KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
+ (p_last ? *p_last : 0), status));
__kmp_str_free(&buff);
}
#endif
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
new file mode 100644
index 00000000000..d5dba0f442a
--- /dev/null
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -0,0 +1,507 @@
+/*
+ * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
+ */
+
+//===----------------------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KMP_DISPATCH_H
+#define KMP_DISPATCH_H
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+// Need to raise Win version from XP to Vista here for support of
+// InterlockedExchange64
+#if defined(_WIN32_WINNT) && defined(_M_IX86)
+#undef _WIN32_WINNT
+#define _WIN32_WINNT 0x0502
+#endif
+
+#include "kmp.h"
+#include "kmp_error.h"
+#include "kmp_i18n.h"
+#include "kmp_itt.h"
+#include "kmp_stats.h"
+#include "kmp_str.h"
+#if KMP_OS_WINDOWS && KMP_ARCH_X86
+#include <float.h>
+#endif
+
+#if OMPT_SUPPORT
+#include "ompt-internal.h"
+#include "ompt-specific.h"
+#endif
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+template <typename T> struct dispatch_shared_info_template;
+template <typename T> struct dispatch_private_info_template;
+
+template <typename T>
+extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
+ dispatch_private_info_template<T> *pr,
+ enum sched_type schedule, T lb, T ub,
+ typename traits_t<T>::signed_t st,
+#if USE_ITT_BUILD
+ kmp_uint64 *cur_chunk,
+#endif
+ typename traits_t<T>::signed_t chunk,
+ T nproc, T unit_id);
+template <typename T>
+extern int __kmp_dispatch_next_algorithm(
+ int gtid, dispatch_private_info_template<T> *pr,
+ dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
+ T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
+
+void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
+
+#if KMP_STATIC_STEAL_ENABLED
+
+// replaces dispatch_private_info{32,64} structures and
+// dispatch_private_info{32,64}_t types
+template <typename T> struct dispatch_private_infoXX_template {
+ typedef typename traits_t<T>::unsigned_t UT;
+ typedef typename traits_t<T>::signed_t ST;
+ UT count; // unsigned
+ T ub;
+ /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
+ T lb;
+ ST st; // signed
+ UT tc; // unsigned
+ T static_steal_counter; // for static_steal only; maybe better to put after ub
+
+ /* parm[1-4] are used in different ways by different scheduling algorithms */
+
+ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
+ // a) parm3 is properly aligned and
+ // b) all parm1-4 are in the same cache line.
+ // Because parm1-4 are used together, performance seems to be better
+ // if they are in the same line (not measured, though).
+
+ struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
+ T parm1;
+ T parm2;
+ T parm3;
+ T parm4;
+ };
+
+ UT ordered_lower; // unsigned
+ UT ordered_upper; // unsigned
+#if KMP_OS_WINDOWS
+ T last_upper;
+#endif /* KMP_OS_WINDOWS */
+};
+
+#else /* KMP_STATIC_STEAL_ENABLED */
+
+// replaces dispatch_private_info{32,64} structures and
+// dispatch_private_info{32,64}_t types
+template <typename T> struct dispatch_private_infoXX_template {
+ typedef typename traits_t<T>::unsigned_t UT;
+ typedef typename traits_t<T>::signed_t ST;
+ T lb;
+ T ub;
+ ST st; // signed
+ UT tc; // unsigned
+
+ T parm1;
+ T parm2;
+ T parm3;
+ T parm4;
+
+ UT count; // unsigned
+
+ UT ordered_lower; // unsigned
+ UT ordered_upper; // unsigned
+#if KMP_OS_WINDOWS
+ T last_upper;
+#endif /* KMP_OS_WINDOWS */
+};
+#endif /* KMP_STATIC_STEAL_ENABLED */
+
+template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
+ // duplicate the alignment here; otherwise the size of the structure is not
+ // correct in our compiler
+ union KMP_ALIGN_CACHE private_info_tmpl {
+ dispatch_private_infoXX_template<T> p;
+ dispatch_private_info64_t p64;
+ } u;
+ enum sched_type schedule; /* scheduling algorithm */
+ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
+ kmp_uint32 ordered_bumped;
+ // to retain the structure size after making ordered_iteration scalar
+ kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
+ dispatch_private_info *next; /* stack of buffers for nest of serial regions */
+ kmp_uint32 type_size;
+ enum cons_type pushed_ws;
+};
+
+// replaces dispatch_shared_info{32,64} structures and
+// dispatch_shared_info{32,64}_t types
+template <typename T> struct dispatch_shared_infoXX_template {
+ typedef typename traits_t<T>::unsigned_t UT;
+ /* chunk index under dynamic, number of idle threads under static-steal;
+ iteration index otherwise */
+ volatile UT iteration;
+ volatile UT num_done;
+ volatile UT ordered_iteration;
+ // to retain the structure size after making ordered_iteration scalar
+ UT ordered_dummy[KMP_MAX_ORDERED - 3];
+};
+
+// replaces dispatch_shared_info structure and dispatch_shared_info_t type
+template <typename T> struct dispatch_shared_info_template {
+ typedef typename traits_t<T>::unsigned_t UT;
+ // we need union here to keep the structure size
+ union shared_info_tmpl {
+ dispatch_shared_infoXX_template<UT> s;
+ dispatch_shared_info64_t s64;
+ } u;
+ volatile kmp_uint32 buffer_index;
+#if OMP_45_ENABLED
+ volatile kmp_int32 doacross_buf_idx; // teamwise index
+ kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
+ kmp_int32 doacross_num_done; // count finished threads
+#endif
+#if KMP_USE_HWLOC
+ // When linking with libhwloc, the ORDERED EPCC test slows down on big
+ // machines (> 48 cores). Performance analysis showed that cache thrashing
+ // was occurring and this padding helps alleviate the problem.
+ char padding[64];
+#endif
+};
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+#undef USE_TEST_LOCKS
+
+// test_then_add template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
+
+template <>
+__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
+ kmp_int32 d) {
+ kmp_int32 r;
+ r = KMP_TEST_THEN_ADD32(p, d);
+ return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
+ kmp_int64 d) {
+ kmp_int64 r;
+ r = KMP_TEST_THEN_ADD64(p, d);
+ return r;
+}
+
+// test_then_inc_acq template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
+
+template <>
+__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
+ kmp_int32 r;
+ r = KMP_TEST_THEN_INC_ACQ32(p);
+ return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
+ kmp_int64 r;
+ r = KMP_TEST_THEN_INC_ACQ64(p);
+ return r;
+}
+
+// test_then_inc template (general template should NOT be used)
+template <typename T> static __forceinline T test_then_inc(volatile T *p);
+
+template <>
+__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
+ kmp_int32 r;
+ r = KMP_TEST_THEN_INC32(p);
+ return r;
+}
+
+template <>
+__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
+ kmp_int64 r;
+ r = KMP_TEST_THEN_INC64(p);
+ return r;
+}
+
+// compare_and_swap template (general template should NOT be used)
+template <typename T>
+static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
+
+template <>
+__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
+ kmp_int32 c, kmp_int32 s) {
+ return KMP_COMPARE_AND_STORE_REL32(p, c, s);
+}
+
+template <>
+__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
+ kmp_int64 c, kmp_int64 s) {
+ return KMP_COMPARE_AND_STORE_REL64(p, c, s);
+}
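
For context, the sketch below shows the kind of compare-and-swap retry loop these helpers back, as used in the guided-iterative case where a thread claims [init, limit) from the shared iteration counter. It is a minimal sketch using std::atomic rather than the KMP_COMPARE_AND_STORE_* macros; claim_chunk and its parameters are illustrative names, not runtime APIs.

// Self-contained CAS retry sketch (not runtime code).
#include <atomic>
#include <cstdio>

static std::atomic<int> shared_iteration{0};

// Try to advance the shared counter from its current value by 'want'.
// Returns true and fills init/limit on success; a failed CAS means another
// thread advanced the counter first, so we simply retry.
bool claim_chunk(int want, int trip, int *init, int *limit) {
  while (true) {
    int cur = shared_iteration.load(std::memory_order_relaxed);
    if (cur >= trip)
      return false; // nothing left to claim
    int next = (cur + want < trip) ? cur + want : trip;
    if (shared_iteration.compare_exchange_weak(cur, next,
                                               std::memory_order_release,
                                               std::memory_order_relaxed)) {
      *init = cur;
      *limit = next;
      return true;
    }
    // CAS failed: loop and try again with the reloaded value.
  }
}

int main() {
  int init, limit;
  while (claim_chunk(8, 30, &init, &limit))
    printf("claimed [%d, %d)\n", init, limit);
  return 0;
}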
+
+template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
+ return value >= checker;
+}
+template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
+ return value == checker;
+}
+
+/*
+ Spin-wait loop that first pauses, then yields.
+ Waits until the predicate returns non-zero when called with *spinner and
+ check. Does NOT put threads to sleep.
+ Arguments:
+ UT is an unsigned 4- or 8-byte type
+ spinner - memory location whose value is checked
+ checker - value against which *spinner is compared (>, <, ==, etc.)
+ pred - predicate function performing the binary comparison
+#if USE_ITT_BUILD
+ obj -- higher-level synchronization object to report to ittnotify. It is
+ used to report locks consistently. For example, if a lock is acquired
+ immediately, its address is reported to ittnotify via KMP_FSYNC_ACQUIRED().
+ However, if the lock cannot be acquired immediately and the lock routine
+ calls KMP_WAIT_YIELD(), the latter should report the same address, not the
+ address of the low-level spinner.
+#endif // USE_ITT_BUILD
+ TODO: make inline function (move to header file for icl)
+*/
+template <typename UT>
+static UT __kmp_wait_yield(volatile UT *spinner, UT checker,
+ kmp_uint32 (*pred)(UT, UT)
+ USE_ITT_BUILD_ARG(void *obj)) {
+ // note: we may not belong to a team at this point
+ volatile UT *spin = spinner;
+ UT check = checker;
+ kmp_uint32 spins;
+ kmp_uint32 (*f)(UT, UT) = pred;
+ UT r;
+
+ KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
+ KMP_INIT_YIELD(spins);
+ // main wait spin loop
+ while (!f(r = *spin, check)) {
+ KMP_FSYNC_SPIN_PREPARE(obj);
+ /* GEH - remove this since it was accidentally introduced when kmp_wait was
+ split.
+ It causes problems with infinite recursion because of exit lock */
+ /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
+ __kmp_abort_thread(); */
+
+ // if we are oversubscribed,
+ // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
+ // the pause is in the code that follows
+ KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
+ KMP_YIELD_SPIN(spins);
+ }
+ KMP_FSYNC_SPIN_ACQUIRED(obj);
+ return r;
+}
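
The snippet below is only a generic illustration of the pause-then-yield idea behind __kmp_wait_yield, under the assumption that std::this_thread::yield and a fixed spin budget stand in for KMP_YIELD/KMP_YIELD_SPIN and the runtime's tuning; it is not the runtime implementation.

// Generic spin-then-yield sketch (assumptions noted above).
#include <atomic>
#include <thread>

template <typename T, typename Pred>
T wait_yield(const std::atomic<T> &spinner, T checker, Pred pred) {
  const int spin_budget = 4096; // illustrative; the runtime tunes this
  int spins = 0;
  T r;
  while (!pred(r = spinner.load(std::memory_order_acquire), checker)) {
    if (++spins > spin_budget)
      std::this_thread::yield(); // back off after spinning for a while
  }
  return r;
}

int main() {
  std::atomic<int> flag{0};
  std::thread t([&] { flag.store(7, std::memory_order_release); });
  int v = wait_yield(flag, 7, [](int a, int b) { return a >= b; });
  t.join();
  return v == 7 ? 0 : 1;
}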
+
+/* ------------------------------------------------------------------------ */
+/* ------------------------------------------------------------------------ */
+
+template <typename UT>
+void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+ typedef typename traits_t<UT>::signed_t ST;
+ dispatch_private_info_template<UT> *pr;
+
+ int gtid = *gtid_ref;
+ // int cid = *cid_ref;
+ kmp_info_t *th = __kmp_threads[gtid];
+ KMP_DEBUG_ASSERT(th->th.th_dispatch);
+
+ KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
+ if (__kmp_env_consistency_check) {
+ pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+ th->th.th_dispatch->th_dispatch_pr_current);
+ if (pr->pushed_ws != ct_none) {
+#if KMP_USE_DYNAMIC_LOCK
+ __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
+#else
+ __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
+#endif
+ }
+ }
+
+ if (!th->th.th_team->t.t_serialized) {
+ dispatch_shared_info_template<UT> *sh =
+ reinterpret_cast<dispatch_shared_info_template<UT> *>(
+ th->th.th_dispatch->th_dispatch_sh_current);
+ UT lower;
+
+ if (!__kmp_env_consistency_check) {
+ pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+ th->th.th_dispatch->th_dispatch_pr_current);
+ }
+ lower = pr->u.p.ordered_lower;
+
+#if !defined(KMP_GOMP_COMPAT)
+ if (__kmp_env_consistency_check) {
+ if (pr->ordered_bumped) {
+ struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+ __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
+ ct_ordered_in_pdo, loc_ref,
+ &p->stack_data[p->w_top]);
+ }
+ }
+#endif /* !defined(KMP_GOMP_COMPAT) */
+
+ KMP_MB();
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
+ "ordered_iter:%%%s lower:%%%s\n",
+ traits_t<UT>::spec, traits_t<UT>::spec);
+ KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
+ __kmp_str_free(&buff);
+ }
+#endif
+ __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
+ __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
+ KMP_MB(); /* is this necessary? */
+#ifdef KMP_DEBUG
+ {
+ char *buff;
+ // create format specifiers before the debug output
+ buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
+ "ordered_iter:%%%s lower:%%%s\n",
+ traits_t<UT>::spec, traits_t<UT>::spec);
+ KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
+ __kmp_str_free(&buff);
+ }
+#endif
+ }
+ KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
+}
+
+template <typename UT>
+void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
+ typedef typename traits_t<UT>::signed_t ST;
+ dispatch_private_info_template<UT> *pr;
+
+ int gtid = *gtid_ref;
+ // int cid = *cid_ref;
+ kmp_info_t *th = __kmp_threads[gtid];
+ KMP_DEBUG_ASSERT(th->th.th_dispatch);
+
+ KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
+ if (__kmp_env_consistency_check) {
+ pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+ th->th.th_dispatch->th_dispatch_pr_current);
+ if (pr->pushed_ws != ct_none) {
+ __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
+ }
+ }
+
+ if (!th->th.th_team->t.t_serialized) {
+ dispatch_shared_info_template<UT> *sh =
+ reinterpret_cast<dispatch_shared_info_template<UT> *>(
+ th->th.th_dispatch->th_dispatch_sh_current);
+
+ if (!__kmp_env_consistency_check) {
+ pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
+ th->th.th_dispatch->th_dispatch_pr_current);
+ }
+
+ KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
+#if !defined(KMP_GOMP_COMPAT)
+ if (__kmp_env_consistency_check) {
+ if (pr->ordered_bumped != 0) {
+ struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
+ /* How to test it? - OM */
+ __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
+ ct_ordered_in_pdo, loc_ref,
+ &p->stack_data[p->w_top]);
+ }
+ }
+#endif /* !defined(KMP_GOMP_COMPAT) */
+
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+
+ pr->ordered_bumped += 1;
+
+ KD_TRACE(1000,
+ ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
+ gtid, pr->ordered_bumped));
+
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+
+ /* TODO use general release procedure? */
+ test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
+
+ KMP_MB(); /* Flush all pending memory write invalidates. */
+ }
+ KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
+}
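
Conceptually, __kmp_dispatch_deo waits until the shared ordered iteration counter reaches the chunk's lower bound, and __kmp_dispatch_dxo bumps it on the way out so the next chunk owner can proceed. The sketch below models that handshake with a plain std::atomic counter; the names, thread count, and data layout are illustrative, not the runtime's shared buffers.

// Conceptual ordered-section handshake sketch (not runtime code).
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

static std::atomic<unsigned> ordered_iteration{0};

void run_ordered_chunk(unsigned my_lower) {
  // "deo": wait for our turn
  while (ordered_iteration.load(std::memory_order_acquire) < my_lower)
    std::this_thread::yield();
  printf("ordered chunk %u\n", my_lower); // body of the ordered region
  // "dxo": release the next chunk
  ordered_iteration.fetch_add(1, std::memory_order_release);
}

int main() {
  std::vector<std::thread> pool;
  for (unsigned i = 0; i < 4; ++i)
    pool.emplace_back(run_ordered_chunk, i);
  for (auto &t : pool)
    t.join();
  return 0;
}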
+
+/* Computes and returns x to the power of y, where y must be a non-negative
+   integer. */
+template <typename UT>
+static __forceinline long double __kmp_pow(long double x, UT y) {
+ long double s = 1.0L;
+
+ KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
+ // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
+ while (y) {
+ if (y & 1)
+ s *= x;
+ x *= x;
+ y >>= 1;
+ }
+ return s;
+}
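
As a sanity check, the standalone snippet below (illustrative only; the base value is an assumption) compares the binary exponentiation used above against naive repeated multiplication for a base in (0, 1), the range the assertion enforces.

// Standalone check of exponentiation by squaring (not runtime code).
#include <cmath>
#include <cstdio>

static long double pow_by_squaring(long double x, unsigned y) {
  long double s = 1.0L;
  while (y) {
    if (y & 1)
      s *= x; // fold in the contribution of the current bit of y
    x *= x;   // square the base for the next bit
    y >>= 1;
  }
  return s;
}

int main() {
  const long double base = 0.75L; // any value in (0, 1)
  for (unsigned y = 0; y < 10; ++y) {
    long double naive = 1.0L;
    for (unsigned i = 0; i < y; ++i)
      naive *= base;
    long double fast = pow_by_squaring(base, y);
    std::printf("y=%u squaring=%.12Lf naive=%.12Lf diff=%.3Le\n", y, fast,
                naive, std::fabs(fast - naive));
  }
  return 0;
}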
+
+/* Computes and returns the number of unassigned iterations after idx chunks
+   have been assigned (the total number of unassigned iterations in chunks
+   with an index greater than or equal to idx).
+   __forceinline appears to be broken here: if this function is marked
+   __forceinline, the behavior is wrong (one of the unit tests,
+   sch_guided_analytical_basic.cpp, fails).
+*/
+template <typename T>
+static __inline typename traits_t<T>::unsigned_t
+__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
+ typename traits_t<T>::unsigned_t idx) {
+ /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
+ least for ICL 8.1, long double arithmetic may not really have
+ long double precision, even with /Qlong_double. Currently, we work around
+ that in the caller code by manipulating the FPCW for
+ Windows* OS on IA-32 architecture. The lack of precision is not
+ expected to be a correctness issue, though.
+ */
+ typedef typename traits_t<T>::unsigned_t UT;
+
+ long double x = tc * __kmp_pow<UT>(base, idx);
+ UT r = (UT)x;
+ if (x == r)
+ return r;
+ return r + 1;
+}
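
A worked example may help: with an assumed trip count tc = 1000 and base = 0.875 (i.e. 1 - 1/(2*nproc) for nproc = 4, chosen only for illustration), remaining(idx) = ceil(tc * base^idx), and the chunk handed out at step idx is remaining(idx) - remaining(idx + 1). The sketch below is not runtime code.

// Worked example of the exponential "remaining iterations" curve.
#include <cstdio>

static unsigned remaining(unsigned tc, long double base, unsigned idx) {
  long double x = tc;
  for (unsigned i = 0; i < idx; ++i)
    x *= base;                 // tc * base^idx, naive form of __kmp_pow
  unsigned r = (unsigned)x;
  return (x == r) ? r : r + 1; // round up, as the runtime does
}

int main() {
  const unsigned tc = 1000;        // assumed trip count
  const long double base = 0.875L; // assumed: 1 - 1/(2*nproc), nproc = 4
  for (unsigned idx = 0; idx < 8; ++idx)
    printf("idx=%u remaining=%u chunk=%u\n", idx, remaining(tc, base, idx),
           remaining(tc, base, idx) - remaining(tc, base, idx + 1));
  return 0;
}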
+
+// Parameters of the guided-iterative algorithm:
+// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
+// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
+// By default n = 2. For example, with n = 3 the chunk distribution will be
+// flatter.
+// With n = 1 the first chunk is the same as for a static schedule, i.e.,
+// trip / nproc.
+static const int guided_int_param = 2;
+static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
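
A small numeric sketch of these two parameters, assuming nproc = 8, chunk = 4, and a trip count of 1000 (all values chosen only for illustration, not taken from the patch):

// Numeric sketch of the guided-iterative parameters (not runtime code).
#include <cstdio>

int main() {
  const int n = 2;     // guided_int_param
  const int nproc = 8; // assumed team size
  const int chunk = 4; // assumed chunk from the schedule clause
  const long p2 = (long)n * nproc * (chunk + 1); // switch point to dynamic
  const double p3 = 1.0 / (n * nproc);           // fraction handed out per grab
  printf("p2 = %ld iterations, p3 = %.4f\n", p2, p3);
  // While more than p2 iterations remain, each request takes remaining * p3
  // iterations; once fewer than p2 remain, plain dynamic chunks of size
  // 'chunk' are handed out instead.
  long remaining = 1000; // assumed trip count
  while (remaining > p2) {
    long grab = (long)(remaining * p3);
    printf("guided grab: %ld (remaining %ld)\n", grab, remaining);
    remaining -= grab;
  }
  printf("switch to dynamic chunks of %d with %ld left\n", chunk, remaining);
  return 0;
}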
+#endif // KMP_DISPATCH_H