-rw-r--r--   openmp/runtime/src/dllexports         4
-rw-r--r--   openmp/runtime/src/kmp.h             26
-rw-r--r--   openmp/runtime/src/kmp_csupport.c   289
-rw-r--r--   openmp/runtime/src/kmp_dispatch.cpp   7
-rw-r--r--   openmp/runtime/src/kmp_runtime.c     23
5 files changed, 341 insertions(+), 8 deletions(-)
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index a8a70c0a364..6ff52521e50 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -389,6 +389,10 @@ kmpc_set_defaults 224
%ifdef OMP_41
__kmpc_proxy_task_completed 259
__kmpc_proxy_task_completed_ooo 260
+ __kmpc_doacross_init 261
+ __kmpc_doacross_wait 262
+ __kmpc_doacross_post 263
+ __kmpc_doacross_fini 264
%endif
%endif
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 71bb323159f..140bdd8caef 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1665,7 +1665,7 @@ typedef struct dispatch_shared_info64 {
volatile kmp_uint64 iteration;
volatile kmp_uint64 num_done;
volatile kmp_uint64 ordered_iteration;
- kmp_int64 ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+ kmp_int64 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
} dispatch_shared_info64_t;
typedef struct dispatch_shared_info {
@@ -1673,8 +1673,12 @@ typedef struct dispatch_shared_info {
dispatch_shared_info32_t s32;
dispatch_shared_info64_t s64;
} u;
-/* volatile kmp_int32 dispatch_abort; depricated */
volatile kmp_uint32 buffer_index;
+#if OMP_41_ENABLED
+ volatile kmp_int32 doacross_buf_idx; // teamwise index
+ volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
+ kmp_int32 doacross_num_done; // count finished threads
+#endif
} dispatch_shared_info_t;
typedef struct kmp_disp {
@@ -1688,7 +1692,13 @@ typedef struct kmp_disp {
dispatch_private_info_t *th_disp_buffer;
kmp_int32 th_disp_index;
+#if OMP_41_ENABLED
+ kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
+ volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
+ kmp_int64 *th_doacross_info; // info on loop bounds
+#else
void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
+#endif
#if KMP_USE_INTERNODE_ALIGNMENT
char more_padding[INTERNODE_CACHE_LINE];
#endif
@@ -3543,7 +3553,17 @@ KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp
KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
-
+#endif
+#if OMP_41_ENABLED
+struct kmp_dim { // loop bounds info cast to kmp_int64
+ kmp_int64 lo; // lower
+ kmp_int64 up; // upper
+ kmp_int64 st; // stride
+};
+KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32 num_dims, struct kmp_dim * dims);
+KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
#endif
KMP_EXPORT void*
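For orientation, here is an editor's sketch (not part of the patch) of the call
sequence a compiler could emit for a two-dimensional doacross loop; the loop,
the bounds N and M, and the loc/gtid values are hypothetical:

    /* Sketch: roughly what the compiler emits for
         #pragma omp for ordered(2)
       with ordered depend(sink)/depend(source) in the body. */
    struct kmp_dim dims[2];
    dims[0].lo = 1; dims[0].up = N - 1; dims[0].st = 1; /* inclusive bounds */
    dims[1].lo = 1; dims[1].up = M - 1; dims[1].st = 1;
    __kmpc_doacross_init(&loc, gtid, 2, dims);
    /* for each iteration (i, j) assigned to this thread: */
    kmp_int64 sink_vec[2] = { i - 1, j };   /* depend(sink: i-1, j) */
    __kmpc_doacross_wait(&loc, gtid, sink_vec);
    /* ... work that depends on iteration (i-1, j) ... */
    kmp_int64 src_vec[2] = { i, j };        /* depend(source) */
    __kmpc_doacross_post(&loc, gtid, src_vec);
    /* after the thread finishes all its iterations: */
    __kmpc_doacross_fini(&loc, gtid);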
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c
index 905f596c964..50650aac2ac 100644
--- a/openmp/runtime/src/kmp_csupport.c
+++ b/openmp/runtime/src/kmp_csupport.c
@@ -3049,5 +3049,294 @@ void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
__kmp_place_num_threads_per_core = nT;
}
+#if OMP_41_ENABLED
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param gtid global thread number.
+@param num_dims number of associated doacross loops.
+@param dims info on loop bounds.
+
+Initialize doacross loop information.
+Expect the compiler to send us inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+*/
+void
+__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
+{
+ int j, idx;
+ kmp_int64 last, trace_count;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_uint32 *flags;
+ kmp_disp_t *pr_buf = th->th.th_dispatch;
+ dispatch_shared_info_t *sh_buf;
+
+ KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+ gtid, num_dims, !team->t.t_serialized));
+ KMP_DEBUG_ASSERT(dims != NULL);
+ KMP_DEBUG_ASSERT(num_dims > 0);
+
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+ KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+ idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
+ sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+
+ // Save bounds info into allocated private buffer
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+ pr_buf->th_doacross_info =
+ (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
+ // Also save the address of num_done so it can be accessed later without knowing the buffer index
+ pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+ pr_buf->th_doacross_info[2] = dims[0].lo;
+ pr_buf->th_doacross_info[3] = dims[0].up;
+ pr_buf->th_doacross_info[4] = dims[0].st;
+ last = 5;
+ for( j = 1; j < num_dims; ++j ) {
+ kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
+ if( dims[j].st == 1 ) { // most common case
+ // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
+ range_length = dims[j].up - dims[j].lo + 1;
+ } else {
+ if( dims[j].st > 0 ) {
+ KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
+ range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+ } else { // negative increment
+ KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
+ range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+ }
+ }
+ pr_buf->th_doacross_info[last++] = range_length;
+ pr_buf->th_doacross_info[last++] = dims[j].lo;
+ pr_buf->th_doacross_info[last++] = dims[j].up;
+ pr_buf->th_doacross_info[last++] = dims[j].st;
+ }
+
+ // Compute total trip count.
+ // Start with range of dims[0] which we don't need to keep in the buffer.
+ if( dims[0].st == 1 ) { // most common case
+ trace_count = dims[0].up - dims[0].lo + 1;
+ } else if( dims[0].st > 0 ) {
+ KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
+ trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+ } else { // negative increment
+ KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
+ trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+ }
+ for( j = 1; j < num_dims; ++j ) {
+ trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+ }
+ KMP_DEBUG_ASSERT(trace_count > 0);
+
+ // Check that the shared buffer is not occupied by another loop (idx - KMP_MAX_DISP_BUF)
+ if( idx != sh_buf->doacross_buf_idx ) {
+ // Shared buffer is occupied, wait for it to be free
+ __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
+ }
+ // Check if we are the first thread. After the CAS the first thread gets 0,
+ // others get 1 if initialization is in progress, or the allocated pointer otherwise.
+ flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
+ (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
+ if( flags == NULL ) {
+ // we are the first thread, allocate the array of flags
+ kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+ sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
+ } else if( (kmp_int64)flags == 1 ) {
+ // initialization is still in progress, need to wait
+ while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
+ KMP_YIELD(TRUE);
+ }
+ }
+ KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
+ pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save a private copy so we don't
+ // touch the shared buffer on each iteration
+ KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+void
+__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
+{
+ kmp_int32 shft, num_dims, i;
+ kmp_uint32 flag;
+ kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf;
+ kmp_int64 lo, up, st;
+
+ KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+
+ // calculate sequential iteration number and check out-of-bounds condition
+ pr_buf = th->th.th_dispatch;
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ num_dims = pr_buf->th_doacross_info[0];
+ lo = pr_buf->th_doacross_info[2];
+ up = pr_buf->th_doacross_info[3];
+ st = pr_buf->th_doacross_info[4];
+ if( st == 1 ) { // most common case
+ if( vec[0] < lo || vec[0] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = vec[0] - lo;
+ } else if( st > 0 ) {
+ if( vec[0] < lo || vec[0] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = (kmp_uint64)(vec[0] - lo) / st;
+ } else { // negative increment
+ if( vec[0] > lo || vec[0] < up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+ }
+ for( i = 1; i < num_dims; ++i ) {
+ kmp_int64 iter, ln;
+ kmp_int32 j = i * 4;
+ ln = pr_buf->th_doacross_info[j + 1];
+ lo = pr_buf->th_doacross_info[j + 2];
+ up = pr_buf->th_doacross_info[j + 3];
+ st = pr_buf->th_doacross_info[j + 4];
+ if( st == 1 ) {
+ if( vec[i] < lo || vec[i] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = vec[i] - lo;
+ } else if( st > 0 ) {
+ if( vec[i] < lo || vec[i] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = (kmp_uint64)(vec[i] - lo) / st;
+ } else { // st < 0
+ if( vec[i] > lo || vec[i] < up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = (kmp_uint64)(lo - vec[i]) / (-st);
+ }
+ iter_number = iter + ln * iter_number;
+ }
+ shft = iter_number % 32; // use 32-bit granularity
+ iter_number >>= 5; // divided by 32
+ flag = 1 << shft;
+ while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
+ KMP_YIELD(TRUE);
+ }
+ KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+ gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
+{
+ kmp_int32 shft, num_dims, i;
+ kmp_uint32 flag;
+ kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf;
+ kmp_int64 lo, st;
+
+ KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+
+ // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
+ pr_buf = th->th.th_dispatch;
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ num_dims = pr_buf->th_doacross_info[0];
+ lo = pr_buf->th_doacross_info[2];
+ st = pr_buf->th_doacross_info[4];
+ if( st == 1 ) { // most common case
+ iter_number = vec[0] - lo;
+ } else if( st > 0 ) {
+ iter_number = (kmp_uint64)(vec[0] - lo) / st;
+ } else { // negative increment
+ iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+ }
+ for( i = 1; i < num_dims; ++i ) {
+ kmp_int64 iter, ln;
+ kmp_int32 j = i * 4;
+ ln = pr_buf->th_doacross_info[j + 1];
+ lo = pr_buf->th_doacross_info[j + 2];
+ st = pr_buf->th_doacross_info[j + 4];
+ if( st == 1 ) {
+ iter = vec[i] - lo;
+ } else if( st > 0 ) {
+ iter = (kmp_uint64)(vec[i] - lo) / st;
+ } else { // st < 0
+ iter = (kmp_uint64)(lo - vec[i]) / (-st);
+ }
+ iter_number = iter + ln * iter_number;
+ }
+ shft = iter_number % 32; // use 32-bit granularity
+ iter_number >>= 5; // divided by 32
+ flag = 1 << shft;
+ if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
+ KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
+ KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
+ gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_fini(ident_t *loc, int gtid)
+{
+ kmp_int64 num_done;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+ KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+ return; // nothing to do
+ }
+ num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
+ if( num_done == th->th.th_team_nproc ) {
+ // we are the last thread, need to free shared resources
+ int idx = pr_buf->th_doacross_buf_idx - 1;
+ dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
+ KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
+ KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+ __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
+ sh_buf->doacross_flags = NULL;
+ sh_buf->doacross_num_done = 0;
+ sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use
+ }
+ // free private resources (need to keep buffer index forever)
+ __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
+ pr_buf->th_doacross_info = NULL;
+ KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
+#endif
+
// end of file //
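Editor's note on the wait/post pair above: both fold the iteration vector into a
single iteration number using the per-dimension ranges saved by
__kmpc_doacross_init, then map it to one bit in the shared flag array (32 bits
per word; init allocates trace_count/8 + 8 bytes). A condensed sketch of that
mapping, simplified to unit strides (st == 1 in every dimension):

    /* Sketch mirroring the linearization in wait/post. info[] follows the
       layout built by __kmpc_doacross_init: info[0] = num_dims, info[2] = lo
       of dim 0, and info[4*i+1], info[4*i+2] = range, lo of dim i (i >= 1). */
    static kmp_int64
    doacross_linearize(const kmp_int64 *info, const kmp_int64 *vec)
    {
        kmp_int64 i, iter_number = vec[0] - info[2];   /* dim 0 offset */
        for (i = 1; i < info[0]; ++i)
            iter_number = (vec[i] - info[4*i+2])       /* dim i offset */
                        + info[4*i+1] * iter_number;   /* scaled by range */
        return iter_number;
    }
    /* flag lookup: word = iter_number >> 5; bit = 1 << (iter_number % 32) */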
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 8f1852b392a..23d736a1b5a 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -163,7 +163,7 @@ struct dispatch_shared_infoXX_template {
volatile UT iteration;
volatile UT num_done;
volatile UT ordered_iteration;
- UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
+ UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
};
// replaces dispatch_shared_info structure and dispatch_shared_info_t type
@@ -175,6 +175,11 @@ struct dispatch_shared_info_template {
dispatch_shared_info64_t s64;
} u;
volatile kmp_uint32 buffer_index;
+#if OMP_41_ENABLED
+ volatile kmp_int32 doacross_buf_idx; // teamwise index
+ kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
+ kmp_int32 doacross_num_done; // count finished threads
+#endif
};
/* ------------------------------------------------------------------------ */
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c
index 7a2fa7bac55..7b31eb90e6f 100644
--- a/openmp/runtime/src/kmp_runtime.c
+++ b/openmp/runtime/src/kmp_runtime.c
@@ -3046,8 +3046,12 @@ __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
team->t.t_max_nproc = max_nth;
/* setup dispatch buffers */
- for(i = 0 ; i < num_disp_buff; ++i)
+ for(i = 0 ; i < num_disp_buff; ++i) {
team->t.t_disp_buffer[i].buffer_index = i;
+#if OMP_41_ENABLED
+ team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+ }
}
static void
@@ -4121,7 +4125,9 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid
KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
dispatch->th_disp_index = 0;
-
+#if OMP_41_ENABLED
+ dispatch->th_doacross_buf_idx = 0;
+#endif
if( ! dispatch->th_disp_buffer ) {
dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
@@ -6813,7 +6819,9 @@ __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
//KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
-
+#if OMP_41_ENABLED
+ dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
+#endif
if( __kmp_env_consistency_check )
__kmp_push_parallel( gtid, team->t.t_ident );
@@ -7050,10 +7058,17 @@ __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
if ( team->t.t_max_nproc > 1 ) {
int i;
- for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
+ for (i = 0; i < KMP_MAX_DISP_BUF; ++i) {
team->t.t_disp_buffer[ i ].buffer_index = i;
+#if OMP_41_ENABLED
+ team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+ }
} else {
team->t.t_disp_buffer[ 0 ].buffer_index = 0;
+#if OMP_41_ENABLED
+ team->t.t_disp_buffer[0].doacross_buf_idx = 0;
+#endif
}
KMP_MB(); /* Flush all pending memory write invalidates. */
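Editor's note on the buffer hand-off implemented above: each consecutive
doacross loop L in a team claims shared buffer L % KMP_MAX_DISP_BUF. Threads
arriving at loop L spin in __kmpc_doacross_init until that buffer's
doacross_buf_idx equals L, and the last thread through __kmpc_doacross_fini
advances it by KMP_MAX_DISP_BUF, releasing the buffer to loop
L + KMP_MAX_DISP_BUF. Condensed from the code above:

    idx    = pr_buf->th_doacross_buf_idx++;            /* this loop's number L */
    sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
    if( idx != sh_buf->doacross_buf_idx )              /* still owned by loop  */
        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx,
                            idx, __kmp_eq_4, NULL );   /* L - KMP_MAX_DISP_BUF */
    /* ... loop runs; the last thread in __kmpc_doacross_fini releases it: */
    sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF;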