diff options
author | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2016-03-02 22:42:06 +0000 |
---|---|---|
committer | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2016-03-02 22:42:06 +0000 |
commit | 71909c57ca96f50978a2a6b9507f47a8d99326a7 (patch) | |
tree | 2644822b6fb6dd2bca865a552a3850862e1235da /openmp/runtime/src/kmp_csupport.c | |
parent | 02e1132afb7a9f5ddcd4fdcbbbc7dabe0d098297 (diff) | |
download | bcm5719-llvm-71909c57ca96f50978a2a6b9507f47a8d99326a7.tar.gz bcm5719-llvm-71909c57ca96f50978a2a6b9507f47a8d99326a7.zip |
Add new OpenMP 4.5 doacross loop nest feature
From the standard: A doacross loop nest is a loop nest that has cross-iteration
dependence. An iteration is dependent on one or more lexicographically earlier
iterations. The ordered clause parameter on a loop directive identifies the
loop(s) associated with the doacross loop nest.
The init/fini routines allocate/free doacross buffer(s) for each loop for each
thread. The wait routine waits for a flag designated by the dependence vector.
The post routine sets the flag designated by current iteration vector. We use
a similar technique of shared buffer indices that covers up to 7 nowait loops
executed simultaneously by different threads (number 7 has no real meaning,
just heuristic value). Also, the size of structures are kept intact via
reducing dummy arrays.
This needs to be put into the OpenMP runtime library in order for the compiler
team to develop the compiler side of the implementation.
Differential Revision: http://reviews.llvm.org/D17399
llvm-svn: 262532
Diffstat (limited to 'openmp/runtime/src/kmp_csupport.c')
-rw-r--r-- | openmp/runtime/src/kmp_csupport.c | 289 |
1 files changed, 289 insertions, 0 deletions
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c index 905f596c964..50650aac2ac 100644 --- a/openmp/runtime/src/kmp_csupport.c +++ b/openmp/runtime/src/kmp_csupport.c @@ -3049,5 +3049,294 @@ void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT) __kmp_place_num_threads_per_core = nT; } +#if OMP_41_ENABLED +/*! +@ingroup WORK_SHARING +@param loc source location information. +@param gtid global thread number. +@param num_dims number of associated doacross loops. +@param dims info on loops bounds. + +Initialize doacross loop information. +Expect compiler send us inclusive bounds, +e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2. +*/ +void +__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims) +{ + int j, idx; + kmp_int64 last, trace_count; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_uint32 *flags; + kmp_disp_t *pr_buf = th->th.th_dispatch; + dispatch_shared_info_t *sh_buf; + + KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n", + gtid, num_dims, !team->t.t_serialized)); + KMP_DEBUG_ASSERT(dims != NULL); + KMP_DEBUG_ASSERT(num_dims > 0); + + if( team->t.t_serialized ) { + KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n")); + return; // no dependencies if team is serialized + } + KMP_DEBUG_ASSERT(team->t.t_nproc > 1); + idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop + sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF]; + + // Save bounds info into allocated private buffer + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL); + pr_buf->th_doacross_info = + (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1)); + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); + pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions + // Save also address of num_done in order to access it later without knowing the buffer index + pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done; + pr_buf->th_doacross_info[2] = dims[0].lo; + pr_buf->th_doacross_info[3] = dims[0].up; + pr_buf->th_doacross_info[4] = dims[0].st; + last = 5; + for( j = 1; j < num_dims; ++j ) { + kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0] + if( dims[j].st == 1 ) { // most common case + // AC: should we care of ranges bigger than LLONG_MAX? (not for now) + range_length = dims[j].up - dims[j].lo + 1; + } else { + if( dims[j].st > 0 ) { + KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo); + range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1; + } else { // negative increment + KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up); + range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1; + } + } + pr_buf->th_doacross_info[last++] = range_length; + pr_buf->th_doacross_info[last++] = dims[j].lo; + pr_buf->th_doacross_info[last++] = dims[j].up; + pr_buf->th_doacross_info[last++] = dims[j].st; + } + + // Compute total trip count. + // Start with range of dims[0] which we don't need to keep in the buffer. + if( dims[0].st == 1 ) { // most common case + trace_count = dims[0].up - dims[0].lo + 1; + } else if( dims[0].st > 0 ) { + KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo); + trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1; + } else { // negative increment + KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up); + trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1; + } + for( j = 1; j < num_dims; ++j ) { + trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges + } + KMP_DEBUG_ASSERT(trace_count > 0); + + // Check if shared buffer is not occupied by other loop (idx - KMP_MAX_DISP_BUF) + if( idx != sh_buf->doacross_buf_idx ) { + // Shared buffer is occupied, wait for it to be free + __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL ); + } + // Check if we are the first thread. After the CAS the first thread gets 0, + // others get 1 if initialization is in progress, allocated pointer otherwise. + flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64( + (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1); + if( flags == NULL ) { + // we are the first thread, allocate the array of flags + kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration + sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1); + } else if( (kmp_int64)flags == 1 ) { + // initialization is still in progress, need to wait + while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) { + KMP_YIELD(TRUE); + } + } + KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer + pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save private copy in order to not + // touch shared buffer on each iteration + KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid)); +} + +void +__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec) +{ + kmp_int32 shft, num_dims, i; + kmp_uint32 flag; + kmp_int64 iter_number; // iteration number of "collapsed" loop nest + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_disp_t *pr_buf; + kmp_int64 lo, up, st; + + KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid)); + if( team->t.t_serialized ) { + KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n")); + return; // no dependencies if team is serialized + } + + // calculate sequential iteration number and check out-of-bounds condition + pr_buf = th->th.th_dispatch; + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); + num_dims = pr_buf->th_doacross_info[0]; + lo = pr_buf->th_doacross_info[2]; + up = pr_buf->th_doacross_info[3]; + st = pr_buf->th_doacross_info[4]; + if( st == 1 ) { // most common case + if( vec[0] < lo || vec[0] > up ) { + KA_TRACE(20,( + "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", + gtid, vec[0], lo, up)); + return; + } + iter_number = vec[0] - lo; + } else if( st > 0 ) { + if( vec[0] < lo || vec[0] > up ) { + KA_TRACE(20,( + "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", + gtid, vec[0], lo, up)); + return; + } + iter_number = (kmp_uint64)(vec[0] - lo) / st; + } else { // negative increment + if( vec[0] > lo || vec[0] < up ) { + KA_TRACE(20,( + "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", + gtid, vec[0], lo, up)); + return; + } + iter_number = (kmp_uint64)(lo - vec[0]) / (-st); + } + for( i = 1; i < num_dims; ++i ) { + kmp_int64 iter, ln; + kmp_int32 j = i * 4; + ln = pr_buf->th_doacross_info[j + 1]; + lo = pr_buf->th_doacross_info[j + 2]; + up = pr_buf->th_doacross_info[j + 3]; + st = pr_buf->th_doacross_info[j + 4]; + if( st == 1 ) { + if( vec[i] < lo || vec[i] > up ) { + KA_TRACE(20,( + "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", + gtid, vec[i], lo, up)); + return; + } + iter = vec[i] - lo; + } else if( st > 0 ) { + if( vec[i] < lo || vec[i] > up ) { + KA_TRACE(20,( + "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", + gtid, vec[i], lo, up)); + return; + } + iter = (kmp_uint64)(vec[i] - lo) / st; + } else { // st < 0 + if( vec[i] > lo || vec[i] < up ) { + KA_TRACE(20,( + "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n", + gtid, vec[i], lo, up)); + return; + } + iter = (kmp_uint64)(lo - vec[i]) / (-st); + } + iter_number = iter + ln * iter_number; + } + shft = iter_number % 32; // use 32-bit granularity + iter_number >>= 5; // divided by 32 + flag = 1 << shft; + while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) { + KMP_YIELD(TRUE); + } + KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n", + gtid, (iter_number<<5)+shft)); +} + +void +__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec) +{ + kmp_int32 shft, num_dims, i; + kmp_uint32 flag; + kmp_int64 iter_number; // iteration number of "collapsed" loop nest + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_disp_t *pr_buf; + kmp_int64 lo, st; + + KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid)); + if( team->t.t_serialized ) { + KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n")); + return; // no dependencies if team is serialized + } + + // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks) + pr_buf = th->th.th_dispatch; + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL); + num_dims = pr_buf->th_doacross_info[0]; + lo = pr_buf->th_doacross_info[2]; + st = pr_buf->th_doacross_info[4]; + if( st == 1 ) { // most common case + iter_number = vec[0] - lo; + } else if( st > 0 ) { + iter_number = (kmp_uint64)(vec[0] - lo) / st; + } else { // negative increment + iter_number = (kmp_uint64)(lo - vec[0]) / (-st); + } + for( i = 1; i < num_dims; ++i ) { + kmp_int64 iter, ln; + kmp_int32 j = i * 4; + ln = pr_buf->th_doacross_info[j + 1]; + lo = pr_buf->th_doacross_info[j + 2]; + st = pr_buf->th_doacross_info[j + 4]; + if( st == 1 ) { + iter = vec[i] - lo; + } else if( st > 0 ) { + iter = (kmp_uint64)(vec[i] - lo) / st; + } else { // st < 0 + iter = (kmp_uint64)(lo - vec[i]) / (-st); + } + iter_number = iter + ln * iter_number; + } + shft = iter_number % 32; // use 32-bit granularity + iter_number >>= 5; // divided by 32 + flag = 1 << shft; + if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) + KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag ); + KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n", + gtid, (iter_number<<5)+shft)); +} + +void +__kmpc_doacross_fini(ident_t *loc, int gtid) +{ + kmp_int64 num_done; + kmp_info_t *th = __kmp_threads[gtid]; + kmp_team_t *team = th->th.th_team; + kmp_disp_t *pr_buf = th->th.th_dispatch; + + KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid)); + if( team->t.t_serialized ) { + KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team)); + return; // nothing to do + } + num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1; + if( num_done == th->th.th_team_nproc ) { + // we are the last thread, need to free shared resources + int idx = pr_buf->th_doacross_buf_idx - 1; + dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF]; + KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done); + KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done); + KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx); + __kmp_thread_free(th, (void*)sh_buf->doacross_flags); + sh_buf->doacross_flags = NULL; + sh_buf->doacross_num_done = 0; + sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use + } + // free private resources (need to keep buffer index forever) + __kmp_thread_free(th, (void*)pr_buf->th_doacross_info); + pr_buf->th_doacross_info = NULL; + KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid)); +} +#endif + // end of file // |