Add new OpenMP 4.5 doacross loop nest feature

From the standard: A doacross loop nest is a loop nest that has cross-iteration dependence. An iteration is dependent on one or more lexicographically earlier iterations. The ordered clause parameter on a loop directive identifies the loop(s) associated with the doacross loop nest. The init/fini routines allocate/free the doacross buffer(s) for each loop for each thread. The wait routine waits for the flag designated by the dependence vector. The post routine sets the flag designated by the current iteration vector. We use a similar technique of shared buffer indices that covers up to 7 nowait loops executed simultaneously by different threads (the number 7 has no real meaning; it is just a heuristic value). Also, the sizes of the structures are kept intact by shrinking the dummy arrays. This needs to be put into the OpenMP runtime library in order for the compiler team to develop the compiler side of the implementation. Differential Revision: http://reviews.llvm.org/D17399 llvm-svn: 262532
author: Jonathan Peyton <jonathan.l.peyton@intel.com> 2016-03-02 22:42:06 +0000
committer: Jonathan Peyton <jonathan.l.peyton@intel.com> 2016-03-02 22:42:06 +0000
commit: 71909c57ca96f50978a2a6b9507f47a8d99326a7 (patch)
tree: 2644822b6fb6dd2bca865a552a3850862e1235da /openmp/runtime/src/kmp_csupport.c
parent: 02e1132afb7a9f5ddcd4fdcbbbc7dabe0d098297 (diff)
download: bcm5719-llvm-71909c57ca96f50978a2a6b9507f47a8d99326a7.tar.gz
bcm5719-llvm-71909c57ca96f50978a2a6b9507f47a8d99326a7.zip
1 files changed, 289 insertions, 0 deletions
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c
index 905f596c964..50650aac2ac 100644
--- a/openmp/runtime/src/kmp_csupport.c
+++ b/openmp/runtime/src/kmp_csupport.c
@@ -3049,5 +3049,294 @@ void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
     __kmp_place_num_threads_per_core = nT;
 }
 
+#if OMP_41_ENABLED
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid  global thread number.
+@param num_dims  number of associated doacross loops.
+@param dims  info on loops bounds.
+
+Initialize doacross loop information.
+The compiler is expected to send us inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+
+Layout of the private buffer th_doacross_info (kmp_int64 elements):
+  [0]               number of dimensions
+  [1]               address of the shared doacross_num_done counter
+  [2], [3], [4]     lo, up, st of dims[0]
+  [4*j+1 .. 4*j+4]  range length, lo, up, st of dims[j] for j >= 1
+
+The routine returns immediately if the team is serialized.
+*/
+void
+__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
+{
+    int j, idx;
+    kmp_int64 last, trace_count;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_uint32 *flags;
+    kmp_disp_t *pr_buf = th->th.th_dispatch;
+    dispatch_shared_info_t *sh_buf;
+
+    KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+                 gtid, num_dims, !team->t.t_serialized));
+    KMP_DEBUG_ASSERT(dims != NULL);
+    KMP_DEBUG_ASSERT(num_dims > 0);
+
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+    idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
+    sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+
+    // Save bounds info into allocated private buffer
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+    pr_buf->th_doacross_info =
+        (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
+    // Save also address of num_done in order to access it later without knowing the buffer index
+    pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+    pr_buf->th_doacross_info[2] = dims[0].lo;
+    pr_buf->th_doacross_info[3] = dims[0].up;
+    pr_buf->th_doacross_info[4] = dims[0].st;
+    last = 5;
+    for( j = 1; j < num_dims; ++j ) {
+        kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
+        if( dims[j].st == 1 ) { // most common case
+            // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
+            range_length = dims[j].up - dims[j].lo + 1;
+        } else {
+            if( dims[j].st > 0 ) {
+                // lo == up is a valid single-iteration loop, so do not reject it
+                KMP_DEBUG_ASSERT(dims[j].up >= dims[j].lo);
+                range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+            } else {            // negative increment
+                KMP_DEBUG_ASSERT(dims[j].lo >= dims[j].up);
+                range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+            }
+        }
+        pr_buf->th_doacross_info[last++] = range_length;
+        pr_buf->th_doacross_info[last++] = dims[j].lo;
+        pr_buf->th_doacross_info[last++] = dims[j].up;
+        pr_buf->th_doacross_info[last++] = dims[j].st;
+    }
+
+    // Compute total trip count.
+    // Start with range of dims[0] which we don't need to keep in the buffer.
+    if( dims[0].st == 1 ) { // most common case
+        trace_count = dims[0].up - dims[0].lo + 1;
+    } else if( dims[0].st > 0 ) {
+        KMP_DEBUG_ASSERT(dims[0].up >= dims[0].lo);
+        trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+    } else {   // negative increment
+        KMP_DEBUG_ASSERT(dims[0].lo >= dims[0].up);
+        trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+    }
+    for( j = 1; j < num_dims; ++j ) {
+        trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+    }
+    KMP_DEBUG_ASSERT(trace_count > 0);
+
+    // Check if shared buffer is not occupied by other loop (idx - KMP_MAX_DISP_BUF)
+    if( idx != sh_buf->doacross_buf_idx ) {
+        // Shared buffer is occupied, wait for it to be free
+        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
+    }
+    // Check if we are the first thread. After the CAS the first thread gets 0,
+    // others get 1 if initialization is in progress, allocated pointer otherwise.
+    flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
+        (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
+    if( flags == NULL ) {
+        // we are the first thread, allocate the array of flags
+        kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+        sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
+    } else if( (kmp_int64)flags == 1 ) {
+        // initialization is still in progress, need to wait.
+        // Read through a volatile lvalue so that the field is re-loaded on every
+        // pass; casting the already loaded value to volatile would not force that.
+        while( *(volatile kmp_int64 *)&sh_buf->doacross_flags == 1 ) {
+            KMP_YIELD(TRUE);
+        }
+    }
+    KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
+    pr_buf->th_doacross_flags = sh_buf->doacross_flags;      // save private copy in order to not
+                                                             // touch shared buffer on each iteration
+    KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information (not referenced by this routine).
+@param gtid  global thread number.
+@param vec  iteration vector of the dependence, one element per doacross dimension.
+
+Wait until the flag of the iteration designated by vec is set.
+The vector is linearized into the iteration number of the "collapsed" loop nest
+using the bounds saved in th_doacross_info; the flag of that iteration is one
+bit in the th_doacross_flags array. The routine returns without waiting if the
+team is serialized or if any component of vec lies outside the saved bounds.
+*/
+void
+__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
+{
+    kmp_int32 shft, num_dims, i;
+    kmp_uint32 flag;
+    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf;
+    kmp_int64 lo, up, st;
+
+    KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+
+    // calculate sequential iteration number and check out-of-bounds condition
+    pr_buf = th->th.th_dispatch;
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    num_dims = pr_buf->th_doacross_info[0];
+    lo = pr_buf->th_doacross_info[2];
+    up = pr_buf->th_doacross_info[3];
+    st = pr_buf->th_doacross_info[4];
+    // positive stride: valid range is [lo,up]; otherwise the range is [up,lo]
+    if( st > 0 ? ( vec[0] < lo || vec[0] > up ) : ( vec[0] > lo || vec[0] < up ) ) {
+        KA_TRACE(20,(
+            "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+            gtid, vec[0], lo, up));
+        return;
+    }
+    if( st == 1 ) { // most common case
+        iter_number = vec[0] - lo;
+    } else if( st > 0 ) {
+        iter_number = (kmp_uint64)(vec[0] - lo) / st;
+    } else {        // negative increment
+        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+    }
+    for( i = 1; i < num_dims; ++i ) {
+        kmp_int64 iter, ln;
+        kmp_int32 j = i * 4; // dimension i occupies elements j+1 .. j+4
+        ln = pr_buf->th_doacross_info[j + 1];
+        lo = pr_buf->th_doacross_info[j + 2];
+        up = pr_buf->th_doacross_info[j + 3];
+        st = pr_buf->th_doacross_info[j + 4];
+        if( st > 0 ? ( vec[i] < lo || vec[i] > up ) : ( vec[i] > lo || vec[i] < up ) ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[i], lo, up));
+            return;
+        }
+        if( st == 1 ) {
+            iter = vec[i] - lo;
+        } else if( st > 0 ) {
+            iter = (kmp_uint64)(vec[i] - lo) / st;
+        } else {   // st < 0
+            iter = (kmp_uint64)(lo - vec[i]) / (-st);
+        }
+        iter_number = iter + ln * iter_number;
+    }
+    shft = iter_number % 32; // use 32-bit granularity
+    iter_number >>= 5;       // divided by 32
+    flag = (kmp_uint32)1 << shft; // unsigned shift: a signed 1 << 31 is undefined
+    while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
+        KMP_YIELD(TRUE);
+    }
+    KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+                 gtid, (iter_number<<5)+shft));
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information (not referenced by this routine).
+@param gtid  global thread number.
+@param vec  iteration vector of the current iteration, one element per doacross dimension.
+
+Set the flag of the iteration designated by vec.
+The vector is linearized the same way as in __kmpc_doacross_wait(), but no
+out-of-bounds checks are made. The routine does nothing if the team is serialized.
+*/
+void
+__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
+{
+    kmp_int32 shft, num_dims, i;
+    kmp_uint32 flag;
+    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf;
+    kmp_int64 lo, st;
+
+    KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+
+    // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
+    pr_buf = th->th.th_dispatch;
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    num_dims = pr_buf->th_doacross_info[0];
+    lo = pr_buf->th_doacross_info[2];
+    st = pr_buf->th_doacross_info[4];
+    if( st == 1 ) { // most common case
+        iter_number = vec[0] - lo;
+    } else if( st > 0 ) {
+        iter_number = (kmp_uint64)(vec[0] - lo) / st;
+    } else {        // negative increment
+        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+    }
+    for( i = 1; i < num_dims; ++i ) {
+        kmp_int64 iter, ln;
+        kmp_int32 j = i * 4; // dimension i occupies elements j+1 .. j+4
+        ln = pr_buf->th_doacross_info[j + 1];
+        lo = pr_buf->th_doacross_info[j + 2];
+        st = pr_buf->th_doacross_info[j + 4];
+        if( st == 1 ) {
+            iter = vec[i] - lo;
+        } else if( st > 0 ) {
+            iter = (kmp_uint64)(vec[i] - lo) / st;
+        } else {   // st < 0
+            iter = (kmp_uint64)(lo - vec[i]) / (-st);
+        }
+        iter_number = iter + ln * iter_number;
+    }
+    shft = iter_number % 32; // use 32-bit granularity
+    iter_number >>= 5;       // divided by 32
+    flag = (kmp_uint32)1 << shft; // unsigned shift: a signed 1 << 31 is undefined
+    // skip the atomic OR if the bit is already set
+    if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
+        KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
+    KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
+                 gtid, (iter_number<<5)+shft));
+}
+
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information (not referenced by this routine).
+@param gtid  global thread number.
+
+Finalize doacross loop information.
+Each thread increments the shared doacross_num_done counter through the address
+saved in th_doacross_info[1]. The thread that brings the counter up to
+th_team_nproc frees the shared array of flags, resets the counter and advances
+doacross_buf_idx of the shared buffer by KMP_MAX_DISP_BUF. Every thread then
+frees its private th_doacross_info buffer. The routine does nothing if the
+team is serialized.
+*/
+void
+__kmpc_doacross_fini(ident_t *loc, int gtid)
+{
+    kmp_int64 num_done;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+    KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+        return; // nothing to do
+    }
+    // the atomic increment returns the old value, hence the + 1
+    num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
+    if( num_done == th->th.th_team_nproc ) {
+        // we are the last thread, need to free shared resources
+        int idx = pr_buf->th_doacross_buf_idx - 1;
+        dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+        KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
+        KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
+        KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+        __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
+        sh_buf->doacross_flags = NULL;
+        sh_buf->doacross_num_done = 0;
+        sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use
+    }
+    // free private resources (need to keep buffer index forever)
+    pr_buf->th_doacross_flags = NULL; // the shared array is freed by the last thread,
+                                      // do not keep a stale private copy of its address
+    __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
+    pr_buf->th_doacross_info = NULL;
+    KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
+#endif
+
 // end of file //
author	Jonathan Peyton <jonathan.l.peyton@intel.com>	2016-03-02 22:42:06 +0000
committer	Jonathan Peyton <jonathan.l.peyton@intel.com>	2016-03-02 22:42:06 +0000
commit	71909c57ca96f50978a2a6b9507f47a8d99326a7 (patch)
tree	2644822b6fb6dd2bca865a552a3850862e1235da /openmp/runtime/src/kmp_csupport.c
parent	02e1132afb7a9f5ddcd4fdcbbbc7dabe0d098297 (diff)
download	bcm5719-llvm-71909c57ca96f50978a2a6b9507f47a8d99326a7.tar.gz bcm5719-llvm-71909c57ca96f50978a2a6b9507f47a8d99326a7.zip