-rw-r--r--   openmp/runtime/src/dllexports         4
-rw-r--r--   openmp/runtime/src/kmp.h             26
-rw-r--r--   openmp/runtime/src/kmp_csupport.c   289
-rw-r--r--   openmp/runtime/src/kmp_dispatch.cpp   7
-rw-r--r--   openmp/runtime/src/kmp_runtime.c     23
5 files changed, 341 insertions(+), 8 deletions(-)
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index a8a70c0a364..6ff52521e50 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -389,6 +389,10 @@ kmpc_set_defaults 224
%ifdef OMP_41
__kmpc_proxy_task_completed 259
__kmpc_proxy_task_completed_ooo 260
+ __kmpc_doacross_init 261
+ __kmpc_doacross_wait 262
+ __kmpc_doacross_post 263
+ __kmpc_doacross_fini 264
%endif
%endif
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 71bb323159f..140bdd8caef 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1665,7 +1665,7 @@ typedef struct dispatch_shared_info64 {
volatile kmp_uint64 iteration;
volatile kmp_uint64 num_done;
volatile kmp_uint64 ordered_iteration;
- kmp_int64 ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+ kmp_int64 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
} dispatch_shared_info64_t;
typedef struct dispatch_shared_info {
@@ -1673,8 +1673,12 @@ typedef struct dispatch_shared_info {
dispatch_shared_info32_t s32;
dispatch_shared_info64_t s64;
} u;
-/* volatile kmp_int32 dispatch_abort; depricated */
volatile kmp_uint32 buffer_index;
+#if OMP_41_ENABLED
+ volatile kmp_int32 doacross_buf_idx; // teamwise index
+ volatile kmp_uint32 *doacross_flags; // shared array of iteration flags (0/1)
+ kmp_int32 doacross_num_done; // count finished threads
+#endif
} dispatch_shared_info_t;
typedef struct kmp_disp {
@@ -1688,7 +1692,13 @@ typedef struct kmp_disp {
dispatch_private_info_t *th_disp_buffer;
kmp_int32 th_disp_index;
+#if OMP_41_ENABLED
+ kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
+ volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
+ kmp_int64 *th_doacross_info; // info on loop bounds
+#else
void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
+#endif
#if KMP_USE_INTERNODE_ALIGNMENT
char more_padding[INTERNODE_CACHE_LINE];
#endif
@@ -3543,7 +3553,17 @@ KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp
KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
-
+#endif
+#if OMP_41_ENABLED
+struct kmp_dim { // loop bounds info cast to kmp_int64
+ kmp_int64 lo; // lower
+ kmp_int64 up; // upper
+ kmp_int64 st; // stride
+};
+KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32 num_dims, struct kmp_dim * dims);
+KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
#endif
KMP_EXPORT void*
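For orientation, here is an editor's sketch (not part of the patch) of the call
sequence a compiler could emit for a two-dimensional doacross loop; the loop,
the bounds N and M, and the loc/gtid values are hypothetical:

    /* Sketch: roughly what the compiler emits for
         #pragma omp for ordered(2)
       with ordered depend(sink)/depend(source) in the body. */
    struct kmp_dim dims[2];
    dims[0].lo = 1; dims[0].up = N - 1; dims[0].st = 1; /* inclusive bounds */
    dims[1].lo = 1; dims[1].up = M - 1; dims[1].st = 1;
    __kmpc_doacross_init(&loc, gtid, 2, dims);
    /* for each iteration (i, j) assigned to this thread: */
    kmp_int64 sink_vec[2] = { i - 1, j };   /* depend(sink: i-1, j) */
    __kmpc_doacross_wait(&loc, gtid, sink_vec);
    /* ... work that depends on iteration (i-1, j) ... */
    kmp_int64 src_vec[2] = { i, j };        /* depend(source) */
    __kmpc_doacross_post(&loc, gtid, src_vec);
    /* after the thread finishes all its iterations: */
    __kmpc_doacross_fini(&loc, gtid);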
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c
index 905f596c964..50650aac2ac 100644
--- a/openmp/runtime/src/kmp_csupport.c
+++ b/openmp/runtime/src/kmp_csupport.c
@@ -3049,5 +3049,294 @@ void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
__kmp_place_num_threads_per_core = nT;
}
+#if OMP_41_ENABLED
+/*!
+@ingroup WORK_SHARING
+@param loc source location information.
+@param gtid global thread number.
+@param num_dims number of associated doacross loops.
+@param dims info on loop bounds.
+
+Initialize doacross loop information.
+Expect the compiler to send us inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+*/
+void
+__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
+{
+ int j, idx;
+ kmp_int64 last, trace_count;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_uint32 *flags;
+ kmp_disp_t *pr_buf = th->th.th_dispatch;
+ dispatch_shared_info_t *sh_buf;
+
+ KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+ gtid, num_dims, !team->t.t_serialized));
+ KMP_DEBUG_ASSERT(dims != NULL);
+ KMP_DEBUG_ASSERT(num_dims > 0);
+
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+ KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+ idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
+ sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+
+ // Save bounds info into allocated private buffer
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+ pr_buf->th_doacross_info =
+ (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
+ // Also save the address of num_done so it can be accessed later without knowing the buffer index
+ pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+ pr_buf->th_doacross_info[2] = dims[0].lo;
+ pr_buf->th_doacross_info[3] = dims[0].up;
+ pr_buf->th_doacross_info[4] = dims[0].st;
+ last = 5;
+ for( j = 1; j < num_dims; ++j ) {
+ kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
+ if( dims[j].st == 1 ) { // most common case
+ // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
+ range_length = dims[j].up - dims[j].lo + 1;
+ } else {
+ if( dims[j].st > 0 ) {
+ KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
+ range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+ } else { // negative increment
+ KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
+ range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+ }
+ }
+ pr_buf->th_doacross_info[last++] = range_length;
+ pr_buf->th_doacross_info[last++] = dims[j].lo;
+ pr_buf->th_doacross_info[last++] = dims[j].up;
+ pr_buf->th_doacross_info[last++] = dims[j].st;
+ }
+
+ // Compute total trip count.
+ // Start with range of dims[0] which we don't need to keep in the buffer.
+ if( dims[0].st == 1 ) { // most common case
+ trace_count = dims[0].up - dims[0].lo + 1;
+ } else if( dims[0].st > 0 ) {
+ KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
+ trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+ } else { // negative increment
+ KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
+ trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+ }
+ for( j = 1; j < num_dims; ++j ) {
+ trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+ }
+ KMP_DEBUG_ASSERT(trace_count > 0);
+
+ // Check that the shared buffer is not occupied by another loop (idx - KMP_MAX_DISP_BUF)
+ if( idx != sh_buf->doacross_buf_idx ) {
+ // Shared buffer is occupied, wait for it to be free
+ __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
+ }
+ // Check if we are the first thread. After the CAS the first thread gets 0,
+ // others get 1 if initialization is in progress, or the allocated pointer otherwise.
+ flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
+ (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
+ if( flags == NULL ) {
+ // we are the first thread, allocate the array of flags
+ kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+ sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
+ } else if( (kmp_int64)flags == 1 ) {
+ // initialization is still in progress, need to wait
+ while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
+ KMP_YIELD(TRUE);
+ }
+ }
+ KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
+ pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save a private copy so we don't
+ // touch the shared buffer on each iteration
+ KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+void
+__kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
+{
+ kmp_int32 shft, num_dims, i;
+ kmp_uint32 flag;
+ kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf;
+ kmp_int64 lo, up, st;
+
+ KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+
+ // calculate sequential iteration number and check out-of-bounds condition
+ pr_buf = th->th.th_dispatch;
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ num_dims = pr_buf->th_doacross_info[0];
+ lo = pr_buf->th_doacross_info[2];
+ up = pr_buf->th_doacross_info[3];
+ st = pr_buf->th_doacross_info[4];
+ if( st == 1 ) { // most common case
+ if( vec[0] < lo || vec[0] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = vec[0] - lo;
+ } else if( st > 0 ) {
+ if( vec[0] < lo || vec[0] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = (kmp_uint64)(vec[0] - lo) / st;
+ } else { // negative increment
+ if( vec[0] > lo || vec[0] < up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[0], lo, up));
+ return;
+ }
+ iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+ }
+ for( i = 1; i < num_dims; ++i ) {
+ kmp_int64 iter, ln;
+ kmp_int32 j = i * 4;
+ ln = pr_buf->th_doacross_info[j + 1];
+ lo = pr_buf->th_doacross_info[j + 2];
+ up = pr_buf->th_doacross_info[j + 3];
+ st = pr_buf->th_doacross_info[j + 4];
+ if( st == 1 ) {
+ if( vec[i] < lo || vec[i] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = vec[i] - lo;
+ } else if( st > 0 ) {
+ if( vec[i] < lo || vec[i] > up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = (kmp_uint64)(vec[i] - lo) / st;
+ } else { // st < 0
+ if( vec[i] > lo || vec[i] < up ) {
+ KA_TRACE(20,(
+ "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+ gtid, vec[i], lo, up));
+ return;
+ }
+ iter = (kmp_uint64)(lo - vec[i]) / (-st);
+ }
+ iter_number = iter + ln * iter_number;
+ }
+ shft = iter_number % 32; // use 32-bit granularity
+ iter_number >>= 5; // divided by 32
+ flag = 1 << shft;
+ while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
+ KMP_YIELD(TRUE);
+ }
+ KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+ gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
+{
+ kmp_int32 shft, num_dims, i;
+ kmp_uint32 flag;
+ kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf;
+ kmp_int64 lo, st;
+
+ KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
+ return; // no dependencies if team is serialized
+ }
+
+ // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
+ pr_buf = th->th.th_dispatch;
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+ num_dims = pr_buf->th_doacross_info[0];
+ lo = pr_buf->th_doacross_info[2];
+ st = pr_buf->th_doacross_info[4];
+ if( st == 1 ) { // most common case
+ iter_number = vec[0] - lo;
+ } else if( st > 0 ) {
+ iter_number = (kmp_uint64)(vec[0] - lo) / st;
+ } else { // negative increment
+ iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+ }
+ for( i = 1; i < num_dims; ++i ) {
+ kmp_int64 iter, ln;
+ kmp_int32 j = i * 4;
+ ln = pr_buf->th_doacross_info[j + 1];
+ lo = pr_buf->th_doacross_info[j + 2];
+ st = pr_buf->th_doacross_info[j + 4];
+ if( st == 1 ) {
+ iter = vec[i] - lo;
+ } else if( st > 0 ) {
+ iter = (kmp_uint64)(vec[i] - lo) / st;
+ } else { // st < 0
+ iter = (kmp_uint64)(lo - vec[i]) / (-st);
+ }
+ iter_number = iter + ln * iter_number;
+ }
+ shft = iter_number % 32; // use 32-bit granularity
+ iter_number >>= 5; // divided by 32
+ flag = 1 << shft;
+ if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
+ KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
+ KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
+ gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_fini(ident_t *loc, int gtid)
+{
+ kmp_int64 num_done;
+ kmp_info_t *th = __kmp_threads[gtid];
+ kmp_team_t *team = th->th.th_team;
+ kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+ KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+ if( team->t.t_serialized ) {
+ KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+ return; // nothing to do
+ }
+ num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
+ if( num_done == th->th.th_team_nproc ) {
+ // we are the last thread, need to free shared resources
+ int idx = pr_buf->th_doacross_buf_idx - 1;
+ dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+ KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
+ KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
+ KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+ __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
+ sh_buf->doacross_flags = NULL;
+ sh_buf->doacross_num_done = 0;
+ sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use
+ }
+ // free private resources (need to keep buffer index forever)
+ __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
+ pr_buf->th_doacross_info = NULL;
+ KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
+#endif
+
// end of file //
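Editor's note on the wait/post pair above: both fold the iteration vector into a
single iteration number using the per-dimension ranges saved by
__kmpc_doacross_init, then map it to one bit in the shared flag array (32 bits
per word; init allocates trace_count/8 + 8 bytes). A condensed sketch of that
mapping, simplified to unit strides (st == 1 in every dimension):

    /* Sketch mirroring the linearization in wait/post. info[] follows the
       layout built by __kmpc_doacross_init: info[0] = num_dims, info[2] = lo
       of dim 0, and info[4*i+1], info[4*i+2] = range, lo of dim i (i >= 1). */
    static kmp_int64
    doacross_linearize(const kmp_int64 *info, const kmp_int64 *vec)
    {
        kmp_int64 i, iter_number = vec[0] - info[2];   /* dim 0 offset */
        for (i = 1; i < info[0]; ++i)
            iter_number = (vec[i] - info[4*i+2])       /* dim i offset */
                        + info[4*i+1] * iter_number;   /* scaled by range */
        return iter_number;
    }
    /* flag lookup: word = iter_number >> 5; bit = 1 << (iter_number % 32) */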
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 8f1852b392a..23d736a1b5a 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -163,7 +163,7 @@ struct dispatch_shared_infoXX_template {
volatile UT iteration;
volatile UT num_done;
volatile UT ordered_iteration;
- UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
+ UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
};
// replaces dispatch_shared_info structure and dispatch_shared_info_t type
@@ -175,6 +175,11 @@ struct dispatch_shared_info_template {
dispatch_shared_info64_t s64;
} u;
volatile kmp_uint32 buffer_index;
+#if OMP_41_ENABLED
+ volatile kmp_int32 doacross_buf_idx; // teamwise index
+ kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
+ kmp_int32 doacross_num_done; // count finished threads
+#endif
};
/* ------------------------------------------------------------------------ */
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c
index 7a2fa7bac55..7b31eb90e6f 100644
--- a/openmp/runtime/src/kmp_runtime.c
+++ b/openmp/runtime/src/kmp_runtime.c
@@ -3046,8 +3046,12 @@ __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
team->t.t_max_nproc = max_nth;
/* setup dispatch buffers */
- for(i = 0 ; i < num_disp_buff; ++i)
+ for(i = 0 ; i < num_disp_buff; ++i) {
team->t.t_disp_buffer[i].buffer_index = i;
+#if OMP_41_ENABLED
+ team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+ }
}
static void
@@ -4121,7 +4125,9 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid
KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
dispatch->th_disp_index = 0;
-
+#if OMP_41_ENABLED
+ dispatch->th_doacross_buf_idx = 0;
+#endif
if( ! dispatch->th_disp_buffer ) {
dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
@@ -6813,7 +6819,9 @@ __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
//KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
-
+#if OMP_41_ENABLED
+ dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
+#endif
if( __kmp_env_consistency_check )
__kmp_push_parallel( gtid, team->t.t_ident );
@@ -7050,10 +7058,17 @@ __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
if ( team->t.t_max_nproc > 1 ) {
int i;
- for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
+ for (i = 0; i < KMP_MAX_DISP_BUF; ++i) {
team->t.t_disp_buffer[ i ].buffer_index = i;
+#if OMP_41_ENABLED
+ team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+ }
} else {
team->t.t_disp_buffer[ 0 ].buffer_index = 0;
+#if OMP_41_ENABLED
+ team->t.t_disp_buffer[0].doacross_buf_idx = 0;
+#endif
}
KMP_MB(); /* Flush all pending memory write invalidates. */
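Editor's note on the buffer hand-off implemented above: each consecutive
doacross loop L in a team claims shared buffer L % KMP_MAX_DISP_BUF. Threads
arriving at loop L spin in __kmpc_doacross_init until that buffer's
doacross_buf_idx equals L, and the last thread through __kmpc_doacross_fini
advances it by KMP_MAX_DISP_BUF, releasing the buffer to loop
L + KMP_MAX_DISP_BUF. Condensed from the code above:

    idx    = pr_buf->th_doacross_buf_idx++;            /* this loop's number L */
    sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
    if( idx != sh_buf->doacross_buf_idx )              /* still owned by loop  */
        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx,
                            idx, __kmp_eq_4, NULL );   /* L - KMP_MAX_DISP_BUF */
    /* ... loop runs; the last thread in __kmpc_doacross_fini releases it: */
    sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF;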