-rw-r--r--  openmp/runtime/src/kmp_tasking.c | 333
-rw-r--r--  openmp/runtime/test/lit.cfg      |   3
2 files changed, 108 insertions(+), 228 deletions(-)
diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c
index 361fce93bc5..888386a6548 100644
--- a/openmp/runtime/src/kmp_tasking.c
+++ b/openmp/runtime/src/kmp_tasking.c
@@ -1806,17 +1806,17 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
int *thread_finished
USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
{
- kmp_task_team_t * task_team;
+ kmp_task_team_t * task_team = thread->th.th_task_team;
kmp_thread_data_t * threads_data;
kmp_task_t * task;
+ kmp_info_t * other_thread;
kmp_taskdata_t * current_task = thread -> th.th_current_task;
volatile kmp_uint32 * unfinished_threads;
- kmp_int32 nthreads, last_stolen, k, tid;
+ kmp_int32 nthreads, victim=-2, use_own_tasks=1, new_victim=0, tid=thread->th.th_info.ds.ds_tid;
KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );
- task_team = thread -> th.th_task_team;
if (task_team == NULL) return FALSE;
KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
@@ -1834,277 +1834,154 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
#endif
KMP_DEBUG_ASSERT( (int)(TCR_4(*unfinished_threads)) >= 0 );
- // Choose tasks from our own work queue.
- start:
- while (( task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained )) != NULL ) {
-#if USE_ITT_BUILD && USE_ITT_NOTIFY
- if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
- if ( itt_sync_obj == NULL ) {
- // we are at fork barrier where we could not get the object reliably
- itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
+ while (1) { // Outer loop keeps trying to find tasks in case of single thread getting tasks from target constructs
+ while (1) { // Inner loop to find a task and execute it
+ task = NULL;
+ if (use_own_tasks) { // check on own queue first
+ task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained );
}
- __kmp_itt_task_starting( itt_sync_obj );
- }
-#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
- __kmp_invoke_task( gtid, task, current_task );
-#if USE_ITT_BUILD
- if ( itt_sync_obj != NULL )
- __kmp_itt_task_finished( itt_sync_obj );
-#endif /* USE_ITT_BUILD */
-
- // If this thread is only partway through the barrier and the condition
- // is met, then return now, so that the barrier gather/release pattern can proceed.
- // If this thread is in the last spin loop in the barrier, waiting to be
- // released, we know that the termination condition will not be satisified,
- // so don't waste any cycles checking it.
- if (flag == NULL || (!final_spin && flag->done_check())) {
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #1): T#%d spin condition satisfied\n", gtid) );
- return TRUE;
- }
- if (thread->th.th_task_team == NULL) break;
- KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
- }
-
- // This thread's work queue is empty. If we are in the final spin loop
- // of the barrier, check and see if the termination condition is satisfied.
-#if OMP_41_ENABLED
- // The work queue may be empty but there might be proxy tasks still executing
- if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
-#else
- if (final_spin)
-#endif
- {
- // First, decrement the #unfinished threads, if that has not already
- // been done. This decrement might be to the spin location, and
- // result in the termination condition being satisfied.
- if (! *thread_finished) {
- kmp_uint32 count;
-
- count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
- KA_TRACE(20, ("__kmp_execute_tasks_template(dec #1): T#%d dec unfinished_threads to %d task_team=%p\n",
- gtid, count, task_team) );
- *thread_finished = TRUE;
- }
-
- // It is now unsafe to reference thread->th.th_team !!!
- // Decrementing task_team->tt.tt_unfinished_threads can allow the master
- // thread to pass through the barrier, where it might reset each thread's
- // th.th_team field for the next parallel region.
- // If we can steal more work, we know that this has not happened yet.
- if (flag != NULL && flag->done_check()) {
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #2): T#%d spin condition satisfied\n", gtid) );
- return TRUE;
- }
- }
-
- if (thread->th.th_task_team == NULL) return FALSE;
-#if OMP_41_ENABLED
- // check if there are other threads to steal from, otherwise go back
- if ( nthreads == 1 )
- goto start;
-#endif
+ if ((task == NULL) && (nthreads > 1)) { // Steal a task
+ int asleep = 1;
+ use_own_tasks = 0;
+ // Try to steal from the last place I stole from successfully.
+ if (victim == -2) { // haven't stolen anything yet
+ victim = threads_data[tid].td.td_deque_last_stolen;
+ if (victim != -1) // if we have a last stolen from victim, get the thread
+ other_thread = threads_data[victim].td.td_thr;
+ }
+ if (victim != -1) { // found last victim
+ asleep = 0;
+ }
+ else if (!new_victim) { // no recent steals and we haven't already used a new victim; select a random thread
+ do { // Find a different thread to steal work from.
+ // Pick a random thread. Initial plan was to cycle through all the threads, and only return if
+ // we tried to steal from every thread, and failed. Arch says that's not such a great idea.
+ victim = __kmp_get_random(thread) % (nthreads - 1);
+ if (victim >= tid) {
+ ++victim; // Adjusts random distribution to exclude self
+ }
+ // Found a potential victim
+ other_thread = threads_data[victim].td.td_thr;
+ // There is a slight chance that __kmp_enable_tasking() did not wake up all threads
+ // waiting at the barrier. If victim is sleeping, then wake it up. Since we were going to
+ // pay the cache miss penalty for referencing another thread's kmp_info_t struct anyway,
+ // the check shouldn't cost too much performance at this point. In extra barrier mode, tasks
+ // do not sleep at the separate tasking barrier, so this isn't a problem.
+ asleep = 0;
+ if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
+ (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
+ (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
+ asleep = 1;
+ __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
+ // A sleeping thread should not have any tasks on its queue. There is a slight
+ // possibility that it resumes, steals a task from another thread, which spawns more
+ // tasks, all in the time that it takes this thread to check => don't write an assertion
+ // that the victim's queue is empty. Try stealing from a different thread.
+ }
+ } while (asleep);
+ }
- // Try to steal from the last place I stole from successfully.
- tid = thread -> th.th_info.ds.ds_tid;//__kmp_tid_from_gtid( gtid );
- last_stolen = threads_data[ tid ].td.td_deque_last_stolen;
+ if (!asleep) {
+ // We have a victim to try to steal from
+ task = __kmp_steal_task(other_thread, gtid, task_team, unfinished_threads, thread_finished, is_constrained);
+ }
+ if (task != NULL) { // set last stolen to victim
+ if (threads_data[tid].td.td_deque_last_stolen != victim) {
+ threads_data[tid].td.td_deque_last_stolen = victim;
+ // The pre-refactored code did not try more than 1 successful new victim,
+ // unless the last one generated more local tasks; new_victim keeps track of this
+ new_victim = 1;
+ }
+ }
+ else { // No tasks found; unset last_stolen
+ KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
+ victim = -2; // no successful victim found
+ }
+ }
- if (last_stolen != -1) {
- kmp_info_t *other_thread = threads_data[last_stolen].td.td_thr;
+ if (task == NULL) // break out of tasking loop
+ break;
- while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
- thread_finished, is_constrained )) != NULL)
- {
+ // Found a task; execute it
#if USE_ITT_BUILD && USE_ITT_NOTIFY
if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
- if ( itt_sync_obj == NULL ) {
- // we are at fork barrier where we could not get the object reliably
- itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
+ if ( itt_sync_obj == NULL ) { // we are at fork barrier where we could not get the object reliably
+ itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
}
__kmp_itt_task_starting( itt_sync_obj );
}
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
__kmp_invoke_task( gtid, task, current_task );
#if USE_ITT_BUILD
- if ( itt_sync_obj != NULL )
- __kmp_itt_task_finished( itt_sync_obj );
+ if ( itt_sync_obj != NULL ) __kmp_itt_task_finished( itt_sync_obj );
#endif /* USE_ITT_BUILD */
-
- // Check to see if this thread can proceed.
+ // If this thread is only partway through the barrier and the condition is met, then return now,
+ // so that the barrier gather/release pattern can proceed. If this thread is in the last spin loop
+ // in the barrier, waiting to be released, we know that the termination condition will not be
+ // satisfied, so don't waste any cycles checking it.
if (flag == NULL || (!final_spin && flag->done_check())) {
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #3): T#%d spin condition satisfied\n",
- gtid) );
+ KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
return TRUE;
}
-
- if (thread->th.th_task_team == NULL) break;
+ if (thread->th.th_task_team == NULL) {
+ break;
+ }
KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
- // If the execution of the stolen task resulted in more tasks being
- // placed on our run queue, then restart the whole process.
- if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
- KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
- gtid) );
- goto start;
+ // If execution of a stolen task results in more tasks being placed on our run queue, reset use_own_tasks
+ if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
+ KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid));
+ use_own_tasks = 1;
+ new_victim = 0;
}
}
- // Don't give priority to stealing from this thread anymore.
- threads_data[ tid ].td.td_deque_last_stolen = -1;
-
- // The victims's work queue is empty. If we are in the final spin loop
- // of the barrier, check and see if the termination condition is satisfied.
+ // The task source has been exhausted. If in final spin loop of barrier, check if termination condition is satisfied.
#if OMP_41_ENABLED
// The work queue may be empty but there might be proxy tasks still executing
- if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
+ if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
#else
if (final_spin)
#endif
{
- // First, decrement the #unfinished threads, if that has not already
- // been done. This decrement might be to the spin location, and
- // result in the termination condition being satisfied.
+ // First, decrement the #unfinished threads, if that has not already been done. This decrement
+ // might be to the spin location, and result in the termination condition being satisfied.
if (! *thread_finished) {
kmp_uint32 count;
count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
- KA_TRACE(20, ("__kmp_execute_tasks_template(dec #2): T#%d dec unfinished_threads to %d "
- "task_team=%p\n", gtid, count, task_team) );
+ KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec unfinished_threads to %d task_team=%p\n",
+ gtid, count, task_team) );
*thread_finished = TRUE;
}
- // If __kmp_tasking_mode != tskm_immediate_exec
- // then it is now unsafe to reference thread->th.th_team !!!
- // Decrementing task_team->tt.tt_unfinished_threads can allow the master
- // thread to pass through the barrier, where it might reset each thread's
- // th.th_team field for the next parallel region.
+ // It is now unsafe to reference thread->th.th_team !!!
+ // Decrementing task_team->tt.tt_unfinished_threads can allow the master thread to pass through
+ // the barrier, where it might reset each thread's th.th_team field for the next parallel region.
// If we can steal more work, we know that this has not happened yet.
if (flag != NULL && flag->done_check()) {
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #4): T#%d spin condition satisfied\n",
- gtid) );
+ KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
return TRUE;
}
}
- if (thread->th.th_task_team == NULL) return FALSE;
- }
-
- // Find a different thread to steal work from. Pick a random thread.
- // My initial plan was to cycle through all the threads, and only return
- // if we tried to steal from every thread, and failed. Arch says that's
- // not such a great idea.
- // GEH - need yield code in this loop for throughput library mode?
- new_victim:
- k = __kmp_get_random( thread ) % (nthreads - 1);
- if ( k >= thread -> th.th_info.ds.ds_tid ) {
- ++k; // Adjusts random distribution to exclude self
- }
- {
- kmp_info_t *other_thread = threads_data[k].td.td_thr;
- int first;
-
- // There is a slight chance that __kmp_enable_tasking() did not wake up
- // all threads waiting at the barrier. If this thread is sleeping, then
- // wake it up. Since we were going to pay the cache miss penalty
- // for referencing another thread's kmp_info_t struct anyway, the check
- // shouldn't cost too much performance at this point.
- // In extra barrier mode, tasks do not sleep at the separate tasking
- // barrier, so this isn't a problem.
- if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
- (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
- (TCR_PTR(other_thread->th.th_sleep_loc) != NULL))
- {
- __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
- // A sleeping thread should not have any tasks on it's queue.
- // There is a slight possibility that it resumes, steals a task from
- // another thread, which spawns more tasks, all in the time that it takes
- // this thread to check => don't write an assertion that the victim's
- // queue is empty. Try stealing from a different thread.
- goto new_victim;
- }
-
- // Now try to steal work from the selected thread
- first = TRUE;
- while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
- thread_finished, is_constrained )) != NULL)
- {
-#if USE_ITT_BUILD && USE_ITT_NOTIFY
- if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
- if ( itt_sync_obj == NULL ) {
- // we are at fork barrier where we could not get the object reliably
- itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
- }
- __kmp_itt_task_starting( itt_sync_obj );
- }
-#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
- __kmp_invoke_task( gtid, task, current_task );
-#if USE_ITT_BUILD
- if ( itt_sync_obj != NULL )
- __kmp_itt_task_finished( itt_sync_obj );
-#endif /* USE_ITT_BUILD */
-
- // Try stealing from this victim again, in the future.
- if (first) {
- threads_data[ tid ].td.td_deque_last_stolen = k;
- first = FALSE;
- }
-
- // Check to see if this thread can proceed.
- if (flag == NULL || (!final_spin && flag->done_check())) {
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #5): T#%d spin condition satisfied\n",
- gtid) );
- return TRUE;
- }
- if (thread->th.th_task_team == NULL) break;
- KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
- // If the execution of the stolen task resulted in more tasks being
- // placed on our run queue, then restart the whole process.
- if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
- KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
- gtid) );
- goto start;
- }
+ // If this thread's task team is NULL, master has recognized that there are no more tasks; bail out
+ if (thread->th.th_task_team == NULL) {
+ KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid) );
+ return FALSE;
}
- // The victims's work queue is empty. If we are in the final spin loop
- // of the barrier, check and see if the termination condition is satisfied.
- // Going on and finding a new victim to steal from is expensive, as it
- // involves a lot of cache misses, so we definitely want to re-check the
- // termination condition before doing that.
#if OMP_41_ENABLED
- // The work queue may be empty but there might be proxy tasks still executing
- if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
-#else
- if (final_spin)
+ // We could be getting tasks from target constructs; if this is the only thread, keep trying to execute
+ // tasks from own queue
+ if (nthreads == 1)
+ use_own_tasks = 1;
+ else
#endif
{
- // First, decrement the #unfinished threads, if that has not already
- // been done. This decrement might be to the spin location, and
- // result in the termination condition being satisfied.
- if (! *thread_finished) {
- kmp_uint32 count;
-
- count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
- KA_TRACE(20, ("__kmp_execute_tasks_template(dec #3): T#%d dec unfinished_threads to %d; "
- "task_team=%p\n",
- gtid, count, task_team) );
- *thread_finished = TRUE;
- }
-
- // If __kmp_tasking_mode != tskm_immediate_exec,
- // then it is now unsafe to reference thread->th.th_team !!!
- // Decrementing task_team->tt.tt_unfinished_threads can allow the master
- // thread to pass through the barrier, where it might reset each thread's
- // th.th_team field for the next parallel region.
- // If we can steal more work, we know that this has not happened yet.
- if (flag != NULL && flag->done_check()) {
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #6): T#%d spin condition satisfied\n", gtid) );
- return TRUE;
- }
+ KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid) );
+ return FALSE;
}
- if (thread->th.th_task_team == NULL) return FALSE;
}
-
- KA_TRACE(15, ("__kmp_execute_tasks_template(exit #7): T#%d can't find work\n", gtid) );
- return FALSE;
}
int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
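
The net effect of the kmp_tasking.c hunk above is that the old goto start / goto new_victim restarts are replaced by two nested while(1) loops steered by the use_own_tasks, new_victim and victim flags. The toy program below is a rough, standalone sketch of that control flow only; every name in it (pop_task, deque, NTHREADS) is invented for illustration and is not part of the runtime's real API, and it omits the last-stolen caching and the sleeping-victim wakeup handled in the patch.

#include <stdio.h>
#include <stdlib.h>

#define NTHREADS 4

static int deque[NTHREADS];                    /* toy per-thread task counts */

static int pop_task(int *q) { if (*q > 0) { --*q; return 1; } return 0; }

static int execute_tasks(int tid) {
    int use_own_tasks = 1, new_victim = 0, victim = -2, executed = 0;
    while (1) {                                /* outer loop: keep looking for new work */
        while (1) {                            /* inner loop: find one task and run it */
            int task = 0;
            if (use_own_tasks)                 /* check the thread's own queue first */
                task = pop_task(&deque[tid]);
            if (!task && NTHREADS > 1) {       /* otherwise try to steal from a victim */
                use_own_tasks = 0;
                if (victim < 0 && !new_victim) {
                    victim = rand() % (NTHREADS - 1);
                    if (victim >= tid)
                        ++victim;              /* adjust the distribution to exclude self */
                }
                if (victim >= 0) {
                    task = pop_task(&deque[victim]);
                    if (task)
                        new_victim = 1;        /* remember one successful new victim */
                    else
                        victim = -2;           /* victim is empty; forget it */
                }
            }
            if (!task)
                break;                         /* nothing to run: leave the inner loop */
            ++executed;                        /* stand-in for __kmp_invoke_task() */
            if (!use_own_tasks && deque[tid] != 0) {
                use_own_tasks = 1;             /* a stolen task spawned local work */
                new_victim = 0;
            }
        }
        /* The real routine re-checks the barrier termination condition here and, for a
         * single thread that may still receive target tasks, loops again; the toy stops. */
        break;
    }
    return executed;
}

int main(void) {
    for (int t = 0; t < NTHREADS; ++t)
        deque[t] = t + 1;                      /* seed some fake tasks */
    printf("executed %d tasks\n", execute_tasks(0));
    return 0;
}

With the seed queues above, thread 0 drains its own deque first, then steals from a single randomly chosen victim until that victim is empty, mirroring the one-successful-new-victim policy described in the patch's comments.
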
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index 7f18da4b1b7..64e3b618d2b 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -26,6 +26,9 @@ def append_dynamic_library_path(path):
else:
config.environment[name] = path
+for name,value in os.environ.items():
+ config.environment[name] = value
+
# name: The name of this test suite.
config.name = 'libomp'
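
The lit.cfg hunk above copies every variable from the invoking shell's environment into config.environment, so the test processes inherit the full host environment rather than only the entries the config sets up itself. As a minimal sketch of the same pass-the-whole-environment-through idea, here is a C analogue using the POSIX environ array (not part of the patch; it only prints the inherited entries):

#include <stdio.h>

extern char **environ;        /* POSIX: NULL-terminated array of "NAME=value" strings */

int main(void) {
    /* Walk the inherited environment and hand each entry on unchanged, the way
     * the lit.cfg loop copies os.environ items into config.environment. */
    for (char **e = environ; *e != NULL; ++e)
        printf("%s\n", *e);
    return 0;
}
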