| field | value | date |
|---|---|---|
| author | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2016-05-23 18:01:19 +0000 |
| committer | Jonathan Peyton <jonathan.l.peyton@intel.com> | 2016-05-23 18:01:19 +0000 |
| commit | b044e4fa31bde25b145f197360022e2994b328b2 (patch) | |
| tree | 215931d6458dcfa2dd652e3295f60ba42a6167c4 | |
| parent | 13a0d4981374db4252d0062645d3cbb21da289ec (diff) | |
| download | bcm5719-llvm-b044e4fa31bde25b145f197360022e2994b328b2.tar.gz, bcm5719-llvm-b044e4fa31bde25b145f197360022e2994b328b2.zip | |
Fork performance improvements
Most of this consists of modifications that check for differences before updating
data fields in the team struct. There is also some rearrangement of the team struct.
Patch by Diego Caballero
Differential Revision: http://reviews.llvm.org/D20487
llvm-svn: 270468
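For context, the heart of the change is the pair of KMP_CHECK_UPDATE macros added to kmp.h in the diff below: a team field is only stored to when the new value actually differs, so re-forking a hot team with unchanged parameters does not dirty cache lines that worker threads may still be reading. Below is a minimal, self-contained sketch of that pattern; the `kmp_team_stub` struct and the demo values are hypothetical and only illustrate the idea, not the real runtime.

```c
/* Sketch of the check-before-update pattern from this patch.
 * The macro mirrors the one added to kmp.h; the struct is a stand-in. */
#include <stdio.h>
#include <stddef.h>

/* Only write the field when the value actually changes, so a re-fork with
 * identical parameters leaves the cache line clean and other threads'
 * cached copies are not invalidated. */
#define KMP_CHECK_UPDATE(a, b) if ((a) != (b)) (a) = (b)

typedef struct kmp_team_stub {
    int   t_master_tid;   /* master thread id within the parent team */
    void *t_parent;       /* parent team pointer */
} kmp_team_stub;

int main(void) {
    kmp_team_stub team = { .t_master_tid = 0, .t_parent = NULL };

    /* Same value as before: the comparison fails, no store is issued. */
    KMP_CHECK_UPDATE(team.t_master_tid, 0);

    /* A genuinely new value is still written as usual. */
    KMP_CHECK_UPDATE(team.t_master_tid, 3);

    printf("t_master_tid = %d\n", team.t_master_tid); /* prints 3 */
    return 0;
}
```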
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | openmp/CREDITS.txt | 4 |
| -rw-r--r-- | openmp/runtime/src/kmp.h | 14 |
| -rw-r--r-- | openmp/runtime/src/kmp_runtime.c | 64 |
3 files changed, 51 insertions, 31 deletions
```diff
diff --git a/openmp/CREDITS.txt b/openmp/CREDITS.txt
index 4556ddeaed7..a8ab67ab18e 100644
--- a/openmp/CREDITS.txt
+++ b/openmp/CREDITS.txt
@@ -51,3 +51,7 @@ D: Making build work for FreeBSD.
 
 N: Cheng Wang
 D: Contributor to testsuite from OpenUH
+
+N: Diego Caballero
+E: diego.l.caballero@gmail.com
+D: Fork performance improvements
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 48602168f6f..bac516c9e05 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1951,6 +1951,9 @@ typedef struct kmp_local {
 } kmp_local_t;
 
+#define KMP_CHECK_UPDATE(a, b) if ((a) != (b)) (a) = (b)
+#define KMP_CHECK_UPDATE_SYNC(a, b) if ((a) != (b)) TCW_SYNC_PTR((a), (b))
+
 #define get__blocktime( xteam, xtid )    ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime)
 #define get__bt_set( xteam, xtid )       ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set)
 #define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals)
@@ -2196,7 +2199,7 @@ struct kmp_taskdata { /* aligned during dynamic
     kmp_uint32              td_taskwait_counter;
     kmp_int32               td_taskwait_thread;    /* gtid + 1 of thread encountered taskwait */
     KMP_ALIGN_CACHE kmp_internal_control_t  td_icvs;  /* Internal control variables for the task */
-    volatile kmp_uint32     td_allocated_child_tasks;  /* Child tasks (+ current task) not yet deallocated */
+    KMP_ALIGN_CACHE volatile kmp_uint32 td_allocated_child_tasks;  /* Child tasks (+ current task) not yet deallocated */
     volatile kmp_uint32     td_incomplete_child_tasks; /* Child tasks not yet complete */
 #if OMP_40_ENABLED
     kmp_taskgroup_t       * td_taskgroup;   // Each task keeps pointer to its current taskgroup
@@ -2515,12 +2518,14 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
     void              *t_inline_argv[ KMP_INLINE_ARGV_ENTRIES ];
 
     KMP_ALIGN_CACHE kmp_info_t **t_threads;
-    int                t_max_argc;
+    kmp_taskdata_t    *t_implicit_task_taskdata;  // Taskdata for the thread's implicit task
+    int                t_level;         // nested parallel level
+
+    KMP_ALIGN_CACHE int t_max_argc;
     int                t_max_nproc;     // maximum threads this team can handle (dynamicly expandable)
     int                t_serialized;    // levels deep of serialized teams
     dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system
     int                t_id;            // team's id, assigned by debugger.
-    int                t_level;         // nested parallel level
     int                t_active_level;  // nested active parallel level
     kmp_r_sched_t      t_sched;         // run-time schedule for the team
 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
@@ -2536,8 +2541,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
     // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid if this padding.
     char               dummy_padding[1024];
 #endif
-    KMP_ALIGN_CACHE kmp_taskdata_t *t_implicit_task_taskdata; // Taskdata for the thread's implicit task
-    kmp_internal_control_t *t_control_stack_top;  // internal control stack for additional nested teams.
+    KMP_ALIGN_CACHE kmp_internal_control_t *t_control_stack_top;  // internal control stack for additional nested teams.
                                                   // for SERIALIZED teams nested 2 or more levels deep
 #if OMP_40_ENABLED
     kmp_int32          t_cancel_request;  // typed flag to store request state of cancellation
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c
index 6b0115f4ee6..56fa1b01542 100644
--- a/openmp/runtime/src/kmp_runtime.c
+++ b/openmp/runtime/src/kmp_runtime.c
@@ -2003,32 +2003,38 @@ __kmp_fork_call(
     KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
 
     /* setup the new team */
-    team->t.t_master_tid       = master_tid;
-    team->t.t_master_this_cons = master_this_cons;
-    team->t.t_ident            = loc;
-    team->t.t_parent           = parent_team;
-    TCW_SYNC_PTR(team->t.t_pkfn, microtask);
+    KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
+    KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
+    KMP_CHECK_UPDATE(team->t.t_ident, loc);
+    KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
+    KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
 #if OMPT_SUPPORT
-    TCW_SYNC_PTR(team->t.ompt_team_info.microtask, unwrapped_task);
+    KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
 #endif
-    team->t.t_invoke           = invoker;  /* TODO move this to root, maybe */
+    KMP_CHECK_UPDATE(team->t.t_invoke, invoker); /* TODO move this to root, maybe */
     // TODO: parent_team->t.t_level == INT_MAX ???
 #if OMP_40_ENABLED
     if ( !master_th->th.th_teams_microtask || level > teams_level ) {
 #endif /* OMP_40_ENABLED */
-        team->t.t_level        = parent_team->t.t_level + 1;
-        team->t.t_active_level = parent_team->t.t_active_level + 1;
+        int new_level = parent_team->t.t_level + 1;
+        KMP_CHECK_UPDATE(team->t.t_level, new_level);
+        new_level = parent_team->t.t_active_level + 1;
+        KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
 #if OMP_40_ENABLED
     } else {
         // AC: Do not increase parallel level at start of the teams construct
-        team->t.t_level        = parent_team->t.t_level;
-        team->t.t_active_level = parent_team->t.t_active_level;
+        int new_level = parent_team->t.t_level;
+        KMP_CHECK_UPDATE(team->t.t_level, new_level);
+        new_level = parent_team->t.t_active_level;
+        KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
     }
 #endif /* OMP_40_ENABLED */
 
-    team->t.t_sched            = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule
+    kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
+    if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || new_sched.chunk != new_sched.chunk)
+        team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
 #if OMP_40_ENABLED
-    team->t.t_cancel_request = cancel_noreq;
+    KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
 #endif
 
     // Update the floating point rounding in the team if required.
@@ -2095,23 +2101,27 @@ __kmp_fork_call(
 #if OMP_40_ENABLED
         if ( ap ) {
 #endif /* OMP_40_ENABLED */
-            for ( i=argc-1; i >= 0; --i )
+            for ( i=argc-1; i >= 0; --i ) {
 // TODO: revert workaround for Intel(R) 64 tracker #96
 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
-                *argv++ = va_arg( *ap, void * );
+                void *new_argv = va_arg(*ap, void *);
 #else
-                *argv++ = va_arg( ap, void * );
+                void *new_argv = va_arg(ap, void *);
 #endif
+                KMP_CHECK_UPDATE(*argv, new_argv);
+                argv++;
+            }
 #if OMP_40_ENABLED
         } else {
-            for ( i=0; i < argc; ++i )
+            for ( i=0; i < argc; ++i ) {
                 // Get args from parent team for teams construct
-                argv[i] = team->t.t_parent->t.t_argv[i];
+                KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
+            }
         }
 #endif /* OMP_40_ENABLED */
 
         /* now actually fork the threads */
-        team->t.t_master_active = master_active;
+        KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
         if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
             root->r.r_active = TRUE;
@@ -4320,9 +4330,9 @@ __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ide
                    team->t.t_threads[0], team ) );
     KMP_DEBUG_ASSERT( team && new_icvs);
     KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
-    team->t.t_ident = loc;
+    KMP_CHECK_UPDATE(team->t.t_ident, loc);
 
-    team->t.t_id = KMP_GEN_TEAM_ID();
+    KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
 
     // Copy ICVs to the master thread's implicit taskdata
     __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
@@ -4774,11 +4784,13 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
         if ( team->t.t_size_changed == -1 ) {
             team->t.t_size_changed = 1;
         } else {
-            team->t.t_size_changed = 0;
+            KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
         }
 
         // TODO???: team->t.t_max_active_levels = new_max_active_levels;
-        team->t.t_sched = new_icvs->sched;
+        kmp_r_sched_t new_sched = new_icvs->sched;
+        if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || new_sched.chunk != new_sched.chunk)
+            team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
 
         __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
@@ -4795,7 +4807,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
                           team->t.t_last_place ) );
         }
         else {
-            team->t.t_proc_bind = new_proc_bind;
+            KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
             __kmp_partition_places( team );
         }
 # else
@@ -5016,7 +5028,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
 
     /* reallocate space for arguments if necessary */
     __kmp_alloc_argv_entries( argc, team, TRUE );
-    team->t.t_argc = argc;
+    KMP_CHECK_UPDATE(team->t.t_argc, argc);
     //
     // The hot team re-uses the previous task team,
    // if untouched during the previous release->gather phase.
@@ -5059,7 +5071,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
 
     /* reallocate space for arguments if necessary */
     __kmp_alloc_argv_entries( argc, team, TRUE );
-    team->t.t_argc = argc;
+    KMP_CHECK_UPDATE(team->t.t_argc, argc);
 
     KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
                    team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
```
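The kmp.h hunk above also regroups kmp_base_team fields so that values rewritten together at fork time (t_threads, t_implicit_task_taskdata, t_level) share one cache-line-aligned block, while sizing fields such as t_max_argc move behind their own KMP_ALIGN_CACHE boundary. Below is a rough sketch of that layout idea using C11 `_Alignas`; the struct, field names, and the 64-byte line size are assumptions for illustration, not the real kmp_base_team definition.

```c
/* Sketch: group hot, fork-time fields on one cache line and push
 * rarely-written sizing fields onto a separate line. */
#include <stdio.h>
#include <stddef.h>

#define CACHE_LINE 64                      /* assumed line size */
#define ALIGN_CACHE _Alignas(CACHE_LINE)   /* C11 alignment specifier */

typedef struct team_stub {
    /* line 1: fields updated on (almost) every fork */
    ALIGN_CACHE void *threads;             /* thread array */
    void  *implicit_task_taskdata;         /* per-team implicit task data */
    int    level;                          /* nested parallel level */

    /* line 2: sizing fields that rarely change for a hot team */
    ALIGN_CACHE int max_argc;
    int    max_nproc;
} team_stub;

int main(void) {
    /* max_argc starts a new 64-byte line, so writes to the hot group
     * do not invalidate the line holding the sizing fields. */
    printf("sizeof(team_stub)     = %zu\n", sizeof(team_stub));
    printf("offsetof(max_argc)    = %zu\n", offsetof(team_stub, max_argc));
    return 0;
}
```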

