Diffstat (limited to 'openmp/runtime/src/kmp_runtime.c')
-rw-r--r-- | openmp/runtime/src/kmp_runtime.c | 707
1 file changed, 257 insertions(+), 450 deletions(-)
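The main behavioral change in this patch is how fork/join and barrier frames are reported to ITT: instead of writing rows to a CSV file, the runtime now calls __kmp_itt_frame_submit() directly from __kmp_barrier() and __kmp_join_barrier(), choosing the frame start time from __kmp_forkjoin_frames_mode (1 = fork-to-barrier frames keyed on th_frame_time, 2 = barrier-imbalance frames keyed on th_bar_arrive_time, 3 = both). The standalone C sketch below models only that mode dispatch; the stub names (get_timestamp, frame_submit, report_barrier_frames), the sample timestamp values, and the main() driver are illustrative stand-ins, not part of the runtime.

/* Simplified, standalone model of the frame-reporting dispatch added in
 * __kmp_barrier()/__kmp_join_barrier().  The timestamp/submit functions are
 * stubs; only the mode-selection logic mirrors the patch. */
#include <stdio.h>
#include <stdint.h>

static int      forkjoin_frames_mode = 3;   /* stands in for __kmp_forkjoin_frames_mode */
static uint64_t frame_time           = 100; /* stands in for th.th_frame_time (region begin) */
static uint64_t bar_arrive_time      = 180; /* stands in for th.th_bar_arrive_time (min arrive) */

static uint64_t get_timestamp(void) { return 250; }  /* stands in for __itt_get_timestamp() */

/* stands in for __kmp_itt_frame_submit( gtid, begin, end, imbalance, loc ) */
static void frame_submit(uint64_t begin, uint64_t end, int imbalance) {
    printf("frame [%llu, %llu] imbalance=%d\n",
           (unsigned long long)begin, (unsigned long long)end, imbalance);
}

static void report_barrier_frames(void) {
    uint64_t now = get_timestamp();
    switch (forkjoin_frames_mode) {
    case 1:                                  /* fork-to-barrier frame only */
        frame_submit(frame_time, now, 0);
        frame_time = now;
        break;
    case 2:                                  /* barrier-imbalance frame only */
        frame_submit(bar_arrive_time, now, 1);
        break;
    case 3:                                  /* both kinds of frames */
        frame_submit(frame_time, now, 0);
        frame_submit(bar_arrive_time, now, 1);
        frame_time = now;
        break;
    }
}

int main(void) {
    report_barrier_frames();
    return 0;
}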
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c index 7d66b9bc304..37c372bd89e 100644 --- a/openmp/runtime/src/kmp_runtime.c +++ b/openmp/runtime/src/kmp_runtime.c @@ -1,7 +1,7 @@ /* * kmp_runtime.c -- KPTS runtime support library - * $Revision: 42642 $ - * $Date: 2013-09-06 01:57:24 -0500 (Fri, 06 Sep 2013) $ + * $Revision: 42839 $ + * $Date: 2013-11-24 13:01:00 -0600 (Sun, 24 Nov 2013) $ */ @@ -88,6 +88,8 @@ char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: " #endif /* KMP_DEBUG */ +#define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) ) + /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ @@ -472,8 +474,7 @@ __kmp_wait_sleep( kmp_info_t *this_thr, __kmp_unref_task_team( task_team, this_thr ); } else if ( KMP_TASKING_ENABLED( task_team, this_thr->th.th_task_state ) ) { __kmp_execute_tasks( this_thr, th_gtid, spin, check, final_spin, &flag - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); + USE_ITT_BUILD_ARG( itt_sync_obj ), 0); } }; // if }; // if @@ -994,7 +995,7 @@ DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) { } # endif /* KMP_OS_WINDOWS */ -#endif /* GUIDEDLL_EXPORTS +#endif /* GUIDEDLL_EXPORTS */ /* ------------------------------------------------------------------------ */ @@ -1190,10 +1191,8 @@ __kmp_linear_barrier_gather( enum barrier_type bt, register kmp_balign_team_t *team_bar = & team -> t.t_bar[ bt ]; register int nproc = this_thr -> th.th_team_nproc; register int i; - register kmp_uint new_state; - /* Don't have to worry about sleep bit here or atomic since team setting */ - new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP; + register kmp_uint new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP; /* Collect all the worker team member threads. 
*/ for (i = 1; i < nproc; i++) { @@ -1341,7 +1340,7 @@ __kmp_tree_barrier_gather( enum barrier_type bt, /* Need to update the team arrived pointer if we are the master thread */ if ( nproc > 1 ) - /* New value was already computed in above loop */ + /* New value was already computed above */ team -> t.t_bar[ bt ].b_arrived = new_state; else team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP; @@ -1380,6 +1379,12 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) { + this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); + } +#endif /* * We now perform a hypercube-embedded tree gather to wait until all * of the threads have arrived, and reduce any required data @@ -1417,6 +1422,9 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, /* parent threads wait for children to arrive */ + if (new_state == KMP_BARRIER_UNUSED_STATE) + new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP; + for ( child = 1, child_tid = tid + (1 << level); child < branch_factor && child_tid < num_threads; child++, child_tid += (1 << level) ) @@ -1429,10 +1437,6 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, if ( child+1 < branch_factor && next_child_tid < num_threads ) KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ] -> th.th_bar[ bt ].bb.b_arrived ); #endif /* KMP_CACHE_MANAGE */ - /* Only read this arrived flag once per thread that needs it */ - if (new_state == KMP_BARRIER_UNUSED_STATE) - new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP; - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " "arrived(%p) == %u\n", gtid, team->t.t_id, tid, @@ -1444,6 +1448,12 @@ __kmp_hyper_barrier_gather( enum barrier_type bt, USE_ITT_BUILD_ARG (itt_sync_obj) ); +#if USE_ITT_BUILD + // Barrier imbalance - write min of the thread time and a child time to the thread. + if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) { + this_thr->th.th_bar_arrive_time = KMP_MIN( this_thr->th.th_bar_arrive_time, child_thr->th.th_bar_arrive_time ); + } +#endif if (reduce) { KA_TRACE( 100, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", @@ -1729,7 +1739,6 @@ __kmp_tree_barrier_release( enum barrier_type bt, /* The reverse versions seem to beat the forward versions overall */ #define KMP_REVERSE_HYPER_BAR -#ifdef KMP_REVERSE_HYPER_BAR static void __kmp_hyper_barrier_release( enum barrier_type bt, kmp_info_t *this_thr, @@ -1751,15 +1760,13 @@ __kmp_hyper_barrier_release( enum barrier_type bt, register kmp_uint32 offset; register kmp_uint32 level; - /* - * We now perform a hypercube-embedded tree release for all - * of the threads that have been gathered, but in the exact - * reverse order from the corresponding gather (for load balance. - */ + /* Perform a hypercube-embedded tree release for all of the threads + that have been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) + the threads are released in the reverse order of the corresponding gather, + otherwise threads are released in the same order. */ if ( ! 
KMP_MASTER_TID( tid )) { /* worker threads */ - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); @@ -1807,7 +1814,7 @@ __kmp_hyper_barrier_release( enum barrier_type bt, TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -1822,6 +1829,7 @@ __kmp_hyper_barrier_release( enum barrier_type bt, num_threads = this_thr -> th.th_team_nproc; other_threads = team -> t.t_threads; +#ifdef KMP_REVERSE_HYPER_BAR /* count up to correct level for parent */ for ( level = 0, offset = 1; offset < num_threads && (((tid >> level) & (branch_factor-1)) == 0); @@ -1831,7 +1839,14 @@ __kmp_hyper_barrier_release( enum barrier_type bt, for ( level -= branch_bits, offset >>= branch_bits; offset != 0; level -= branch_bits, offset >>= branch_bits ) +#else + /* Go down the tree, level by level */ + for ( level = 0, offset = 1; + offset < num_threads; + level += branch_bits, offset <<= branch_bits ) +#endif // KMP_REVERSE_HYPER_BAR { +#ifdef KMP_REVERSE_HYPER_BAR /* Now go in reverse order through the children, highest to lowest. Initial setting of child is conservative here. */ child = num_threads >> ((level==0)?level:level-1); @@ -1839,8 +1854,18 @@ __kmp_hyper_barrier_release( enum barrier_type bt, child_tid = tid + (child << level); child >= 1; child--, child_tid -= (1 << level) ) - { +#else + if (((tid >> level) & (branch_factor - 1)) != 0) + /* No need to go any lower than this, since this is the level + parent would be notified */ + break; + /* iterate through children on this level of the tree */ + for ( child = 1, child_tid = tid + (1 << level); + child < branch_factor && child_tid < num_threads; + child++, child_tid += (1 << level) ) +#endif // KMP_REVERSE_HYPER_BAR + { if ( child_tid >= num_threads ) continue; /* child doesn't exist so keep going */ else { register kmp_info_t *child_thr = other_threads[ child_tid ]; @@ -1848,7 +1873,11 @@ __kmp_hyper_barrier_release( enum barrier_type bt, #if KMP_CACHE_MANAGE register kmp_uint32 next_child_tid = child_tid - (1 << level); /* prefetch next thread's go count */ +#ifdef KMP_REVERSE_HYPER_BAR if ( child-1 >= 1 && next_child_tid < num_threads ) +#else + if ( child+1 < branch_factor && next_child_tid < num_threads ) +#endif // KMP_REVERSE_HYPER_BAR KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go ); #endif /* KMP_CACHE_MANAGE */ @@ -1880,154 +1909,6 @@ __kmp_hyper_barrier_release( enum barrier_type bt, gtid, team->t.t_id, tid, bt ) ); } -#else /* !KMP_REVERSE_HYPER_BAR */ - -static void -__kmp_hyper_barrier_release( enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, int propagate_icvs ) -{ - /* handle fork barrier workers who aren't part of a team yet */ - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads; - register kmp_uint32 num_threads; - register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 child; - register kmp_uint32 child_tid; - register kmp_uint32 offset; - register kmp_uint32 level; - - /* - * We now perform a hypercube-embedded tree release for all - * of the threads that have been 
gathered, but in the same order - * as the gather. - */ - - if ( ! KMP_MASTER_TID( tid )) { - /* worker threads */ - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); - - /* wait for parent thread to release us */ - __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE, NULL ); - -#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 ); - // cancel wait on previous parallel region... - __kmp_itt_task_starting( itt_sync_obj ); - - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - if ( itt_sync_obj != NULL ) - __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier - - } else -#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */ - // - // early exit for reaping threads releasing forkjoin barrier - // - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - // - // The worker thread may now assume that the team is valid. - // -#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY - // libguide only code (cannot use *itt_task* routines) - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls... - } -#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - tid = __kmp_tid_from_gtid( gtid ); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid, - &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); - - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - - } else { /* KMP_MASTER_TID(tid) */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - } - - /* Now set up team parameters since workers have been released */ - if ( team == NULL ) { - /* handle fork barrier workers who are now part of a team */ - tid = __kmp_tid_from_gtid( gtid ); - team = __kmp_threads[ gtid ]-> th.th_team; - } - num_threads = this_thr -> th.th_team_nproc; - other_threads = team -> t.t_threads; - - /* Go down the tree, level by level */ - for ( level = 0, offset = 1; - offset < num_threads; - level += branch_bits, offset <<= branch_bits ) - { - if (((tid >> level) & (branch_factor - 1)) != 0) - /* No need to go any lower than this, since this is the level - parent would be notified */ - break; - - /* iterate through children on this level of the tree */ - for ( child = 1, child_tid = tid + (1 << level); - child < branch_factor && child_tid < num_threads; - child++, child_tid += (1 << level) ) - { - register kmp_info_t *child_thr = other_threads[ child_tid ]; - register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb; -#if KMP_CACHE_MANAGE - { - register kmp_uint32 next_child_tid = child_tid + (1 << level); - /* prefetch next thread's go count */ - if ( child+1 < branch_factor && next_child_tid < num_threads ) - KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go ); - } -#endif /* KMP_CACHE_MANAGE */ - -#if KMP_BARRIER_ICV_PUSH - if ( propagate_icvs ) { - KMP_DEBUG_ASSERT( team != NULL ); - __kmp_init_implicit_task( team->t.t_ident, - team->t.t_threads[child_tid], team, child_tid, FALSE ); - load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs); - sync_icvs(); - } -#endif // KMP_BARRIER_ICV_PUSH - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing " - "T#%d(%d:%u) go(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, - child_tid, &child_bar -> b_go, child_bar -> b_go, - child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) ); - - /* release child from barrier */ - __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence ); - } - } - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} -#endif /* KMP_REVERSE_HYPER_BAR */ - - /* * Internal function to do a barrier. * If is_split is true, do a split barrier, otherwise, do a plain barrier @@ -2043,6 +1924,8 @@ __kmp_barrier( enum barrier_type bt, int gtid, int is_split, register kmp_team_t *team = this_thr -> th.th_team; register int status = 0; + ident_t * tmp_loc = __kmp_threads[ gtid ]->th.th_ident; + KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) ); @@ -2126,34 +2009,23 @@ __kmp_barrier( enum barrier_type bt, int gtid, int is_split, #endif /* OMP_30_ENABLED */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier - report frame end -#if USE_ITT_BUILD - // Collect information only if the file was opened succesfully. 
- if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file ) - { - ident_t * loc = this_thr->th.th_ident; - if (loc) { - // Use compiler-generated location to mark the frame: - // "<func>$omp$frame@[file:]<line>[:<col>]" - kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); - - kmp_uint64 fr_end; -#if defined( __GNUC__ ) -# if !defined( __INTEL_COMPILER ) - fr_end = __kmp_hardware_timestamp(); -# else - fr_end = __rdtsc(); -# endif -#else - fr_end = __rdtsc(); -#endif - K_DIAG( 3, ( "__kmp_barrier: T#%d(%d:%d) frame_begin = %llu, frame_end = %llu\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid, this_thr->th.th_frame_time, fr_end ) ); - - __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n", - str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time, fr_end ); - __kmp_str_loc_free( &str_loc ); - this_thr->th.th_frame_time = fr_end; + if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) { + kmp_uint64 tmp = __itt_get_timestamp(); + switch( __kmp_forkjoin_frames_mode ) { + case 1: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc ); + this_thr->th.th_frame_time = tmp; + break; + case 2: + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc ); + break; + case 3: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc ); + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc ); + this_thr->th.th_frame_time = tmp; + break; } } #endif /* USE_ITT_BUILD */ @@ -2465,7 +2337,7 @@ __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team, KMP_MB(); /* first, let's setup the master thread */ - master_th -> th.th_info .ds.ds_tid = 0; + master_th -> th.th_info.ds.ds_tid = 0; master_th -> th.th_team = team; master_th -> th.th_team_nproc = team -> t.t_nproc; master_th -> th.th_team_master = master_th; @@ -2514,6 +2386,17 @@ __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team, static void __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration +static void +__kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, +#if OMP_30_ENABLED + kmp_internal_control_t * new_icvs, + ident_t * loc +#else + int new_set_nproc, int new_set_dynamic, int new_set_nested, + int new_set_blocktime, int new_bt_intervals, int new_bt_set +#endif // OMP_30_ENABLED + ); // forward declaration + /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ int @@ -2527,7 +2410,7 @@ __kmp_fork_call( microtask_t microtask, launch_t invoker, /* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX va_list * ap #else va_list ap @@ -2576,7 +2459,6 @@ __kmp_fork_call( #endif - master_th->th.th_ident = loc; #if OMP_40_ENABLED @@ -2590,7 +2472,7 @@ __kmp_fork_call( argv = (void**)parent_team->t.t_argv; for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2686,11 +2568,11 @@ __kmp_fork_call( /* create a serialized parallel region? */ if ( nthreads == 1 ) { /* josh todo: hypothetical question: what do we do for OS X*? 
*/ -#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) +#if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ) void * args[ argc ]; #else void * * args = (void**) alloca( argc * sizeof( void * ) ); -#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) */ +#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM ) */ __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid )); @@ -2721,7 +2603,7 @@ __kmp_fork_call( if ( ap ) { for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ - #if KMP_ARCH_X86_64 && KMP_OS_LINUX + #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2741,7 +2623,7 @@ __kmp_fork_call( argv = args; for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ - #if KMP_ARCH_X86_64 && KMP_OS_LINUX + #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2957,7 +2839,7 @@ __kmp_fork_call( #endif /* OMP_40_ENABLED */ for( i=argc-1; i >= 0; --i ) /* TODO: revert workaround for Intel(R) 64 tracker #96 */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else *argv++ = va_arg( ap, void * ); @@ -2977,6 +2859,18 @@ __kmp_fork_call( root -> r.r_active = TRUE; __kmp_fork_team_threads( root, team, master_th, gtid ); + __kmp_setup_icv_copy(team, nthreads +#if OMP_30_ENABLED + , &master_th->th.th_current_task->td_icvs, loc +#else + , parent_team->t.t_set_nproc[master_tid], + parent_team->t.t_set_dynamic[master_tid], + parent_team->t.t_set_nested[master_tid], + parent_team->t.t_set_blocktime[master_tid], + parent_team->t.t_set_bt_intervals[master_tid], + parent_team->t.t_set_bt_set[master_tid] +#endif /* OMP_30_ENABLED */ + ); __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); @@ -2992,23 +2886,12 @@ __kmp_fork_call( __kmp_itt_region_forking( gtid ); #endif /* USE_ITT_BUILD */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY && OMP_30_ENABLED // Internal fork - report frame begin -#if USE_ITT_BUILD - // Collect information only if the file was opened succesfully. - if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file ) + if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) { - kmp_uint64 fr_begin; -#if defined( __GNUC__ ) -# if !defined( __INTEL_COMPILER ) - fr_begin = __kmp_hardware_timestamp(); -# else - fr_begin = __rdtsc(); -# endif -#else - fr_begin = __rdtsc(); -#endif if( ! 
( team->t.t_active_level > 1 ) ) { - master_th->th.th_frame_time = fr_begin; + master_th->th.th_frame_time = __itt_get_timestamp(); } } #endif /* USE_ITT_BUILD */ @@ -3134,7 +3017,10 @@ __kmp_join_call(ident_t *loc, int gtid // Either not in teams or exiting teams region // (teams is a frame and no other frames inside the teams) # endif /* OMP_40_ENABLED */ + { + master_th->th.th_ident = loc; __kmp_itt_region_joined( gtid ); + } #endif /* USE_ITT_BUILD */ #if OMP_40_ENABLED @@ -4644,6 +4530,7 @@ __kmp_register_root( int initial_thread ) root -> r.r_root_team -> t.t_threads[0] = root_thread; root -> r.r_hot_team -> t.t_threads[0] = root_thread; root_thread -> th.th_serial_team -> t.t_threads[0] = root_thread; + root_thread -> th.th_serial_team -> t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). root -> r.r_uber_thread = root_thread; /* initialize the thread, get it ready to go */ @@ -5007,6 +4894,19 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) TCW_4( __kmp_init_monitor, 1 ); __kmp_create_monitor( & __kmp_monitor ); KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) ); + #if KMP_OS_WINDOWS + // AC: wait until monitor has started. This is a fix for CQ232808. + // The reason is that if the library is loaded/unloaded in a loop with small (parallel) + // work in between, then there is high probability that monitor thread started after + // the library shutdown. At shutdown it is too late to cope with the problem, because + // when the master is in DllMain (process detach) the monitor has no chances to start + // (it is blocked), and master has no means to inform the monitor that the library has gone, + // because all the memory which the monitor can access is going to be released/reset. + while ( TCR_4(__kmp_init_monitor) < 2 ) { + KMP_YIELD( TRUE ); + } + KF_TRACE( 10, ( "after monitor thread has started\n" ) ); + #endif } __kmp_release_bootstrap_lock( & __kmp_monitor_lock ); } @@ -5049,6 +4949,7 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) 0 ); } KMP_ASSERT ( serial_team ); + serial_team -> t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). serial_team -> t.t_threads[0] = new_thr; KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", new_thr ) ); @@ -5144,76 +5045,94 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ static void -__kmp_reinitialize_team( - kmp_team_t * team, - int new_nproc, - #if OMP_30_ENABLED - kmp_internal_control_t * new_icvs, - ident_t * loc - #else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set - #endif // OMP_30_ENABLED -) { - int f; - #if OMP_30_ENABLED - KMP_DEBUG_ASSERT( team && new_nproc && new_icvs ); - KMP_DEBUG_ASSERT( ( ! 
TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); - team->t.t_ident = loc; - #else - KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc ); - #endif // OMP_30_ENABLED +__kmp_reinitialize_team( kmp_team_t *team, +#if OMP_30_ENABLED + kmp_internal_control_t *new_icvs, ident_t *loc +#else + int new_set_nproc, int new_set_dynamic, int new_set_nested, + int new_set_blocktime, int new_bt_intervals, int new_bt_set +#endif + ) { + KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n", + team->t.t_threads[0], team ) ); +#if OMP_30_ENABLED + KMP_DEBUG_ASSERT( team && new_icvs); + KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); + team->t.t_ident = loc; +#else + KMP_DEBUG_ASSERT( team && new_set_nproc ); +#endif // OMP_30_ENABLED team->t.t_id = KMP_GEN_TEAM_ID(); -#if KMP_BARRIER_ICV_PULL - // - // Copy the ICV's to the team structure, where all of the worker threads - // can access them and make their own copies after the barrier. - // + // Copy ICVs to the master thread's implicit taskdata +#if OMP_30_ENABLED load_icvs(new_icvs); - store_icvs(&team->t.t_initial_icvs, new_icvs); - - // - // Set up the master thread's copy of the ICV's. __kmp_fork_call() - // assumes they are already set in the master thread. - // FIXME - change that code to use the team->t.t_initial_icvs copy - // and eliminate this copy. - // __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE ); store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); sync_icvs(); - KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team ) ); +# else + team -> t.t_set_nproc[0] = new_set_nproc; + team -> t.t_set_dynamic[0] = new_set_dynamic; + team -> t.t_set_nested[0] = new_set_nested; + team -> t.t_set_blocktime[0] = new_set_blocktime; + team -> t.t_set_bt_intervals[0] = new_bt_intervals; + team -> t.t_set_bt_set[0] = new_bt_set; +# endif // OMP_30_ENABLED -#elif KMP_BARRIER_ICV_PUSH - // - // Set the ICV's in the master thread only. - // They will be propagated by the fork barrier. - // - __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE ); + KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n", + team->t.t_threads[0], team ) ); +} + +static void +__kmp_setup_icv_copy(kmp_team_t * team, int new_nproc, +#if OMP_30_ENABLED + kmp_internal_control_t * new_icvs, + ident_t * loc +#else + int new_set_nproc, int new_set_dynamic, int new_set_nested, + int new_set_blocktime, int new_bt_intervals, int new_bt_set +#endif // OMP_30_ENABLED + ) +{ + int f; + +#if OMP_30_ENABLED + KMP_DEBUG_ASSERT( team && new_nproc && new_icvs ); + KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); +#else + KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc ); +#endif // OMP_30_ENABLED + + // Master thread's copy of the ICVs was set up on the implicit taskdata in __kmp_reinitialize_team. + // __kmp_fork_call() assumes the master thread's implicit task has this data before this function is called. +#if KMP_BARRIER_ICV_PULL + // Copy the ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where all of the + // worker threads can access them and make their own copies after the barrier. 
load_icvs(new_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // the threads arrays should be allocated at this point + store_icvs(&team->t.t_threads[0]->th.th_fixed_icvs, new_icvs); sync_icvs(); + KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); - KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team ) ); +#elif KMP_BARRIER_ICV_PUSH + // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. + KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); #else - // - // Copy the icvs to each of the threads. This takes O(nthreads) time. - // -#if OMP_30_ENABLED + // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. +# if OMP_30_ENABLED load_icvs(new_icvs); -#endif - for( f=0 ; f<new_nproc ; f++) { +# endif // OMP_30_ENABLED + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // the threads arrays should be allocated at this point + for(f=1 ; f<new_nproc ; f++) { // skip the master thread # if OMP_30_ENABLED // TODO: GEH - pass in better source location info since usually NULL here - KF_TRACE( 10, ( "__kmp_reinitialize_team1: T#%d this_thread=%p team=%p\n", + KF_TRACE( 10, ( "__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f, team->t.t_threads[f], team ) ); __kmp_init_implicit_task( loc, team->t.t_threads[f], team, f, FALSE ); store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); - KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n", + KF_TRACE( 10, ( "__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", f, team->t.t_threads[f], team ) ); # else team -> t.t_set_nproc[f] = new_set_nproc; @@ -5226,9 +5145,8 @@ __kmp_reinitialize_team( } # if OMP_30_ENABLED sync_icvs(); -# endif -#endif // KMP_BARRIER_ICV_PUSH || KMP_BARRIER_ICV_PULL - +# endif // OMP_30_ENABLED +#endif // KMP_BARRIER_ICV_PULL } /* initialize the team data structure @@ -5246,6 +5164,8 @@ __kmp_initialize_team( int new_set_blocktime, int new_bt_intervals, int new_bt_set #endif // OMP_30_ENABLED ) { + KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) ); + /* verify */ KMP_DEBUG_ASSERT( team ); KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc ); @@ -5290,18 +5210,18 @@ __kmp_initialize_team( team -> t.t_control_stack_top = NULL; - __kmp_reinitialize_team( - team, new_nproc, - #if OMP_30_ENABLED - new_icvs, - loc - #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set - #endif // OMP_30_ENABLED - ); + __kmp_reinitialize_team( team, +#if OMP_30_ENABLED + new_icvs, loc +#else + new_set_nproc, new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); + KMP_MB(); + KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) ); } #if KMP_OS_LINUX @@ -5700,15 +5620,15 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, // TODO???: team -> t.t_max_active_levels = new_max_active_levels; team -> t.t_sched = new_icvs->sched; #endif - __kmp_reinitialize_team( team, new_nproc, + __kmp_reinitialize_team( team, #if OMP_30_ENABLED - new_icvs, - root->r.r_uber_thread->th.th_ident + new_icvs, root->r.r_uber_thread->th.th_ident #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + new_set_nproc, 
new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); + #if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { @@ -5768,15 +5688,14 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, if(team -> t.t_max_nproc < new_nproc) { /* reallocate larger arrays */ __kmp_reallocate_team_arrays(team, new_nproc); - __kmp_reinitialize_team( team, new_nproc, + __kmp_reinitialize_team( team, #if OMP_30_ENABLED - new_icvs, - NULL // TODO: !!! + new_icvs, NULL #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + new_set_nproc, new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); } #if KMP_OS_LINUX @@ -5859,8 +5778,8 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, # endif #endif - } - else { + } + else { KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" )); #if KMP_MIC // This case can mean that omp_set_num_threads() was called and the hot team size @@ -5877,15 +5796,14 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, team -> t.t_sched = new_icvs->sched; #endif - __kmp_reinitialize_team( team, new_nproc, + __kmp_reinitialize_team( team, #if OMP_30_ENABLED - new_icvs, - root->r.r_uber_thread->th.th_ident + new_icvs, root->r.r_uber_thread->th.th_ident #else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + new_set_nproc, new_set_dynamic, new_set_nested, + new_set_blocktime, new_bt_intervals, new_bt_set +#endif // OMP_30_ENABLED + ); #if OMP_30_ENABLED KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", @@ -6000,6 +5918,8 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, * up seems to really hurt performance a lot on the P4, so, let's not use * this... */ __kmp_allocate_team_arrays( team, max_nproc ); + + KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) ); __kmp_initialize_team( team, new_nproc, #if OMP_30_ENABLED new_icvs, @@ -6293,7 +6213,6 @@ __kmp_join_barrier( int gtid ) KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid )); - #if OMP_30_ENABLED if ( __kmp_tasking_mode == tskm_extra_barrier ) { __kmp_tasking_barrier( team, this_thr, gtid ); @@ -6329,25 +6248,6 @@ __kmp_join_barrier( int gtid ) #endif // OMP_30_ENABLED } - #if KMP_OS_WINDOWS - // AC: wait here until monitor has started. This is a fix for CQ232808. - // The reason is that if the library is loaded/unloaded in a loop with small (parallel) - // work in between, then there is high probability that monitor thread started after - // the library shutdown. At shutdown it is too late to cope with the problem, because - // when the master is in DllMain (process detach) the monitor has no chances to start - // (it is blocked), and master has no means to inform the monitor that the library has gone, - // because all the memory which the monitor can access is going to be released/reset. - // - // The moment before barrier_gather sounds appropriate, because master needs to - // wait for all workers anyway, and we want this to happen as late as possible, - // but before the shutdown which may happen after the barrier. 
- if( KMP_MASTER_TID( tid ) && TCR_4(__kmp_init_monitor) < 2 ) { - __kmp_wait_sleep( this_thr, (volatile kmp_uint32*)&__kmp_init_monitor, 2, 0 - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } - #endif - #if USE_ITT_BUILD if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) __kmp_itt_barrier_starting( gtid, itt_sync_obj ); @@ -6390,34 +6290,22 @@ __kmp_join_barrier( int gtid ) USE_ITT_BUILD_ARG( itt_sync_obj ) ); } +#if USE_ITT_BUILD && USE_ITT_NOTIFY // Join barrier - report frame end -#if USE_ITT_BUILD - // Collect information only if the file was opened successfully. - if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file ) - { - ident_t * loc = this_thr->th.th_ident; - if (loc) { - // Use compiler-generated location to mark the frame: - // "<func>$omp$frame@[file:]<line>[:<col>]" - kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); - - kmp_uint64 fr_end; -#if defined( __GNUC__ ) -# if !defined( __INTEL_COMPILER ) - fr_end = __kmp_hardware_timestamp(); -# else - fr_end = __rdtsc(); -# endif -#else - fr_end = __rdtsc(); -#endif - K_DIAG( 3, ( "__kmp_join_barrier: T#%d(%d:%d) frame_begin = %llu, frame_end = %llu\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid, this_thr->th.th_frame_time, fr_end ) ); - - __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n", - str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time, fr_end ); - - __kmp_str_loc_free( &str_loc ); + if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) { + kmp_uint64 tmp = __itt_get_timestamp(); + ident_t * loc = team->t.t_ident; + switch( __kmp_forkjoin_frames_mode ) { + case 1: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, loc ); + break; + case 2: + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, loc ); + break; + case 3: + __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, loc ); + __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, loc ); + break; } } #endif /* USE_ITT_BUILD */ @@ -6571,20 +6459,16 @@ __kmp_fork_barrier( int gtid, int tid ) #if OMP_30_ENABLED # if KMP_BARRIER_ICV_PULL - // - // FIXME - after __kmp_fork_call() is modified to not look at the - // master thread's implicit task ICV's, remove the ! KMP_MASTER_TID - // restriction from this if condition. - // - if (! KMP_MASTER_TID( tid ) ) { - // - // Copy the initial ICV's from the team struct to the implicit task - // for this tid. - // - __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid], - team, tid, FALSE ); - load_icvs(&team->t.t_initial_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &team->t.t_initial_icvs); + // Master thread's copy of the ICVs was set up on the implicit taskdata in __kmp_reinitialize_team. + // __kmp_fork_call() assumes the master thread's implicit task has this data before this function is called. + // We cannot modify __kmp_fork_call() to look at the fixed ICVs in the master's thread struct, because it is + // not always the case that the threads arrays have been allocated when __kmp_fork_call() is executed. + if (! KMP_MASTER_TID( tid ) ) { // master thread already has ICVs + // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. 
+ KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid )); + load_icvs(&team->t.t_threads[0]->th.th_fixed_icvs); + __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE ); + store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &team->t.t_threads[0]->th.th_fixed_icvs); sync_icvs(); } # endif // KMP_BARRIER_ICV_PULL @@ -6716,13 +6600,13 @@ __kmp_launch_thread( kmp_info_t *this_thr ) void __kmp_internal_end_dest( void *specific_gtid ) { - #ifdef __INTEL_COMPILER + #if KMP_COMPILER_ICC #pragma warning( push ) #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits #endif // Make sure no significant bits are lost int gtid = (kmp_intptr_t)specific_gtid - 1; - #ifdef __INTEL_COMPILER + #if KMP_COMPILER_ICC #pragma warning( pop ) #endif @@ -7503,7 +7387,6 @@ __kmp_do_serial_initialize( void ) __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; } __kmp_max_nth = __kmp_sys_max_nth; - __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub ); // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; @@ -7572,18 +7455,17 @@ __kmp_do_serial_initialize( void ) if ( __kmp_str_match_true( val ) ) { kmp_str_buf_t buffer; __kmp_str_buf_init( & buffer ); - __kmp_i18n_dump_catalog( buffer ); + __kmp_i18n_dump_catalog( & buffer ); __kmp_printf( "%s", buffer.str ); __kmp_str_buf_free( & buffer ); }; // if __kmp_env_free( & val ); #endif + __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub ); // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); - // omalyshe: This initialisation beats env var setting. - //__kmp_load_balance_interval = 1.0; // If the library is shut down properly, both pools must be NULL. Just in case, set them // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed. 
@@ -7876,38 +7758,6 @@ __kmp_parallel_initialize( void ) __kmp_print_version_2(); } -#if USE_ITT_BUILD - // Create CSV file to report frames - if( __kmp_forkjoin_frames_mode == 1 ) - { - // Open CSV file to write itt frame information - const char * csv_file; -/* Internal AXE variables - char * host_name = __kmp_env_get("INTEL_MRTE_HOST_NAME"); - char * out_dir = __kmp_env_get("INTEL_MRTE_DATA_DIR");*/ - char * host_name = __kmp_env_get("AMPLXE_HOSTNAME"); - char * out_dir = __kmp_env_get("AMPLXE_DATA_DIR"); - - if( out_dir && host_name ) { - csv_file = __kmp_str_format( "%s/omp-frames-hostname-%s.csv", out_dir, host_name ); - __kmp_itt_csv_file = fopen( csv_file, "w" ); - __kmp_str_free( &csv_file ); - } else { -#ifdef KMP_DEBUG - // Create CSV file in the current dir - csv_file = __kmp_str_format( "./omp-frames-hostname-xxx.csv" ); - __kmp_itt_csv_file = fopen( csv_file, "w" ); - __kmp_str_free( &csv_file ); -#endif - } - if( __kmp_itt_csv_file ) { - __kmp_str_buf_init( & __kmp_itt_frame_buffer ); - __kmp_str_buf_print( & __kmp_itt_frame_buffer, "name,start_tsc.TSC,end_tsc,pid,tid\n" ); - } - } - -#endif /* USE_ITT_BUILD */ - /* we have finished parallel initialization */ TCW_SYNC_4(__kmp_init_parallel, TRUE); @@ -8347,16 +8197,6 @@ __kmp_cleanup( void ) __kmp_i18n_catclose(); -#if USE_ITT_BUILD - // Close CSV file for frames - if( __kmp_forkjoin_frames_mode && __kmp_itt_csv_file ) { - fprintf( __kmp_itt_csv_file, __kmp_itt_frame_buffer.str ); - - __kmp_str_buf_free( & __kmp_itt_frame_buffer ); - fclose( __kmp_itt_csv_file ); - } -#endif /* USE_ITT_BUILD */ - KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) ); } @@ -8576,14 +8416,6 @@ __kmp_aux_set_defaults( * internal fast reduction routines */ -// implementation rev. 0.4 -// AT: determine CPU, and always use 'critical method' if non-Intel -// AT: test loc != NULL -// AT: what to return if lck == NULL -// AT: tune the cut-off point for atomic reduce method -// AT: tune what to return depending on the CPU and platform configuration -// AT: tune what to return depending on team size -// AT: move this function out to kmp_csupport.c PACKED_REDUCTION_METHOD_T __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), @@ -8641,22 +8473,10 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, #error "Unknown or unsupported OS" #endif // KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_DARWIN - #elif KMP_ARCH_X86 + #elif KMP_ARCH_X86 || KMP_ARCH_ARM #if KMP_OS_LINUX || KMP_OS_WINDOWS - // similar to win_32 - // 4x1x2 fxqlin04, the 'linear,linear' barrier - - // similar to lin_32 - // 4x1x2 fxqwin04, the 'linear,linear' barrier - - // actual measurement shows that the critical section method is better if team_size <= 8; - // what happenes when team_size > 8 ? 
( no machine to test ) - - // TO DO: need to run a 32-bit code on Intel(R) 64 - // TO DO: test the 'hyper,hyper,1,1' barrier - // basic tuning if( atomic_available ) { @@ -8667,7 +8487,6 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, #elif KMP_OS_DARWIN - if( atomic_available && ( num_vars <= 3 ) ) { retval = atomic_reduce_block; } else if( tree_available ) { @@ -8686,18 +8505,6 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, } - //AT: TO DO: critical block method not implemented by PAROPT - //if( retval == __kmp_critical_reduce_block ) { - // if( lck == NULL ) { // critical block method not implemented by PAROPT - // } - //} - - // tune what to return depending on the CPU and platform configuration - // (sometimes tree method is slower than critical) - - // probably tune what to return depending on team size - - // KMP_FORCE_REDUCTION if( __kmp_force_reduction_method != reduction_method_not_defined ) { |