Diffstat (limited to 'openmp/runtime/src')
88 files changed, 8766 insertions, 5773 deletions
diff --git a/openmp/runtime/src/defs.mk b/openmp/runtime/src/defs.mk index 45ba2bde5b7..14a0e90ff80 100644 --- a/openmp/runtime/src/defs.mk +++ b/openmp/runtime/src/defs.mk @@ -1,6 +1,6 @@ # defs.mk -# $Revision: 42061 $ -# $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ +# $Revision: 42951 $ +# $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ # #//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index 2c58f4c673f..a82b7af61f5 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -161,10 +161,8 @@ # Regular entry points __kmp_wait_yield_4 __kmp_wait_yield_8 - __kmp_wait_sleep __kmp_fork_call __kmp_invoke_microtask - __kmp_release __kmp_launch_monitor __kmp_launch_worker __kmp_reap_monitor @@ -192,6 +190,14 @@ _You_must_link_with_Microsoft_OpenMP_library DATA %endif + __kmp_wait_32 + __kmp_wait_64 + __kmp_wait_oncore + __kmp_release_32 + __kmp_release_64 + __kmp_release_oncore + + # VT_getthid 1 # vtgthid 2 @@ -360,6 +366,18 @@ kmpc_set_defaults 224 __kmpc_cancel 244 __kmpc_cancellationpoint 245 __kmpc_cancel_barrier 246 + __kmpc_dist_for_static_init_4 247 + __kmpc_dist_for_static_init_4u 248 + __kmpc_dist_for_static_init_8 249 + __kmpc_dist_for_static_init_8u 250 + __kmpc_dist_dispatch_init_4 251 + __kmpc_dist_dispatch_init_4u 252 + __kmpc_dist_dispatch_init_8 253 + __kmpc_dist_dispatch_init_8u 254 + __kmpc_team_static_init_4 255 + __kmpc_team_static_init_4u 256 + __kmpc_team_static_init_8 257 + __kmpc_team_static_init_8u 258 %endif # OMP_40 %endif diff --git a/openmp/runtime/src/exports_so.txt b/openmp/runtime/src/exports_so.txt index 9ace78fd549..0b16049dd12 100644 --- a/openmp/runtime/src/exports_so.txt +++ b/openmp/runtime/src/exports_so.txt @@ -40,6 +40,8 @@ VERSION { __kmp_thread_pool; __kmp_thread_pool_nth; + __kmp_reset_stats; + #if USE_ITT_BUILD # # ITT support. @@ -64,8 +66,12 @@ VERSION { __kmp_launch_worker; __kmp_reap_monitor; __kmp_reap_worker; - __kmp_release; - __kmp_wait_sleep; + __kmp_release_32; + __kmp_release_64; + __kmp_release_oncore; + __kmp_wait_32; + __kmp_wait_64; + __kmp_wait_oncore; __kmp_wait_yield_4; __kmp_wait_yield_8; diff --git a/openmp/runtime/src/extractExternal.cpp b/openmp/runtime/src/extractExternal.cpp index 3f15c8e306e..41f03a4a8b2 100644 --- a/openmp/runtime/src/extractExternal.cpp +++ b/openmp/runtime/src/extractExternal.cpp @@ -1,7 +1,7 @@ /* * extractExternal.cpp - * $Revision: 42181 $ - * $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ + * $Revision: 43084 $ + * $Date: 2014-04-15 09:15:14 -0500 (Tue, 15 Apr 2014) $ */ diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt index e172d3e3084..c925ef97cce 100644 --- a/openmp/runtime/src/i18n/en_US.txt +++ b/openmp/runtime/src/i18n/en_US.txt @@ -1,6 +1,6 @@ # en_US.txt # -# $Revision: 42659 $ -# $Date: 2013-09-12 09:22:48 -0500 (Thu, 12 Sep 2013) $ +# $Revision: 43419 $ +# $Date: 2014-08-27 14:59:52 -0500 (Wed, 27 Aug 2014) $ # #//===----------------------------------------------------------------------===// @@ -40,7 +40,7 @@ Language "English" Country "USA" LangId "1033" Version "2" -Revision "20130911" +Revision "20140827" @@ -290,7 +290,7 @@ ChangeThreadAffMaskError "Cannot change thread affinity mask." 
ThreadsMigrate "%1$s: Threads may migrate across %2$d innermost levels of machine" DecreaseToThreads "%1$s: decrease to %2$d threads" IncreaseToThreads "%1$s: increase to %2$d threads" -BoundToOSProcSet "%1$s: Internal thread %2$d bound to OS proc set %3$s" +OBSOLETE "%1$s: Internal thread %2$d bound to OS proc set %3$s" AffCapableUseCpuinfo "%1$s: Affinity capable, using cpuinfo file" AffUseGlobCpuid "%1$s: Affinity capable, using global cpuid info" AffCapableUseFlat "%1$s: Affinity capable, using default \"flat\" topology" @@ -395,9 +395,17 @@ AffThrPlaceInvalid "%1$s: invalid value \"%2$s\", valid format is \"nC AffThrPlaceUnsupported "KMP_PLACE_THREADS ignored: unsupported architecture." AffThrPlaceManyCores "KMP_PLACE_THREADS ignored: too many cores requested." SyntaxErrorUsing "%1$s: syntax error, using %2$s." -AdaptiveNotSupported "%1$s: Adaptive locks are not supported; using queuing." -EnvSyntaxError "%1$s: Invalid symbols found. Check the value \"%2$s\"." -EnvSpacesNotAllowed "%1$s: Spaces between digits are not allowed \"%2$s\"." +AdaptiveNotSupported "%1$s: Adaptive locks are not supported; using queuing." +EnvSyntaxError "%1$s: Invalid symbols found. Check the value \"%2$s\"." +EnvSpacesNotAllowed "%1$s: Spaces between digits are not allowed \"%2$s\"." +BoundToOSProcSet "%1$s: pid %2$d thread %3$d bound to OS proc set %4$s" +CnsLoopIncrIllegal "%1$s error: parallel loop increment and condition are inconsistent." +NoGompCancellation "libgomp cancellation is not currently supported." +AffThrPlaceNonUniform "KMP_PLACE_THREADS ignored: non-uniform topology." +AffThrPlaceNonThreeLevel "KMP_PLACE_THREADS ignored: only three-level topology is supported." +AffGranTopGroup "%1$s: granularity=%2$s is not supported with KMP_TOPOLOGY_METHOD=group. Using \"granularity=fine\"." +AffGranGroupType "%1$s: granularity=group is not supported with KMP_AFFINITY=%2$s. Using \"granularity=core\"." + # -------------------------------------------------------------------------------------------------- -*- HINTS -*- diff --git a/openmp/runtime/src/include/25/iomp.h.var b/openmp/runtime/src/include/25/iomp.h.var index 79b24eca26e..a0af224c633 100644 --- a/openmp/runtime/src/include/25/iomp.h.var +++ b/openmp/runtime/src/include/25/iomp.h.var @@ -1,7 +1,7 @@ /* * include/25/iomp.h.var - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/include/25/iomp_lib.h.var b/openmp/runtime/src/include/25/iomp_lib.h.var index 644f012ace2..d6aa093ffaa 100644 --- a/openmp/runtime/src/include/25/iomp_lib.h.var +++ b/openmp/runtime/src/include/25/iomp_lib.h.var @@ -1,6 +1,6 @@ ! include/25/iomp_lib.h.var -! $Revision: 42061 $ -! $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! 
!//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/include/25/omp.h.var b/openmp/runtime/src/include/25/omp.h.var index 603f4744cf4..12f71a4d008 100644 --- a/openmp/runtime/src/include/25/omp.h.var +++ b/openmp/runtime/src/include/25/omp.h.var @@ -1,7 +1,7 @@ /* * include/25/omp.h.var - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/include/25/omp_lib.f.var b/openmp/runtime/src/include/25/omp_lib.f.var index a4622ea61a4..3ece259630b 100644 --- a/openmp/runtime/src/include/25/omp_lib.f.var +++ b/openmp/runtime/src/include/25/omp_lib.f.var @@ -1,6 +1,6 @@ ! include/25/omp_lib.f.var -! $Revision: 42181 $ -! $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! !//===----------------------------------------------------------------------===// @@ -314,7 +314,7 @@ !dec$ else !*** -!*** On Windows* OS IA-32 architecture, the Fortran entry points have an +!*** On Windows* OS IA-32 architecture, the Fortran entry points have an !*** underscore prepended. !*** diff --git a/openmp/runtime/src/include/25/omp_lib.f90.var b/openmp/runtime/src/include/25/omp_lib.f90.var index 01bf725bf8e..0d86a98cea6 100644 --- a/openmp/runtime/src/include/25/omp_lib.f90.var +++ b/openmp/runtime/src/include/25/omp_lib.f90.var @@ -1,6 +1,6 @@ ! include/25/omp_lib.f90.var -! $Revision: 42061 $ -! $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! !//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/include/25/omp_lib.h.var b/openmp/runtime/src/include/25/omp_lib.h.var index d0527e05a49..4b93c98bc95 100644 --- a/openmp/runtime/src/include/25/omp_lib.h.var +++ b/openmp/runtime/src/include/25/omp_lib.h.var @@ -1,6 +1,6 @@ ! include/25/omp_lib.h.var -! $Revision: 42181 $ -! $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! !//===----------------------------------------------------------------------===// @@ -301,7 +301,7 @@ !dec$ else !*** -!*** On Windows* OS IA-32 architecture, the Fortran entry points have an +!*** On Windows* OS IA-32 architecture, the Fortran entry points have an !*** underscore prepended. !*** diff --git a/openmp/runtime/src/include/30/iomp.h.var b/openmp/runtime/src/include/30/iomp.h.var index 0efa3d16ca5..f24901b8180 100644 --- a/openmp/runtime/src/include/30/iomp.h.var +++ b/openmp/runtime/src/include/30/iomp.h.var @@ -1,7 +1,7 @@ /* * include/30/iomp.h.var - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/include/30/iomp_lib.h.var b/openmp/runtime/src/include/30/iomp_lib.h.var index bf2a8e20c4f..7831073b2af 100644 --- a/openmp/runtime/src/include/30/iomp_lib.h.var +++ b/openmp/runtime/src/include/30/iomp_lib.h.var @@ -1,6 +1,6 @@ ! include/30/iomp_lib.h.var -! $Revision: 42061 $ -! $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! 
!//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/include/30/omp.h.var b/openmp/runtime/src/include/30/omp.h.var index 79c3400d0de..d6f6510d958 100644 --- a/openmp/runtime/src/include/30/omp.h.var +++ b/openmp/runtime/src/include/30/omp.h.var @@ -1,7 +1,7 @@ /* * include/30/omp.h.var - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/include/30/omp_lib.f.var b/openmp/runtime/src/include/30/omp_lib.f.var index c3c546b450c..04607f947f9 100644 --- a/openmp/runtime/src/include/30/omp_lib.f.var +++ b/openmp/runtime/src/include/30/omp_lib.f.var @@ -1,6 +1,6 @@ ! include/30/omp_lib.f.var -! $Revision: 42181 $ -! $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! !//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/include/30/omp_lib.f90.var b/openmp/runtime/src/include/30/omp_lib.f90.var index 666ddfd591e..87c2b7d6dd4 100644 --- a/openmp/runtime/src/include/30/omp_lib.f90.var +++ b/openmp/runtime/src/include/30/omp_lib.f90.var @@ -1,6 +1,6 @@ ! include/30/omp_lib.f90.var -! $Revision: 42061 $ -! $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! !//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/include/30/omp_lib.h.var b/openmp/runtime/src/include/30/omp_lib.h.var index b3e3fc77340..6643c93aeeb 100644 --- a/openmp/runtime/src/include/30/omp_lib.h.var +++ b/openmp/runtime/src/include/30/omp_lib.h.var @@ -1,6 +1,6 @@ ! include/30/omp_lib.h.var -! $Revision: 42181 $ -! $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ +! $Revision: 42951 $ +! $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ ! !//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/include/40/iomp.h.var b/openmp/runtime/src/include/40/iomp.h.var index 8aeb38c3bc2..adb7dcc5048 100644 --- a/openmp/runtime/src/include/40/iomp.h.var +++ b/openmp/runtime/src/include/40/iomp.h.var @@ -91,7 +91,7 @@ } kmp_cancel_kind_t; extern int __KAI_KMPC_CONVENTION kmp_get_cancellation_status(kmp_cancel_kind_t); - + # undef __KAI_KMPC_CONVENTION /* Warning: diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index f5dd10f8baa..6daf9735601 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1,8 +1,8 @@ /*! \file */ /* * kmp.h -- KPTS runtime header file. 
- * $Revision: 42816 $ - * $Date: 2013-11-11 15:33:37 -0600 (Mon, 11 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -28,8 +28,6 @@ /* Defines for OpenMP 3.0 tasking and auto scheduling */ -#if OMP_30_ENABLED - # ifndef KMP_STATIC_STEAL_ENABLED # define KMP_STATIC_STEAL_ENABLED 1 # endif @@ -56,8 +54,6 @@ #define TASK_EXPLICIT 1 #define TASK_IMPLICIT 0 -#endif // OMP_30_ENABLED - #define KMP_CANCEL_THREADS #define KMP_THREAD_ATTR @@ -79,6 +75,10 @@ #include "kmp_os.h" +#if KMP_STATS_ENABLED +class kmp_stats_list; +#endif + #if KMP_ARCH_X86 || KMP_ARCH_X86_64 #include <xmmintrin.h> #endif @@ -125,6 +125,24 @@ #define USE_FAST_MEMORY 3 #endif +#ifndef KMP_NESTED_HOT_TEAMS +# define KMP_NESTED_HOT_TEAMS 0 +# define USE_NESTED_HOT_ARG(x) +#else +# if KMP_NESTED_HOT_TEAMS +# if OMP_40_ENABLED +# define USE_NESTED_HOT_ARG(x) ,x +# else +// Nested hot teams feature depends on omp 4.0, disable it for earlier versions +# undef KMP_NESTED_HOT_TEAMS +# define KMP_NESTED_HOT_TEAMS 0 +# define USE_NESTED_HOT_ARG(x) +# endif +# else +# define USE_NESTED_HOT_ARG(x) +# endif +#endif + // Assume using BGET compare_exchange instruction instead of lock by default. #ifndef USE_CMP_XCHG_FOR_BGET #define USE_CMP_XCHG_FOR_BGET 1 @@ -459,15 +477,6 @@ typedef int PACKED_REDUCTION_METHOD_T; /* * Only Linux* OS and Windows* OS support thread affinity. */ -#if (KMP_OS_LINUX || KMP_OS_WINDOWS) && !KMP_OS_CNK && !KMP_ARCH_PPC64 -# define KMP_AFFINITY_SUPPORTED 1 -#elif KMP_OS_DARWIN || KMP_OS_FREEBSD || KMP_OS_CNK || KMP_ARCH_PPC64 -// affinity not supported -# define KMP_AFFINITY_SUPPORTED 0 -#else -# error "Unknown or unsupported OS" -#endif - #if KMP_AFFINITY_SUPPORTED extern size_t __kmp_affin_mask_size; @@ -540,11 +549,14 @@ typedef unsigned char kmp_affin_mask_t; # if KMP_ARCH_X86_64 +// GROUP_AFFINITY is already defined for _MSC_VER>=1600 (VS2010 and later). +# if _MSC_VER < 1600 typedef struct GROUP_AFFINITY { - KAFFINITY mask; - WORD group; - WORD reserved[3]; + KAFFINITY Mask; + WORD Group; + WORD Reserved[3]; } GROUP_AFFINITY; +# endif typedef DWORD_PTR kmp_affin_mask_t; @@ -798,7 +810,6 @@ extern unsigned int __kmp_place_core_offset; #define __kmp_entry_gtid() __kmp_get_global_thread_id_reg() #define __kmp_tid_from_gtid(gtid) ( KMP_DEBUG_ASSERT( (gtid) >= 0 ), \ - /*(__kmp_threads[ (gtid) ]->th.th_team_serialized) ? 
0 : */ /* TODO remove this check, it is redundant */ \ __kmp_threads[ (gtid) ]->th.th_info.ds.ds_tid ) #define __kmp_get_tid() ( __kmp_tid_from_gtid( __kmp_get_gtid() ) ) @@ -865,6 +876,9 @@ extern unsigned int __kmp_place_core_offset; #define KMP_MAX_STKOFFSET KMP_MAX_STKSIZE #define KMP_DEFAULT_STKOFFSET KMP_MIN_STKOFFSET +#define KMP_MIN_STKPADDING (0) +#define KMP_MAX_STKPADDING (2 * 1024 * 1024) + #define KMP_MIN_MONITOR_WAKEUPS (1) /* min number of times monitor wakes up per second */ #define KMP_MAX_MONITOR_WAKEUPS (1000) /* maximum number of times monitor can wake up per second */ #define KMP_BLOCKTIME_MULTIPLIER (1000) /* number of blocktime units per second */ @@ -952,12 +966,14 @@ extern unsigned int __kmp_place_core_offset; #elif KMP_OS_LINUX # define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ # define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ -#elif KMP_OS_DARWIN || KMP_OS_FREEBSD -/* TODO: tune for OS */ +#elif KMP_OS_DARWIN +/* TODO: tune for KMP_OS_DARWIN */ +# define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ +# define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ +#elif KMP_OS_FREEBSD +/* TODO: tune for KMP_OS_FREEBSD */ # define KMP_INIT_WAIT 1024U /* initial number of spin-tests */ # define KMP_NEXT_WAIT 512U /* susequent number of spin-tests */ -#else -# error "Unknown or unsupported OS" #endif #if KMP_ARCH_X86 || KMP_ARCH_X86_64 @@ -968,12 +984,14 @@ struct kmp_cpuid { kmp_uint32 edx; }; extern void __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p ); -# if KMP_MIC +# if KMP_ARCH_X86 + extern void __kmp_x86_pause( void ); +# elif KMP_MIC static void __kmp_x86_pause( void ) { _mm_delay_32( 100 ); }; # else - extern void __kmp_x86_pause( void ); + static void __kmp_x86_pause( void ) { _mm_pause(); }; # endif -# define KMP_CPU_PAUSE() __kmp_x86_pause() +# define KMP_CPU_PAUSE() __kmp_x86_pause() #elif KMP_ARCH_PPC64 # define KMP_PPC64_PRI_LOW() __asm__ volatile ("or 1, 1, 1") # define KMP_PPC64_PRI_MED() __asm__ volatile ("or 2, 2, 2") @@ -985,7 +1003,7 @@ extern void __kmp_x86_cpuid( int mode, int mode2, struct kmp_cpuid *p ); #define KMP_INIT_YIELD(count) { (count) = __kmp_yield_init; } -#define KMP_YIELD(cond) { KMP_CPU_PAUSE(); __kmp_static_yield( (cond) ); } +#define KMP_YIELD(cond) { KMP_CPU_PAUSE(); __kmp_yield( (cond) ); } // Note the decrement of 2 in the following Macros. With KMP_LIBRARY=turnaround, // there should be no yielding since the starting value from KMP_INIT_YIELD() is odd. 
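A note on the KMP_CPU_PAUSE/KMP_YIELD hunk above: a minimal sketch of how these macros typically compose into a spin-wait, assuming the TCR_4 and KMP_YIELD_SPIN helpers declared elsewhere in kmp_os.h/kmp.h; the locals spinner, checker and spins are illustrative only and do not come from this patch.

    kmp_uint32 spins;
    KMP_INIT_YIELD(spins);                 /* spins = __kmp_yield_init */
    while (TCR_4(*spinner) != checker) {   /* hypothetical flag location and target value */
        KMP_CPU_PAUSE();                   /* _mm_pause() on plain x86/x86_64, _mm_delay_32(100) on MIC */
        KMP_YIELD_SPIN(spins);             /* counts down by 2 and yields to the OS when exhausted */
    }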
@@ -1533,6 +1551,9 @@ typedef struct kmp_disp { dispatch_private_info_t *th_disp_buffer; kmp_int32 th_disp_index; void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64 +#if KMP_USE_INTERNODE_ALIGNMENT + char more_padding[INTERNODE_CACHE_LINE]; +#endif } kmp_disp_t; /* ------------------------------------------------------------------------ */ @@ -1557,6 +1578,12 @@ typedef struct kmp_disp { # error "Barrier unused bit must be smaller than barrier bump bit" #endif +// Constants for release barrier wait state: currently, hierarchical only +#define KMP_BARRIER_NOT_WAITING 0 // Normal state; worker not in wait_sleep +#define KMP_BARRIER_OWN_FLAG 1 // Normal state; worker waiting on own b_go flag in release +#define KMP_BARRIER_PARENT_FLAG 2 // Special state; worker waiting on parent's b_go flag in release +#define KMP_BARRIER_SWITCH_TO_OWN_FLAG 3 // Special state; tells worker to shift from parent to own b_go +#define KMP_BARRIER_SWITCHING 4 // Special state; worker resets appropriate flag on wake-up enum barrier_type { bs_plain_barrier = 0, /* 0, All non-fork/join barriers (except reduction barriers if enabled) */ @@ -1576,16 +1603,58 @@ typedef enum kmp_bar_pat { /* Barrier communication patterns */ bp_linear_bar = 0, /* Single level (degenerate) tree */ bp_tree_bar = 1, /* Balanced tree with branching factor 2^n */ bp_hyper_bar = 2, /* Hypercube-embedded tree with min branching factor 2^n */ - bp_last_bar = 3 /* Placeholder to mark the end */ + bp_hierarchical_bar = 3, /* Machine hierarchy tree */ + bp_last_bar = 4 /* Placeholder to mark the end */ } kmp_bar_pat_e; +# define KMP_BARRIER_ICV_PUSH 1 + +/* Record for holding the values of the internal controls stack records */ +typedef struct kmp_internal_control { + int serial_nesting_level; /* corresponds to the value of the th_team_serialized field */ + kmp_int8 nested; /* internal control for nested parallelism (per thread) */ + kmp_int8 dynamic; /* internal control for dynamic adjustment of threads (per thread) */ + kmp_int8 bt_set; /* internal control for whether blocktime is explicitly set */ + int blocktime; /* internal control for blocktime */ + int bt_intervals; /* internal control for blocktime intervals */ + int nproc; /* internal control for #threads for next parallel region (per thread) */ + int max_active_levels; /* internal control for max_active_levels */ + kmp_r_sched_t sched; /* internal control for runtime schedule {sched,chunk} pair */ +#if OMP_40_ENABLED + kmp_proc_bind_t proc_bind; /* internal control for affinity */ +#endif // OMP_40_ENABLED + struct kmp_internal_control *next; +} kmp_internal_control_t; + +static inline void +copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src ) { + *dst = *src; +} + /* Thread barrier needs volatile barrier fields */ typedef struct KMP_ALIGN_CACHE kmp_bstate { - volatile kmp_uint b_arrived; /* STATE => task reached synch point. */ - #if (KMP_PERF_V19 == KMP_ON) - KMP_ALIGN_CACHE - #endif - volatile kmp_uint b_go; /* STATE => task should proceed. */ + // th_fixed_icvs is aligned by virtue of kmp_bstate being aligned (and all uses of it). + // It is not explicitly aligned below, because we *don't* want it to be padded -- instead, + // we fit b_go into the same cache line with th_fixed_icvs, enabling NGO cache lines + // stores in the hierarchical barrier. 
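For the kmp_internal_control_t/copy_icvs hunk above, a hedged sketch of how the master's ICVs can be snapshotted for a new team before per-region overrides are applied; master_th and requested_nproc are illustrative names, and the real logic lives in the fork path rather than in this fragment. td_icvs is the per-task ICV block shown later in this patch.

    kmp_internal_control_t new_icvs;
    copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); /* plain struct copy */
    new_icvs.nproc  = requested_nproc; /* threads for the next parallel region */
    new_icvs.nested = 0;               /* example override; remaining fields keep the master's values */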
+ kmp_internal_control_t th_fixed_icvs; // Initial ICVs for the thread + // Tuck b_go into end of th_fixed_icvs cache line, so it can be stored with same NGO store + volatile kmp_uint64 b_go; // STATE => task should proceed (hierarchical) + KMP_ALIGN_CACHE volatile kmp_uint64 b_arrived; // STATE => task reached synch point. + kmp_uint32 *skip_per_level; + kmp_uint32 my_level; + kmp_int32 parent_tid; + kmp_uint32 old_tid; + kmp_uint32 depth; + struct kmp_bstate *parent_bar; + kmp_team_t *team; + kmp_uint64 leaf_state; + kmp_uint32 nproc; + kmp_uint8 base_leaf_kids; + kmp_uint8 leaf_kids; + kmp_uint8 offset; + kmp_uint8 wait_flag; + kmp_uint8 use_oncore_barrier; } kmp_bstate_t; union KMP_ALIGN_CACHE kmp_barrier_union { @@ -1698,7 +1767,6 @@ typedef union KMP_ALIGN_CACHE kmp_desc { typedef struct kmp_local { volatile int this_construct; /* count of single's encountered by thread */ - volatile int last_construct; /* cache for team's count used by old algorithm */ void *reduce_data; #if KMP_USE_BGET void *bget_data; @@ -1721,151 +1789,54 @@ typedef struct kmp_local { } kmp_local_t; -/* Record for holding the values of the internal controls stack records */ -typedef struct KMP_ALIGN_CACHE kmp_internal_control { - int serial_nesting_level; /* corresponds to the value of the th_team_serialized field */ - int nested; /* internal control for nested parallelism (per thread) */ - int dynamic; /* internal control for dynamic adjustment of threads (per thread) */ - int nproc; /* internal control for # of threads for next parallel region (per thread) */ - int blocktime; /* internal control for blocktime */ - int bt_intervals; /* internal control for blocktime intervals */ - int bt_set; /* internal control for whether blocktime is explicitly set */ -#if OMP_30_ENABLED - int max_active_levels; /* internal control for max_active_levels */ - kmp_r_sched_t sched; /* internal control for runtime schedule {sched,chunk} pair */ -#endif // OMP_30_ENABLED -#if OMP_40_ENABLED - kmp_proc_bind_t proc_bind; /* internal control for affinity */ -#endif // OMP_40_ENABLED - struct kmp_internal_control *next; - -} kmp_internal_control_t; - -#if OMP_30_ENABLED -static inline void -copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src ) { - *dst = *src; -} -#endif // OMP_30_ENABLED - -#if OMP_30_ENABLED - - #define get__blocktime( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) - #define get__bt_set( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) - #define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) +#define get__blocktime( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime) +#define get__bt_set( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set) +#define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals) - #define get__nested_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested) - #define get__dynamic_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic) - #define get__nproc_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc) - #define get__sched_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched) +#define get__nested_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nested) +#define get__dynamic_2(xteam,xtid) 
((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.dynamic) +#define get__nproc_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.nproc) +#define get__sched_2(xteam,xtid) ((xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.sched) - #define set__blocktime_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime ) = (xval) ) +#define set__blocktime_team( xteam, xtid, xval ) \ + ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.blocktime ) = (xval) ) - #define set__bt_intervals_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals ) = (xval) ) +#define set__bt_intervals_team( xteam, xtid, xval ) \ + ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_intervals ) = (xval) ) - #define set__bt_set_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set ) = (xval) ) +#define set__bt_set_team( xteam, xtid, xval ) \ + ( ( (xteam)->t.t_threads[(xtid)]->th.th_current_task->td_icvs.bt_set ) = (xval) ) +#define set__nested( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.nested ) = (xval) ) +#define get__nested( xthread ) \ + ( ( (xthread)->th.th_current_task->td_icvs.nested ) ? (FTN_TRUE) : (FTN_FALSE) ) - #define set__nested( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.nested ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nested ) = \ - (xval) ) - #define get__nested( xthread ) \ - ( ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nested ) \ - ? (FTN_TRUE) : (FTN_FALSE) ) +#define set__dynamic( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) = (xval) ) +#define get__dynamic( xthread ) \ + ( ( (xthread)->th.th_current_task->td_icvs.dynamic ) ? (FTN_TRUE) : (FTN_FALSE) ) - #define set__dynamic( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.dynamic ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.dynamic ) = \ - (xval) ) - #define get__dynamic( xthread ) \ - ( ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.dynamic ) \ - ? 
(FTN_TRUE) : (FTN_FALSE) ) +#define set__nproc( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.nproc ) = (xval) ) - #define set__nproc( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.nproc ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nproc ) = \ - (xval) ) +#define set__max_active_levels( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.max_active_levels ) = (xval) ) - #define set__nproc_p( xthread, xval ) \ - ( \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.nproc ) = \ - (xval) ) - - #define set__max_active_levels( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.max_active_levels ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.max_active_levels ) = \ - (xval) ) - - #define set__sched( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_threads[0] ->th.th_current_task->td_icvs.sched ) = \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.sched ) = \ - (xval) ) +#define set__sched( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.sched ) = (xval) ) #if OMP_40_ENABLED - #define set__proc_bind( xthread, xval ) \ - ( \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.proc_bind ) = \ - (xval) ) - - #define get__proc_bind( xthread ) \ - ( (xthread)->th.th_team ->t.t_threads[((xthread)->th.th_info.ds.ds_tid)]->th.th_current_task->td_icvs.proc_bind ) +#define set__proc_bind( xthread, xval ) \ + ( ( (xthread)->th.th_current_task->td_icvs.proc_bind ) = (xval) ) +#define get__proc_bind( xthread ) \ + ( (xthread)->th.th_current_task->td_icvs.proc_bind ) #endif /* OMP_40_ENABLED */ -#else - - #define get__blocktime( xteam, xtid ) ((xteam)->t.t_set_blocktime[ (xtid)]) - #define get__bt_set( xteam, xtid ) ((xteam)->t.t_set_bt_set[ (xtid)]) - #define get__bt_intervals( xteam, xtid ) ((xteam)->t.t_set_bt_intervals[(xtid)]) - - #define set__nested( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_set_nested[0] ) = \ - ( (xthread)->th.th_team->t.t_set_nested[((xthread)->th.th_info.ds.ds_tid)] ) = \ - (xval) ) - #define get__nested( xthread ) \ - ( ( (xthread)->th.th_team->t.t_set_nested[((xthread)->th.th_info.ds.ds_tid)] ) \ - ? (FTN_TRUE) : (FTN_FALSE) ) - - #define set__dynamic( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_set_dynamic[0] ) = \ - ( (xthread)->th.th_team->t.t_set_dynamic[((xthread)->th.th_info.ds.ds_tid)] ) = \ - (xval) ) - #define get__dynamic( xthread ) \ - ( ( (xthread)->th.th_team->t.t_set_dynamic[((xthread)->th.th_info.ds.ds_tid)] ) \ - ? 
(FTN_TRUE) : (FTN_FALSE) ) - - #define set__nproc( xthread, xval ) \ - ( ( (xthread)->th.th_serial_team->t.t_set_nproc[0] ) = \ - ( (xthread)->th.th_team->t.t_set_nproc[((xthread)->th.th_info.ds.ds_tid)] ) = \ - (xval) ) - - #define set__nproc_p( xthread, xval ) \ - ( ( (xthread)->th.th_team->t.t_set_nproc[((xthread)->th.th_info.ds.ds_tid)] ) = (xval) ) - - #define set__blocktime_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_set_blocktime[(xtid)] ) = (xval) ) - #define set__bt_intervals_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_set_bt_intervals[(xtid)] ) = (xval) ) - - #define set__bt_set_team( xteam, xtid, xval ) \ - ( ( (xteam)->t.t_set_bt_set[(xtid)] ) = (xval) ) - - #define get__nested_2(xteam,xtid) ( (xteam)->t.t_set_nested[(xtid)] ) - #define get__dynamic_2(xteam,xtid) ( (xteam)->t.t_set_dynamic[(xtid)] ) - #define get__nproc_2(xteam,xtid) ( (xteam)->t.t_set_nproc[(xtid)] ) - #define get__sched_2(xteam,xtid) ( (xteam)->t.t_set_sched[(xtid)] ) - - -#endif - -#if OMP_30_ENABLED /* ------------------------------------------------------------------------ */ // OpenMP tasking data structures // @@ -1931,7 +1902,7 @@ typedef struct kmp_dephash_entry kmp_dephash_entry_t; typedef struct kmp_depend_info { kmp_intptr_t base_addr; - size_t len; + size_t len; struct { bool in:1; bool out:1; @@ -1947,13 +1918,13 @@ typedef struct kmp_base_depnode { kmp_depnode_list_t * successors; kmp_task_t * task; - kmp_lock_t lock; + kmp_lock_t lock; #if KMP_SUPPORT_GRAPH_OUTPUT kmp_uint32 id; #endif - volatile kmp_int32 npredecessors; + volatile kmp_int32 npredecessors; volatile kmp_int32 nrefs; } kmp_base_depnode_t; @@ -1965,8 +1936,8 @@ union KMP_ALIGN_CACHE kmp_depnode { struct kmp_dephash_entry { kmp_intptr_t addr; - kmp_depnode_t * last_out; - kmp_depnode_list_t * last_ins; + kmp_depnode_t * last_out; + kmp_depnode_list_t * last_ins; kmp_dephash_entry_t * next_in_bucket; }; @@ -2039,7 +2010,7 @@ struct kmp_taskdata { /* aligned during dynamic ident_t * td_taskwait_ident; kmp_uint32 td_taskwait_counter; kmp_int32 td_taskwait_thread; /* gtid + 1 of thread encountered taskwait */ - kmp_internal_control_t td_icvs; /* Internal control variables for the task */ + KMP_ALIGN_CACHE kmp_internal_control_t td_icvs; /* Internal control variables for the task */ volatile kmp_uint32 td_allocated_child_tasks; /* Child tasks (+ current task) not yet deallocated */ volatile kmp_uint32 td_incomplete_child_tasks; /* Child tasks not yet complete */ #if OMP_40_ENABLED @@ -2060,7 +2031,7 @@ KMP_BUILD_ASSERT( sizeof(kmp_taskdata_t) % sizeof(void *) == 0 ); // Data for task team but per thread typedef struct kmp_base_thread_data { kmp_info_p * td_thr; // Pointer back to thread info - // Used only in __kmp_execute_tasks, maybe not avail until task is queued? + // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued? kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque kmp_taskdata_t ** td_deque; // Deque of tasks encountered by td_thr, dynamically allocated kmp_uint32 td_deque_head; // Head of deque (will wrap) @@ -2099,6 +2070,10 @@ typedef struct kmp_base_task_team { volatile kmp_uint32 tt_active; /* is the team still actively executing tasks */ KMP_ALIGN_CACHE +#if KMP_USE_INTERNODE_ALIGNMENT + kmp_int32 tt_padme[INTERNODE_CACHE_LINE/sizeof(kmp_int32)]; +#endif + volatile kmp_uint32 tt_ref_ct; /* #threads accessing struct */ /* (not incl. 
master) */ kmp_int32 tt_state; /* alternating 0/1 for task team identification */ @@ -2111,8 +2086,6 @@ union KMP_ALIGN_CACHE kmp_task_team { char tt_pad[ KMP_PAD(kmp_base_task_team_t, CACHE_LINE) ]; }; -#endif // OMP_30_ENABLED - #if ( USE_FAST_MEMORY == 3 ) || ( USE_FAST_MEMORY == 5 ) // Free lists keep same-size free memory slots for fast memory allocation routines typedef struct kmp_free_list { @@ -2121,6 +2094,20 @@ typedef struct kmp_free_list { void *th_free_list_other; // Non-self free list (to be returned to owner's sync list) } kmp_free_list_t; #endif +#if KMP_NESTED_HOT_TEAMS +// Hot teams array keeps hot teams and their sizes for given thread. +// Hot teams are not put in teams pool, and they don't put threads in threads pool. +typedef struct kmp_hot_team_ptr { + kmp_team_p *hot_team; // pointer to hot_team of given nesting level + kmp_int32 hot_team_nth; // number of threads allocated for the hot_team +} kmp_hot_team_ptr_t; +#endif +#if OMP_40_ENABLED +typedef struct kmp_teams_size { + kmp_int32 nteams; // number of teams in a league + kmp_int32 nth; // number of threads in each team of the league +} kmp_teams_size_t; +#endif /* ------------------------------------------------------------------------ */ // OpenMP thread data structures @@ -2146,7 +2133,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { kmp_info_p *th_team_master; /* the team's master thread */ int th_team_serialized; /* team is serialized */ #if OMP_40_ENABLED - microtask_t th_team_microtask; /* save entry address for teams construct */ + microtask_t th_teams_microtask; /* save entry address for teams construct */ int th_teams_level; /* save initial level of teams construct */ /* it is 0 on device but may be any on host */ #endif @@ -2158,21 +2145,21 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { int th_team_bt_intervals; int th_team_bt_set; - kmp_internal_control_t th_fixed_icvs; /* Initial ICVs for the thread */ - #if KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *th_affin_mask; /* thread's current affinity mask */ #endif - /* * The data set by the master at reinit, then R/W by the worker */ KMP_ALIGN_CACHE int th_set_nproc; /* if > 0, then only use this request for the next fork */ +#if KMP_NESTED_HOT_TEAMS + kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */ +#endif #if OMP_40_ENABLED - int th_set_nth_teams; /* number of threads in parallel nested in teams construct */ kmp_proc_bind_t th_set_proc_bind; /* if != proc_bind_default, use request for next fork */ + kmp_teams_size_t th_teams_size; /* number of teams/threads in teams construct */ # if KMP_AFFINITY_SUPPORTED int th_current_place; /* place currently bound to */ int th_new_place; /* place to bind to in par reg */ @@ -2182,6 +2169,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { #endif #if USE_ITT_BUILD kmp_uint64 th_bar_arrive_time; /* arrival to barrier timestamp */ + kmp_uint64 th_bar_min_time; /* minimum arrival time at the barrier */ kmp_uint64 th_frame_time; /* frame timestamp */ kmp_uint64 th_frame_time_serialized; /* frame timestamp in serialized parallel */ #endif /* USE_ITT_BUILD */ @@ -2200,24 +2188,18 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { volatile kmp_uint32 th_spin_here; /* thread-local location for spinning */ /* while awaiting queuing lock acquire */ - volatile kmp_uint32 *th_sleep_loc; + volatile void *th_sleep_loc; // this points at a kmp_flag<T> -/* - * Two variables used for consistency check - struct cons_header *th_cons and inte th_first moved below - * from here in order to avoid performance regression -*/ 
ident_t *th_ident; unsigned th_x; // Random number generator data unsigned th_a; // Random number generator data -#if OMP_30_ENABLED /* * Tasking-related data for the thread */ kmp_task_team_t * th_task_team; // Task team struct kmp_taskdata_t * th_current_task; // Innermost Task being executed kmp_uint8 th_task_state; // alternating 0/1 for task team identification -#endif // OMP_30_ENABLED /* * More stuff for keeping track of active/sleeping threads @@ -2229,8 +2211,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { // 32 bits for TCR/TCW - struct cons_header * th_cons; - int th_first; + struct cons_header * th_cons; // used for consistency check /* * Add the syncronizing data which is cache aligned and padded. @@ -2259,6 +2240,9 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info { kmp_itt_mark_t th_itt_mark_single; // alignment ??? #endif /* USE_ITT_BUILD */ +#if KMP_STATS_ENABLED + kmp_stats_list* th_stats; +#endif } kmp_base_info_t; typedef union KMP_ALIGN_CACHE kmp_info { @@ -2291,154 +2275,89 @@ typedef int (*launch_t)( int gtid ); /* Minimum number of ARGV entries to malloc if necessary */ #define KMP_MIN_MALLOC_ARGV_ENTRIES 100 -#if KMP_MIC && OMP_30_ENABLED -# define KMP_BARRIER_ICV_PULL 1 +// Set up how many argv pointers will fit in cache lines containing t_inline_argv. Historically, we +// have supported at least 96 bytes. Using a larger value for more space between the master write/worker +// read section and read/write by all section seems to buy more performance on EPCC PARALLEL. +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +# define KMP_INLINE_ARGV_BYTES ( 4 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + sizeof(kmp_int16) + sizeof(kmp_uint32) ) % CACHE_LINE ) ) #else -# define KMP_BARRIER_ICV_PULL 0 -#endif - -#if (KMP_PERF_V106 == KMP_ON) -// -// Set up how many argv pointers will fit in cache lines containing -// *t_inline_argv. Historically, we have supported at least 96 bytes. -// -// Using a larger value for more space between the master write/worker read -// section and read/write by all section seems to buy more performance -// on EPCC PARALLEL. 
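Worked example for the new KMP_INLINE_ARGV_BYTES formula above, assuming a 64-bit x86 build where CACHE_LINE = 64, KMP_PTR_SKIP = 8 and sizeof(int) = 4:

    preceding members (t_argv .. t_mxcsr): 3*8 + 2*4 + 2*1 + 2 + 4 = 40 bytes
    KMP_INLINE_ARGV_BYTES   = 4*64 - (40 % 64) = 256 - 40 = 216 bytes
    KMP_INLINE_ARGV_ENTRIES = 216 / 8          = 27 inline argv slots

so the members from t_argv through t_mxcsr plus t_inline_argv together fill exactly four cache lines, preserving the "at least 96 bytes" guarantee mentioned in the comment.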
-// -//# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE ) -# if KMP_BARRIER_ICV_PULL -# define KMP_INLINE_ARGV_BYTES 192 -//# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE - ( ( 5 * KMP_PTR_SKIP + 10 * sizeof(int) + sizeof(kmp_int64) ) % CACHE_LINE ) ) -# elif KMP_ARCH_X86 || KMP_ARCH_X86_64 -# define KMP_INLINE_ARGV_BYTES ( 4 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) + 2 * sizeof(kmp_int8) + sizeof(kmp_int16) + sizeof(kmp_uint32) ) % CACHE_LINE ) ) -# else -# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) ) % CACHE_LINE ) ) -# endif -# define KMP_INLINE_ARGV_ENTRIES (int)( KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP ) +# define KMP_INLINE_ARGV_BYTES ( 2 * CACHE_LINE - ( ( 3 * KMP_PTR_SKIP + 2 * sizeof(int) ) % CACHE_LINE ) ) #endif +#define KMP_INLINE_ARGV_ENTRIES (int)( KMP_INLINE_ARGV_BYTES / KMP_PTR_SKIP ) typedef struct KMP_ALIGN_CACHE kmp_base_team { -/* - * Synchronization Data - */ - KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; + // Synchronization Data --------------------------------------------------------------------------------- + KMP_ALIGN_CACHE kmp_ordered_team_t t_ordered; kmp_balign_team_t t_bar[ bs_last_barrier ]; - - /* count of single directive encountered by team */ - volatile int t_construct; - kmp_lock_t t_single_lock; /* team specific lock */ - -/* - * Master only - */ - KMP_ALIGN_CACHE int t_master_tid; /* tid of master in parent team */ - int t_master_this_cons; /* "this_construct" single counter of master in parent team */ - int t_master_last_cons; /* "last_construct" single counter of master in parent team */ - ident_t *t_ident; /* if volatile, have to change too much other crud to volatile too */ - kmp_team_p *t_parent; /* parent team */ - kmp_team_p *t_next_pool; /* next free team in the team pool */ - kmp_disp_t *t_dispatch; /* thread's dispatch data */ -#if OMP_30_ENABLED - kmp_task_team_t *t_task_team; /* Task team struct */ -#endif /* OMP_30_ENABLED */ + volatile int t_construct; // count of single directive encountered by team + kmp_lock_t t_single_lock; // team specific lock + + // Master only ----------------------------------------------------------------------------------------- + KMP_ALIGN_CACHE int t_master_tid; // tid of master in parent team + int t_master_this_cons; // "this_construct" single counter of master in parent team + ident_t *t_ident; // if volatile, have to change too much other crud to volatile too + kmp_team_p *t_parent; // parent team + kmp_team_p *t_next_pool; // next free team in the team pool + kmp_disp_t *t_dispatch; // thread's dispatch data + kmp_task_team_t *t_task_team; // Task team struct #if OMP_40_ENABLED - kmp_proc_bind_t t_proc_bind; /* bind type for par region */ + kmp_proc_bind_t t_proc_bind; // bind type for par region #endif // OMP_40_ENABLED +#if USE_ITT_BUILD + kmp_uint64 t_region_time; // region begin timestamp +#endif /* USE_ITT_BUILD */ -/* - * Master write, workers read - */ - KMP_ALIGN_CACHE - void **t_argv; + // Master write, workers read -------------------------------------------------------------------------- + KMP_ALIGN_CACHE void **t_argv; int t_argc; -#if (KMP_PERF_V106 == KMP_ON) - /* swap cache lines for t_nproc and t_max_argc */ - int t_nproc; /* number of threads in team */ -#else - int t_max_argc; -#endif + int t_nproc; // number of threads in team microtask_t t_pkfn; - launch_t t_invoke; /* procedure to launch the microtask */ - + launch_t t_invoke; // procedure to launch the microtask #if KMP_ARCH_X86 || KMP_ARCH_X86_64 kmp_int8 t_fp_control_saved; 
kmp_int8 t_pad2b; - kmp_int16 t_x87_fpu_control_word; /* FP control regs */ + kmp_int16 t_x87_fpu_control_word; // FP control regs kmp_uint32 t_mxcsr; #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if (KMP_PERF_V106 == KMP_ON) void *t_inline_argv[ KMP_INLINE_ARGV_ENTRIES ]; -#endif -#if (KMP_PERF_V19 == KMP_ON) - KMP_ALIGN_CACHE -#endif - kmp_info_t **t_threads; -#if (KMP_PERF_V106 == KMP_ON) - /* swap cache lines for t_nproc and t_max_argc */ + KMP_ALIGN_CACHE kmp_info_t **t_threads; int t_max_argc; -#else - int t_nproc; /* number of threads in team */ -#endif - int t_max_nproc; /* maximum threads this team can handle (this is dynamicly expandable) */ - int t_serialized; /* levels deep of serialized teams */ - dispatch_shared_info_t *t_disp_buffer; /* buffers for dispatch system */ + int t_max_nproc; // maximum threads this team can handle (dynamicly expandable) + int t_serialized; // levels deep of serialized teams + dispatch_shared_info_t *t_disp_buffer; // buffers for dispatch system int t_id; // team's id, assigned by debugger. -#if OMP_30_ENABLED - int t_level; /* nested parallel level */ - int t_active_level; /* nested active parallel level */ - kmp_r_sched_t t_sched; /* run-time schedule for the team */ -#endif // OMP_30_ENABLED + int t_level; // nested parallel level + int t_active_level; // nested active parallel level + kmp_r_sched_t t_sched; // run-time schedule for the team #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - int t_first_place; /* first & last place in */ - int t_last_place; /* parent thread's partition. */ - /* Restore these values to */ - /* master after par region. */ + int t_first_place; // first & last place in parent thread's partition. + int t_last_place; // Restore these values to master after par region. #endif // OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED #if KMP_MIC - int t_size_changed; /* team size was changed?: 0 - no, 1 - yes, -1 - changed via omp_set_num_threads() call */ + int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call #endif -/* - * Read/write by workers as well - */ + // Read/write by workers as well ----------------------------------------------------------------------- #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - // Using CACHE_LINE=64 reduces memory footprint, - // but causes a big perf regression of epcc 'parallel' and 'barrier' on fxe256lin01. - // This extra padding serves to fix the performance of epcc 'parallel' and 'barrier' when CACHE_LINE=64. - // TODO: investigate more and get rid if this padding. + // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel' + // and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel' + // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid if this padding. 
char dummy_padding[1024]; #endif - KMP_ALIGN_CACHE -#if OMP_30_ENABLED - kmp_taskdata_t *t_implicit_task_taskdata; // Taskdata for the thread's implicit task -#else - // Internal control variables for current thread team - // TODO Convert these fields to an array of kmp_internal_control_t which simplifies parameter passing - // and also prevents performance degradation due to false sharing when all threads set a control var - int *t_set_nproc; /* internal control for # of threads for next - parallel region (per thread) */ - int *t_set_nested; /* internal control for nested parallelism (per thread) */ - int *t_set_dynamic; /* internal control for dynamic adjustment of threads (per thread) */ - int *t_set_blocktime; /* internal control for blocktime */ - int *t_set_bt_intervals; /* internal control for blocktime intervals */ - int *t_set_bt_set; /* internal control for whether blocktime is explicitly set */ -#endif // OMP_30_ENABLED - - kmp_internal_control_t *t_control_stack_top; /* internal control stack for additional nested teams. - for SERIALIZED teams nested 2 or more levels deep */ + KMP_ALIGN_CACHE kmp_taskdata_t *t_implicit_task_taskdata; // Taskdata for the thread's implicit task + kmp_internal_control_t *t_control_stack_top; // internal control stack for additional nested teams. + // for SERIALIZED teams nested 2 or more levels deep #if OMP_40_ENABLED - kmp_int32 t_cancel_request; /* typed flag to store request state of cancellation */ + kmp_int32 t_cancel_request; // typed flag to store request state of cancellation #endif - - int t_master_active;/* save on fork, restore on join */ - kmp_taskq_t t_taskq; /* this team's task queue */ - void *t_copypriv_data; /* team specific pointer to copyprivate data array */ + int t_master_active; // save on fork, restore on join + kmp_taskq_t t_taskq; // this team's task queue + void *t_copypriv_data; // team specific pointer to copyprivate data array kmp_uint32 t_copyin_counter; #if USE_ITT_BUILD - void *t_stack_id; /* team specific stack stitching id (for ittnotify) */ + void *t_stack_id; // team specific stack stitching id (for ittnotify) #endif /* USE_ITT_BUILD */ } kmp_base_team_t; @@ -2543,6 +2462,7 @@ extern int __kmp_debug_count; /* Counter for number of lines printe extern int __kmp_debug_buf_warn_chars; /* Keep track of char increase recommended in warnings */ /* end rotating debug buffer */ +#ifdef KMP_DEBUG extern int __kmp_par_range; /* +1 => only go par for constructs in range */ #define KMP_PAR_RANGE_ROUTINE_LEN 1024 @@ -2551,6 +2471,7 @@ extern char __kmp_par_range_routine[KMP_PAR_RANGE_ROUTINE_LEN]; extern char __kmp_par_range_filename[KMP_PAR_RANGE_FILENAME_LEN]; extern int __kmp_par_range_lb; extern int __kmp_par_range_ub; +#endif /* For printing out dynamic storage map for threads and teams */ extern int __kmp_storage_map; /* True means print storage map for threads and teams */ @@ -2607,14 +2528,13 @@ extern enum library_type __kmp_library; extern enum sched_type __kmp_sched; /* default runtime scheduling */ extern enum sched_type __kmp_static; /* default static scheduling method */ extern enum sched_type __kmp_guided; /* default guided scheduling method */ -#if OMP_30_ENABLED extern enum sched_type __kmp_auto; /* default auto scheduling method */ -#endif // OMP_30_ENABLED extern int __kmp_chunk; /* default runtime chunk size */ extern size_t __kmp_stksize; /* stack size per thread */ extern size_t __kmp_monitor_stksize;/* stack size for monitor thread */ extern size_t __kmp_stkoffset; /* stack offset per thread */ +extern 
int __kmp_stkpadding; /* Should we pad root thread(s) stack */ extern size_t __kmp_malloc_pool_incr; /* incremental size of pool for kmp_malloc() */ extern int __kmp_env_chunk; /* was KMP_CHUNK specified? */ @@ -2629,7 +2549,7 @@ extern int __kmp_generate_warnings; /* should we issue warnings? */ extern int __kmp_reserve_warn; /* have we issued reserve_threads warning? */ #ifdef DEBUG_SUSPEND -extern int __kmp_suspend_count; /* count inside __kmp_suspend() */ +extern int __kmp_suspend_count; /* count inside __kmp_suspend_template() */ #endif extern kmp_uint32 __kmp_yield_init; @@ -2693,9 +2613,11 @@ extern kmp_int16 __kmp_init_x87_fpu_control_word; /* init thread's FP control r extern kmp_uint32 __kmp_init_mxcsr; /* init thread's mxscr */ #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ -#if OMP_30_ENABLED extern int __kmp_dflt_max_active_levels; /* max_active_levels for nested parallelism enabled by default a la OMP_MAX_ACTIVE_LEVELS */ -#endif // OMP_30_ENABLED +#if KMP_NESTED_HOT_TEAMS +extern int __kmp_hot_teams_mode; +extern int __kmp_hot_teams_max_level; +#endif # if KMP_OS_LINUX extern enum clock_function_type __kmp_clock_function; @@ -2833,8 +2755,6 @@ static inline kmp_info_t * __kmp_entry_thread() return __kmp_threads[gtid]; } -#if OMP_30_ENABLED - extern void __kmp_set_max_active_levels( int gtid, int new_max_active_levels ); extern int __kmp_get_max_active_levels( int gtid ); extern int __kmp_get_ancestor_thread_num( int gtid, int level ); @@ -2842,8 +2762,6 @@ extern int __kmp_get_team_size( int gtid, int level ); extern void __kmp_set_schedule( int gtid, kmp_sched_t new_sched, int chunk ); extern void __kmp_get_schedule( int gtid, kmp_sched_t * sched, int * chunk ); -#endif // OMP_30_ENABLED - extern unsigned short __kmp_get_random( kmp_info_t * thread ); extern void __kmp_init_random( kmp_info_t * thread ); @@ -2888,8 +2806,6 @@ extern void __kmp_push_num_teams( ident_t *loc, int gtid, int num_teams, int num #endif extern void __kmp_yield( int cond ); -extern void __kmp_release( kmp_info_t *target_thr, volatile kmp_uint *spin, - enum kmp_mem_fence_type fetchadd_fence ); extern void __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, @@ -2956,11 +2872,28 @@ extern kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker ); extern kmp_uint32 __kmp_wait_yield_4( kmp_uint32 volatile * spinner, kmp_uint32 checker, kmp_uint32 (*pred) (kmp_uint32, kmp_uint32), void * obj ); extern kmp_uint64 __kmp_wait_yield_8( kmp_uint64 volatile * spinner, kmp_uint64 checker, kmp_uint32 (*pred) (kmp_uint64, kmp_uint64), void * obj ); -extern void __kmp_wait_sleep( kmp_info_t *this_thr, volatile kmp_uint *spinner, kmp_uint checker, kmp_int final_spin +class kmp_flag_32; +class kmp_flag_64; +class kmp_flag_oncore; +extern void __kmp_wait_32(kmp_info_t *this_thr, kmp_flag_32 *flag, int final_spin #if USE_ITT_BUILD - , void * itt_sync_obj -#endif /* USE_ITT_BUILD */ -); + , void * itt_sync_obj +#endif + ); +extern void __kmp_release_32(kmp_flag_32 *flag); +extern void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin +#if USE_ITT_BUILD + , void * itt_sync_obj +#endif + ); +extern void __kmp_release_64(kmp_flag_64 *flag); +extern void __kmp_wait_oncore(kmp_info_t *this_thr, kmp_flag_oncore *flag, int final_spin +#if USE_ITT_BUILD + , void * itt_sync_obj +#endif + ); +extern void __kmp_release_oncore(kmp_flag_oncore *flag); + extern void __kmp_infinite_loop( void ); extern void __kmp_cleanup( void ); @@ -3003,9 
+2936,10 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask); extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask); extern void __kmp_balanced_affinity( int tid, int team_size ); - #endif /* KMP_AFFINITY_SUPPORTED */ +extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar); + #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM) extern int __kmp_futex_determine_capable( void ); @@ -3035,8 +2969,12 @@ extern void __kmp_reap_monitor( kmp_info_t *th ); extern void __kmp_reap_worker( kmp_info_t *th ); extern void __kmp_terminate_thread( int gtid ); -extern void __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ); -extern void __kmp_resume( int target_gtid, volatile kmp_uint *spinner ); +extern void __kmp_suspend_32( int th_gtid, kmp_flag_32 *flag ); +extern void __kmp_suspend_64( int th_gtid, kmp_flag_64 *flag ); +extern void __kmp_suspend_oncore( int th_gtid, kmp_flag_oncore *flag ); +extern void __kmp_resume_32( int target_gtid, kmp_flag_32 *flag ); +extern void __kmp_resume_64( int target_gtid, kmp_flag_64 *flag ); +extern void __kmp_resume_oncore( int target_gtid, kmp_flag_oncore *flag ); extern void __kmp_elapsed( double * ); extern void __kmp_elapsed_tick( double * ); @@ -3062,19 +3000,14 @@ extern kmp_info_t * __kmp_allocate_thread( kmp_root_t *root, extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, kmp_proc_bind_t proc_bind, kmp_internal_control_t *new_icvs, - int argc ); -#elif OMP_30_ENABLED -extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, - kmp_internal_control_t *new_icvs, - int argc ); + int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) ); #else extern kmp_team_t * __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set, - int argc ); -#endif // OMP_30_ENABLED + kmp_internal_control_t *new_icvs, + int argc USE_NESTED_HOT_ARG(kmp_info_t *thr) ); +#endif // OMP_40_ENABLED extern void __kmp_free_thread( kmp_info_t * ); -extern void __kmp_free_team( kmp_root_t *, kmp_team_t * ); +extern void __kmp_free_team( kmp_root_t *, kmp_team_t * USE_NESTED_HOT_ARG(kmp_info_t *) ); extern kmp_team_t * __kmp_reap_team( kmp_team_t * ); /* ------------------------------------------------------------------------ */ @@ -3094,7 +3027,16 @@ extern int __kmp_barrier( enum barrier_type bt, int gtid, int is_split, size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) ); extern void __kmp_end_split_barrier ( enum barrier_type bt, int gtid ); -extern int __kmp_fork_call( ident_t *loc, int gtid, int exec_master, +/*! + * Tell the fork call which compiler generated the fork call, and therefore how to deal with the call. + */ +enum fork_context_e +{ + fork_context_gnu, /**< Called from GNU generated code, so must not invoke the microtask internally. */ + fork_context_intel, /**< Called from Intel generated code. 
*/ + fork_context_last +}; +extern int __kmp_fork_call( ident_t *loc, int gtid, enum fork_context_e fork_context, kmp_int32 argc, microtask_t microtask, launch_t invoker, /* TODO: revert workaround for Intel(R) 64 tracker #96 */ #if (KMP_ARCH_ARM || KMP_ARCH_X86_64) && KMP_OS_LINUX @@ -3110,6 +3052,7 @@ extern void __kmp_join_call( ident_t *loc, int gtid #endif ); +extern void __kmp_serialized_parallel(ident_t *id, kmp_int32 gtid); extern void __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team ); extern void __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team ); extern int __kmp_invoke_task_func( int gtid ); @@ -3120,7 +3063,7 @@ extern void __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_th KMP_EXPORT int __kmpc_invoke_task_func( int gtid ); #if OMP_40_ENABLED extern int __kmp_invoke_teams_master( int gtid ); -extern void __kmp_teams_master( microtask_t microtask, int gtid ); +extern void __kmp_teams_master( int gtid ); #endif extern void __kmp_save_internal_controls( kmp_info_t * thread ); extern void __kmp_user_set_library (enum library_type arg); @@ -3135,7 +3078,6 @@ void ompc_set_nested( int flag ); void ompc_set_dynamic( int flag ); void ompc_set_num_threads( int arg ); -#if OMP_30_ENABLED extern void __kmp_push_current_task_to_thread( kmp_info_t *this_thr, kmp_team_t *team, int tid ); extern void __kmp_pop_current_task_from_thread( kmp_info_t *this_thr ); @@ -3145,12 +3087,25 @@ extern kmp_task_t* __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, extern void __kmp_init_implicit_task( ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task ); -extern int __kmp_execute_tasks( kmp_info_t *thread, kmp_int32 gtid, volatile kmp_uint *spinner, - kmp_uint checker, int final_spin, int *thread_finished, +int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + int *thread_finished, #if USE_ITT_BUILD - void * itt_sync_obj, + void * itt_sync_obj, #endif /* USE_ITT_BUILD */ - int c ); + kmp_int32 is_constrained); +int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void * itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); +int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, + int *thread_finished, +#if USE_ITT_BUILD + void * itt_sync_obj, +#endif /* USE_ITT_BUILD */ + kmp_int32 is_constrained); + extern void __kmp_reap_task_teams( void ); extern void __kmp_unref_task_team( kmp_task_team_t *task_team, kmp_info_t *thread ); extern void __kmp_wait_to_unref_task_teams( void ); @@ -3163,8 +3118,6 @@ extern void __kmp_task_team_wait ( kmp_info_t *this_thr, kmp_team_t *team ); extern void __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid ); -#endif // OMP_30_ENABLED - extern int __kmp_is_address_mapped( void *addr ); extern kmp_uint64 __kmp_hardware_timestamp(void); @@ -3259,7 +3212,6 @@ KMP_EXPORT kmpc_thunk_t * __kmpc_task_buffer (ident_t *loc, kmp_int32 global_tid /* ------------------------------------------------------------------------ */ -#if OMP_30_ENABLED /* * OMP 3.0 tasking interface routines */ @@ -3288,9 +3240,9 @@ void __kmpc_omp_task_complete( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *tas #endif // TASK_UNUSED /* ------------------------------------------------------------------------ */ -#endif // OMP_30_ENABLED #if OMP_40_ENABLED + KMP_EXPORT void __kmpc_taskgroup( ident_t * loc, int gtid 
); KMP_EXPORT void __kmpc_end_taskgroup( ident_t * loc, int gtid ); @@ -3301,13 +3253,13 @@ KMP_EXPORT void __kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ); extern void __kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task ); -#endif +extern kmp_int32 __kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate ); -#if OMP_40_ENABLED KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind); KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t* loc_ref, kmp_int32 gtid, kmp_int32 cncl_kind); KMP_EXPORT kmp_int32 __kmpc_cancel_barrier(ident_t* loc_ref, kmp_int32 gtid); KMP_EXPORT int __kmp_get_cancellation_status(int cancel_kind); + #endif /* @@ -3404,8 +3356,6 @@ kmp_threadprivate_insert( int gtid, void *pc_addr, void *data_addr, size_t pc_si # define KMPC_CONVENTION #endif -#if OMP_30_ENABLED - #ifndef __OMP_H typedef enum omp_sched_t { omp_sched_static = 1, @@ -3424,8 +3374,6 @@ KMP_EXPORT int KMPC_CONVENTION kmpc_set_affinity_mask_proc(int, kmp_affinity_ma KMP_EXPORT int KMPC_CONVENTION kmpc_unset_affinity_mask_proc(int, kmp_affinity_mask_t *); KMP_EXPORT int KMPC_CONVENTION kmpc_get_affinity_mask_proc(int, kmp_affinity_mask_t *); -#endif // OMP_30_ENABLED - KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize(int); KMP_EXPORT void KMPC_CONVENTION kmpc_set_stacksize_s(size_t); KMP_EXPORT void KMPC_CONVENTION kmpc_set_library(int); diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 082ec9d6c73..d6821e0440f 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -1,7 +1,7 @@ /* * kmp_affinity.cpp -- affinity management - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -19,7 +19,7 @@ #include "kmp_i18n.h" #include "kmp_io.h" #include "kmp_str.h" - +#include "kmp_wrapper_getpid.h" #if KMP_AFFINITY_SUPPORTED @@ -49,7 +49,7 @@ __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) return buf; } - sprintf(scan, "{%ld", i); + sprintf(scan, "{%ld", (long)i); while (*scan != '\0') scan++; i++; for (; i < KMP_CPU_SETSIZE; i++) { @@ -66,7 +66,7 @@ __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) if (end - scan < 15) { break; } - sprintf(scan, ",%-ld", i); + sprintf(scan, ",%-ld", (long)i); while (*scan != '\0') scan++; } if (i < KMP_CPU_SETSIZE) { @@ -89,7 +89,6 @@ __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) if (__kmp_num_proc_groups > 1) { int group; - struct GROUP_AFFINITY ga; KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); for (group = 0; group < __kmp_num_proc_groups; group++) { int i; @@ -315,6 +314,106 @@ __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) return 0; } +/** A structure for holding machine-specific hierarchy info to be computed once at init. */ +class hierarchy_info { +public: + /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine, + etc. We don't want to get specific with nomenclature */ + static const kmp_uint32 maxLevels=7; + + /** This is specifically the depth of the machine configuration hierarchy, in terms of the + number of levels along the longest path from root to any leaf. It corresponds to the + number of entries in numPerLevel if we exclude all but one trailing 1. 
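+       For example, the machine described below (numPerLevel = {2, 4, 4, 1, 1}) has depth 4: its three non-1 entries plus a single trailing 1.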
*/ + kmp_uint32 depth; + kmp_uint32 base_depth; + kmp_uint32 base_num_threads; + bool uninitialized; + + /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a + node at level i has. For example, if we have a machine with 4 packages, 4 cores/package + and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */ + kmp_uint32 numPerLevel[maxLevels]; + kmp_uint32 skipPerLevel[maxLevels]; + + void deriveLevels(AddrUnsPair *adr2os, int num_addrs) { + int hier_depth = adr2os[0].first.depth; + int level = 0; + for (int i=hier_depth-1; i>=0; --i) { + int max = -1; + for (int j=0; j<num_addrs; ++j) { + int next = adr2os[j].first.childNums[i]; + if (next > max) max = next; + } + numPerLevel[level] = max+1; + ++level; + } + } + + hierarchy_info() : depth(1), uninitialized(true) {} + void init(AddrUnsPair *adr2os, int num_addrs) + { + uninitialized = false; + for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level + numPerLevel[i] = 1; + skipPerLevel[i] = 1; + } + + // Sort table by physical ID + if (adr2os) { + qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels); + deriveLevels(adr2os, num_addrs); + } + else { + numPerLevel[0] = 4; + numPerLevel[1] = num_addrs/4; + if (num_addrs%4) numPerLevel[1]++; + } + + base_num_threads = num_addrs; + for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth + if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1' + depth++; + + kmp_uint32 branch = 4; + if (numPerLevel[0] == 1) branch = num_addrs/4; + if (branch<4) branch=4; + for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width + while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0! + if (numPerLevel[d] & 1) numPerLevel[d]++; + numPerLevel[d] = numPerLevel[d] >> 1; + if (numPerLevel[d+1] == 1) depth++; + numPerLevel[d+1] = numPerLevel[d+1] << 1; + } + if(numPerLevel[0] == 1) { + branch = branch >> 1; + if (branch<4) branch = 4; + } + } + + for (kmp_uint32 i=1; i<depth; ++i) + skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1]; + + base_depth = depth; + } +}; + +static hierarchy_info machine_hierarchy; + +void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { + if (machine_hierarchy.uninitialized) + machine_hierarchy.init(NULL, nproc); + + if (nproc <= machine_hierarchy.base_num_threads) + machine_hierarchy.depth = machine_hierarchy.base_depth; + KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0); + while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) { + machine_hierarchy.depth++; + machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2]; + } + thr_bar->depth = machine_hierarchy.depth; + thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1; + thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; +} // // When sorting by labels, __kmp_affinity_assign_child_nums() must first be @@ -1963,7 +2062,7 @@ __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, // A newline has signalled the end of the processor record. // Check that there aren't too many procs specified. // - if (num_avail == __kmp_xproc) { + if ((int)num_avail == __kmp_xproc) { CLEANUP_THREAD_INFO; *msg_id = kmp_i18n_str_TooManyEntries; return -1; @@ -2587,7 +2686,7 @@ static int nextNewMask; #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ { \ if (((_osId) > _maxOsId) || \ - (! 
KMP_CPU_ISSET((_osId), KMP_CPU_INDEX(_osId2Mask, (_osId))))) {\ + (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ && (__kmp_affinity_type != affinity_none))) { \ KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ @@ -3045,14 +3144,15 @@ __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, (*setSize)++; } *scan = next; // skip num - } + } else { KMP_ASSERT2(0, "bad explicit places list"); } } -static void +//static void +void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, unsigned int *out_numMasks, const char *placelist, kmp_affin_mask_t *osId2Mask, int maxOsId) @@ -3109,71 +3209,41 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, // valid follow sets are ',' ':' and EOL // SKIP_WS(scan); + int stride; if (*scan == '\0' || *scan == ',') { - int i; - for (i = 0; i < count; i++) { - int j; - if (setSize == 0) { - break; - } - ADD_MASK(tempMask); - setSize = 0; - for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j > 0; j--) { - // - // Use a temp var in case macro is changed to evaluate - // args multiple times. - // - if (KMP_CPU_ISSET(j - 1, tempMask)) { - KMP_CPU_SET(j, tempMask); - setSize++; - } - else { - KMP_CPU_CLR(j, tempMask); - } + stride = +1; + } + else { + KMP_ASSERT2(*scan == ':', "bad explicit places list"); + scan++; // skip ':' + + // + // Read stride parameter + // + int sign = +1; + for (;;) { + SKIP_WS(scan); + if (*scan == '+') { + scan++; // skip '+' + continue; } - for (; j >= 0; j--) { - KMP_CPU_CLR(j, tempMask); + if (*scan == '-') { + sign *= -1; + scan++; // skip '-' + continue; } - } - KMP_CPU_ZERO(tempMask); - setSize = 0; - - if (*scan == '\0') { break; } - scan++; // skip ',' - continue; - } - - KMP_ASSERT2(*scan == ':', "bad explicit places list"); - scan++; // skip ':' - - // - // Read stride parameter - // - int sign = +1; - for (;;) { SKIP_WS(scan); - if (*scan == '+') { - scan++; // skip '+' - continue; - } - if (*scan == '-') { - sign *= -1; - scan++; // skip '-' - continue; - } - break; + KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), + "bad explicit places list"); + next = scan; + SKIP_DIGITS(next); + stride = __kmp_str_to_int(scan, *next); + KMP_DEBUG_ASSERT(stride >= 0); + scan = next; + stride *= sign; } - SKIP_WS(scan); - KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), - "bad explicit places list"); - next = scan; - SKIP_DIGITS(next); - int stride = __kmp_str_to_int(scan, *next); - KMP_DEBUG_ASSERT(stride >= 0); - scan = next; - stride *= sign; if (stride > 0) { int i; @@ -3185,13 +3255,21 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, ADD_MASK(tempMask); setSize = 0; for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { - if (KMP_CPU_ISSET(j - stride, tempMask)) { - KMP_CPU_SET(j, tempMask); - setSize++; + if (! KMP_CPU_ISSET(j - stride, tempMask)) { + KMP_CPU_CLR(j, tempMask); } - else { + else if ((j > maxOsId) || + (! 
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { + if (__kmp_affinity_verbose || (__kmp_affinity_warnings + && (__kmp_affinity_type != affinity_none))) { + KMP_WARNING(AffIgnoreInvalidProcID, j); + } KMP_CPU_CLR(j, tempMask); } + else { + KMP_CPU_SET(j, tempMask); + setSize++; + } } for (; j >= 0; j--) { KMP_CPU_CLR(j, tempMask); @@ -3201,23 +3279,31 @@ __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, else { int i; for (i = 0; i < count; i++) { - unsigned j; + int j; if (setSize == 0) { break; } ADD_MASK(tempMask); setSize = 0; - for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride; + for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; j++) { - if (KMP_CPU_ISSET(j - stride, tempMask)) { - KMP_CPU_SET(j, tempMask); - setSize++; + if (! KMP_CPU_ISSET(j - stride, tempMask)) { + KMP_CPU_CLR(j, tempMask); } - else { + else if ((j > maxOsId) || + (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { + if (__kmp_affinity_verbose || (__kmp_affinity_warnings + && (__kmp_affinity_type != affinity_none))) { + KMP_WARNING(AffIgnoreInvalidProcID, j); + } KMP_CPU_CLR(j, tempMask); } + else { + KMP_CPU_SET(j, tempMask); + setSize++; + } } - for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) { + for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { KMP_CPU_CLR(j, tempMask); } } @@ -3270,9 +3356,13 @@ __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) } __kmp_place_num_cores = nCoresPerPkg; // use all available cores } - if ( !__kmp_affinity_uniform_topology() || depth != 3 ) { - KMP_WARNING( AffThrPlaceUnsupported ); - return; // don't support non-uniform topology or not-3-level architecture + if ( !__kmp_affinity_uniform_topology() ) { + KMP_WARNING( AffThrPlaceNonUniform ); + return; // don't support non-uniform topology + } + if ( depth != 3 ) { + KMP_WARNING( AffThrPlaceNonThreeLevel ); + return; // don't support not-3-level topology } if ( __kmp_place_num_threads_per_core == 0 ) { __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts @@ -3400,18 +3490,14 @@ __kmp_aux_affinity_initialize(void) } if (depth < 0) { - if ((msg_id != kmp_i18n_null) - && (__kmp_affinity_verbose || (__kmp_affinity_warnings - && (__kmp_affinity_type != affinity_none)))) { -# if KMP_MIC - if (__kmp_affinity_verbose) { + if (__kmp_affinity_verbose) { + if (msg_id != kmp_i18n_null) { KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), KMP_I18N_STR(DecodingLegacyAPIC)); } -# else - KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), - KMP_I18N_STR(DecodingLegacyAPIC)); -# endif + else { + KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); + } } file_name = NULL; @@ -3428,19 +3514,13 @@ __kmp_aux_affinity_initialize(void) # if KMP_OS_LINUX if (depth < 0) { - if ((msg_id != kmp_i18n_null) - && (__kmp_affinity_verbose || (__kmp_affinity_warnings - && (__kmp_affinity_type != affinity_none)))) { -# if KMP_MIC - if (__kmp_affinity_verbose) { + if (__kmp_affinity_verbose) { + if (msg_id != kmp_i18n_null) { KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); } -# else - KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); -# endif - } - else if (__kmp_affinity_verbose) { - KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); + else { + KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); + } } FILE *f = fopen("/proc/cpuinfo", "r"); @@ -3461,20 +3541,32 @@ __kmp_aux_affinity_initialize(void) # endif /* 
KMP_OS_LINUX */ +# if KMP_OS_WINDOWS && KMP_ARCH_X86_64 + + if ((depth < 0) && (__kmp_num_proc_groups > 1)) { + if (__kmp_affinity_verbose) { + KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); + } + + depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); + KMP_ASSERT(depth != 0); + } + +# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ + if (depth < 0) { - if (msg_id != kmp_i18n_null - && (__kmp_affinity_verbose || (__kmp_affinity_warnings - && (__kmp_affinity_type != affinity_none)))) { + if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { if (file_name == NULL) { - KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id)); + KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); } else if (line == 0) { - KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); + KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); } else { - KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); + KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); } } + // FIXME - print msg if msg_id = kmp_i18n_null ??? file_name = ""; depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); @@ -3508,7 +3600,6 @@ __kmp_aux_affinity_initialize(void) KMP_ASSERT(address2os == NULL); return; } - if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); @@ -3526,7 +3617,6 @@ __kmp_aux_affinity_initialize(void) KMP_ASSERT(address2os == NULL); return; } - if (depth < 0) { KMP_ASSERT(msg_id != kmp_i18n_null); KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); @@ -3597,23 +3687,9 @@ __kmp_aux_affinity_initialize(void) depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); KMP_ASSERT(depth != 0); - if (depth < 0) { - if ((msg_id != kmp_i18n_null) - && (__kmp_affinity_verbose || (__kmp_affinity_warnings - && (__kmp_affinity_type != affinity_none)))) { - KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id)); - } - - depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); - if (depth == 0) { - KMP_ASSERT(__kmp_affinity_type == affinity_none); - KMP_ASSERT(address2os == NULL); - return; - } - // should not fail - KMP_ASSERT(depth > 0); - KMP_ASSERT(address2os != NULL); + KMP_ASSERT(msg_id != kmp_i18n_null); + KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); } } @@ -3658,7 +3734,7 @@ __kmp_aux_affinity_initialize(void) kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); if (__kmp_affinity_gran_levels == 0) { - KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc); + KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); } // @@ -3852,6 +3928,7 @@ __kmp_aux_affinity_initialize(void) } __kmp_free(osId2Mask); + machine_hierarchy.init(address2os, __kmp_avail_proc); } @@ -3953,7 +4030,7 @@ __kmp_affinity_set_init_mask(int gtid, int isa_root) } # endif KMP_ASSERT(fullMask != NULL); - i = -1; + i = KMP_PLACE_ALL; mask = fullMask; } else { @@ -4020,7 +4097,8 @@ __kmp_affinity_set_init_mask(int gtid, int isa_root) char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, th->th.th_affin_mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf); + KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid, + buf); } # if KMP_OS_WINDOWS @@ -4058,14 +4136,14 @@ __kmp_affinity_set_place(int gtid) // Check that the new place is within this thread's partition. 
// KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); - KMP_DEBUG_ASSERT(th->th.th_new_place >= 0); - KMP_DEBUG_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); + KMP_ASSERT(th->th.th_new_place >= 0); + KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); if (th->th.th_first_place <= th->th.th_last_place) { - KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place) + KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && (th->th.th_new_place <= th->th.th_last_place)); } else { - KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place) + KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || (th->th.th_new_place >= th->th.th_last_place)); } @@ -4082,7 +4160,8 @@ __kmp_affinity_set_place(int gtid) char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, th->th.th_affin_mask); - KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf); + KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), + gtid, buf); } __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); } @@ -4153,6 +4232,11 @@ __kmp_aux_set_affinity(void **mask) th->th.th_new_place = KMP_PLACE_UNDEFINED; th->th.th_first_place = 0; th->th.th_last_place = __kmp_affinity_num_masks - 1; + + // + // Turn off 4.0 affinity for the current tread at this parallel level. + // + th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; # endif return retval; @@ -4207,7 +4291,6 @@ __kmp_aux_get_affinity(void **mask) } - int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { @@ -4360,7 +4443,8 @@ void __kmp_balanced_affinity( int tid, int nthreads ) if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf); + KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), + tid, buf); } __kmp_set_system_affinity( mask, TRUE ); } else { // Non-uniform topology @@ -4535,7 +4619,8 @@ void __kmp_balanced_affinity( int tid, int nthreads ) if (__kmp_affinity_verbose) { char buf[KMP_AFFIN_MASK_PRINT_LEN]; __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); - KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf); + KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), + tid, buf); } __kmp_set_system_affinity( mask, TRUE ); } @@ -4543,4 +4628,50 @@ void __kmp_balanced_affinity( int tid, int nthreads ) # endif /* KMP_MIC */ +#else + // affinity not supported + +kmp_uint32 mac_skipPerLevel[7]; +kmp_uint32 mac_depth; +kmp_uint8 mac_leaf_kids; +void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { + static int first = 1; + if (first) { + const kmp_uint32 maxLevels = 7; + kmp_uint32 numPerLevel[maxLevels]; + + for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level + numPerLevel[i] = 1; + mac_skipPerLevel[i] = 1; + } + + mac_depth = 2; + numPerLevel[0] = nproc; + + kmp_uint32 branch = 4; + if (numPerLevel[0] == 1) branch = nproc/4; + if (branch<4) branch=4; + for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width + while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0! 
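+ // Halve the width of this level (rounding odd counts up) and double the level above it; if that level was previously unused (== 1), the hierarchy gains a level.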
+ if (numPerLevel[d] & 1) numPerLevel[d]++; + numPerLevel[d] = numPerLevel[d] >> 1; + if (numPerLevel[d+1] == 1) mac_depth++; + numPerLevel[d+1] = numPerLevel[d+1] << 1; + } + if(numPerLevel[0] == 1) { + branch = branch >> 1; + if (branch<4) branch = 4; + } + } + + for (kmp_uint32 i=1; i<mac_depth; ++i) + mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1]; + mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1; + first=0; + } + thr_bar->depth = mac_depth; + thr_bar->base_leaf_kids = mac_leaf_kids; + thr_bar->skip_per_level = mac_skipPerLevel; +} + #endif // KMP_AFFINITY_SUPPORTED diff --git a/openmp/runtime/src/kmp_alloc.c b/openmp/runtime/src/kmp_alloc.c index 885754fd006..18f62365f74 100644 --- a/openmp/runtime/src/kmp_alloc.c +++ b/openmp/runtime/src/kmp_alloc.c @@ -1,7 +1,7 @@ /* * kmp_alloc.c -- private/shared dyanmic memory allocation and management - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43450 $ + * $Date: 2014-09-09 10:07:22 -0500 (Tue, 09 Sep 2014) $ */ @@ -1228,7 +1228,7 @@ bpoold( kmp_info_t *th, void *buf, int dumpalloc, int dumpfree) bufdump( th, (void *) (((char *) b) + sizeof(bhead_t))); } } else { - char *lerr = ""; + const char *lerr = ""; KMP_DEBUG_ASSERT(bs > 0); if ((b->ql.blink->ql.flink != b) || (b->ql.flink->ql.blink != b)) { @@ -1772,7 +1772,11 @@ ___kmp_free( void * ptr KMP_SRC_LOC_DECL ) #ifndef LEAK_MEMORY KE_TRACE( 10, ( " free( %p )\n", descr.ptr_allocated ) ); + # ifdef KMP_DEBUG + _free_src_loc( descr.ptr_allocated, _file_, _line_ ); + # else free_src_loc( descr.ptr_allocated KMP_SRC_LOC_PARM ); + # endif #endif KMP_MB(); @@ -1790,7 +1794,7 @@ ___kmp_free( void * ptr KMP_SRC_LOC_DECL ) // Otherwise allocate normally using kmp_thread_malloc. // AC: How to choose the limit? Just get 16 for now... 
-static int const __kmp_free_list_limit = 16; +#define KMP_FREE_LIST_LIMIT 16 // Always use 128 bytes for determining buckets for caching memory blocks #define DCACHE_LINE 128 @@ -1932,7 +1936,7 @@ ___kmp_fast_free( kmp_info_t *this_thr, void * ptr KMP_SRC_LOC_DECL ) kmp_mem_descr_t * dsc = (kmp_mem_descr_t *)( (char*)head - sizeof(kmp_mem_descr_t) ); kmp_info_t * q_th = (kmp_info_t *)(dsc->ptr_aligned); // allocating thread, same for all queue nodes size_t q_sz = dsc->size_allocated + 1; // new size in case we add current task - if ( q_th == alloc_thr && q_sz <= __kmp_free_list_limit ) { + if ( q_th == alloc_thr && q_sz <= KMP_FREE_LIST_LIMIT ) { // we can add current task to "other" list, no sync needed *((void **)ptr) = head; descr->size_allocated = q_sz; diff --git a/openmp/runtime/src/kmp_atomic.c b/openmp/runtime/src/kmp_atomic.c index 3e9c82f874f..6613c6da76c 100644 --- a/openmp/runtime/src/kmp_atomic.c +++ b/openmp/runtime/src/kmp_atomic.c @@ -1,7 +1,7 @@ /* * kmp_atomic.c -- ATOMIC implementation routines - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43421 $ + * $Date: 2014-08-28 08:56:10 -0500 (Thu, 28 Aug 2014) $ */ @@ -690,7 +690,7 @@ RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * lh #endif /* KMP_GOMP_COMPAT */ #if KMP_MIC -# define KMP_DO_PAUSE _mm_delay_32( 30 ) +# define KMP_DO_PAUSE _mm_delay_32( 1 ) #else # define KMP_DO_PAUSE KMP_CPU_PAUSE() #endif /* KMP_MIC */ @@ -700,14 +700,10 @@ RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * lh // TYPE - operands' type // BITS - size in bits, used to distinguish low level calls // OP - operator -// Note: temp_val introduced in order to force the compiler to read -// *lhs only once (w/o it the compiler reads *lhs twice) #define OP_CMPXCHG(TYPE,BITS,OP) \ { \ - TYPE KMP_ATOMIC_VOLATILE temp_val; \ TYPE old_value, new_value; \ - temp_val = *lhs; \ - old_value = temp_val; \ + old_value = *(TYPE volatile *)lhs; \ new_value = old_value OP rhs; \ while ( ! KMP_COMPARE_AND_STORE_ACQ##BITS( (kmp_int##BITS *) lhs, \ *VOLATILE_CAST(kmp_int##BITS *) &old_value, \ @@ -715,8 +711,7 @@ RET_TYPE __kmpc_atomic_##TYPE_ID##_##OP_ID( ident_t *id_ref, int gtid, TYPE * lh { \ KMP_DO_PAUSE; \ \ - temp_val = *lhs; \ - old_value = temp_val; \ + old_value = *(TYPE volatile *)lhs; \ new_value = old_value OP rhs; \ } \ } @@ -765,13 +760,6 @@ ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) \ KMP_TEST_THEN_ADD##BITS( lhs, OP rhs ); \ } // ------------------------------------------------------------------------- -#define ATOMIC_FLOAT_ADD(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \ -ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) \ - OP_GOMP_CRITICAL(OP##=,GOMP_FLAG) \ - /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ - KMP_TEST_THEN_ADD_REAL##BITS( lhs, OP rhs ); \ -} -// ------------------------------------------------------------------------- #define ATOMIC_CMPXCHG(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) \ OP_GOMP_CRITICAL(OP##=,GOMP_FLAG) \ @@ -803,17 +791,6 @@ ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) \ } \ } // ------------------------------------------------------------------------- -#define ATOMIC_FLOAT_ADD(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \ -ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) \ - OP_GOMP_CRITICAL(OP##=,GOMP_FLAG) \ - if ( ! 
( (kmp_uintptr_t) lhs & 0x##MASK) ) { \ - OP_CMPXCHG(TYPE,BITS,OP) /* aligned address */ \ - } else { \ - KMP_CHECK_GTID; \ - OP_CRITICAL(OP##=,LCK_ID) /* unaligned address - use critical */ \ - } \ -} -// ------------------------------------------------------------------------- #define ATOMIC_CMPXCHG(TYPE_ID,OP_ID,TYPE,BITS,OP,LCK_ID,MASK,GOMP_FLAG) \ ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) \ OP_GOMP_CRITICAL(OP##=,GOMP_FLAG) \ @@ -845,25 +822,15 @@ ATOMIC_BEGIN(TYPE_ID,OP_ID,TYPE,void) ATOMIC_FIXED_ADD( fixed4, add, kmp_int32, 32, +, 4i, 3, 0 ) // __kmpc_atomic_fixed4_add ATOMIC_FIXED_ADD( fixed4, sub, kmp_int32, 32, -, 4i, 3, 0 ) // __kmpc_atomic_fixed4_sub -#if KMP_MIC ATOMIC_CMPXCHG( float4, add, kmp_real32, 32, +, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_add ATOMIC_CMPXCHG( float4, sub, kmp_real32, 32, -, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_sub -#else -ATOMIC_FLOAT_ADD( float4, add, kmp_real32, 32, +, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_add -ATOMIC_FLOAT_ADD( float4, sub, kmp_real32, 32, -, 4r, 3, KMP_ARCH_X86 ) // __kmpc_atomic_float4_sub -#endif // KMP_MIC // Routines for ATOMIC 8-byte operands addition and subtraction ATOMIC_FIXED_ADD( fixed8, add, kmp_int64, 64, +, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_add ATOMIC_FIXED_ADD( fixed8, sub, kmp_int64, 64, -, 8i, 7, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_sub -#if KMP_MIC ATOMIC_CMPXCHG( float8, add, kmp_real64, 64, +, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_add ATOMIC_CMPXCHG( float8, sub, kmp_real64, 64, -, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_sub -#else -ATOMIC_FLOAT_ADD( float8, add, kmp_real64, 64, +, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_add -ATOMIC_FLOAT_ADD( float8, sub, kmp_real64, 64, -, 8r, 7, KMP_ARCH_X86 ) // __kmpc_atomic_float8_sub -#endif // KMP_MIC // ------------------------------------------------------------------------ // Entries definition for integer operands @@ -1867,35 +1834,16 @@ ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE) \ return old_value; \ } // ------------------------------------------------------------------------- -#define ATOMIC_FLOAT_ADD_CPT(TYPE_ID,OP_ID,TYPE,BITS,OP,GOMP_FLAG) \ -ATOMIC_BEGIN_CPT(TYPE_ID,OP_ID,TYPE,TYPE) \ - TYPE old_value, new_value; \ - OP_GOMP_CRITICAL_CPT(OP,GOMP_FLAG) \ - /* OP used as a sign for subtraction: (lhs-rhs) --> (lhs+-rhs) */ \ - old_value = KMP_TEST_THEN_ADD_REAL##BITS( lhs, OP rhs ); \ - if( flag ) { \ - return old_value OP rhs; \ - } else \ - return old_value; \ -} -// ------------------------------------------------------------------------- ATOMIC_FIXED_ADD_CPT( fixed4, add_cpt, kmp_int32, 32, +, 0 ) // __kmpc_atomic_fixed4_add_cpt ATOMIC_FIXED_ADD_CPT( fixed4, sub_cpt, kmp_int32, 32, -, 0 ) // __kmpc_atomic_fixed4_sub_cpt ATOMIC_FIXED_ADD_CPT( fixed8, add_cpt, kmp_int64, 64, +, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_add_cpt ATOMIC_FIXED_ADD_CPT( fixed8, sub_cpt, kmp_int64, 64, -, KMP_ARCH_X86 ) // __kmpc_atomic_fixed8_sub_cpt -#if KMP_MIC ATOMIC_CMPXCHG_CPT( float4, add_cpt, kmp_real32, 32, +, KMP_ARCH_X86 ) // __kmpc_atomic_float4_add_cpt ATOMIC_CMPXCHG_CPT( float4, sub_cpt, kmp_real32, 32, -, KMP_ARCH_X86 ) // __kmpc_atomic_float4_sub_cpt ATOMIC_CMPXCHG_CPT( float8, add_cpt, kmp_real64, 64, +, KMP_ARCH_X86 ) // __kmpc_atomic_float8_add_cpt ATOMIC_CMPXCHG_CPT( float8, sub_cpt, kmp_real64, 64, -, KMP_ARCH_X86 ) // __kmpc_atomic_float8_sub_cpt -#else -ATOMIC_FLOAT_ADD_CPT( float4, add_cpt, kmp_real32, 32, +, KMP_ARCH_X86 ) // __kmpc_atomic_float4_add_cpt -ATOMIC_FLOAT_ADD_CPT( float4, sub_cpt, kmp_real32, 32, 
-, KMP_ARCH_X86 ) // __kmpc_atomic_float4_sub_cpt -ATOMIC_FLOAT_ADD_CPT( float8, add_cpt, kmp_real64, 64, +, KMP_ARCH_X86 ) // __kmpc_atomic_float8_add_cpt -ATOMIC_FLOAT_ADD_CPT( float8, sub_cpt, kmp_real64, 64, -, KMP_ARCH_X86 ) // __kmpc_atomic_float8_sub_cpt -#endif // KMP_MIC // ------------------------------------------------------------------------ // Entries definition for integer operands diff --git a/openmp/runtime/src/kmp_atomic.h b/openmp/runtime/src/kmp_atomic.h index 361dce9aa79..68673dad8fb 100644 --- a/openmp/runtime/src/kmp_atomic.h +++ b/openmp/runtime/src/kmp_atomic.h @@ -1,7 +1,7 @@ /* * kmp_atomic.h - ATOMIC header file - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43191 $ + * $Date: 2014-05-27 07:44:11 -0500 (Tue, 27 May 2014) $ */ @@ -33,7 +33,7 @@ #if defined( __cplusplus ) && ( KMP_OS_WINDOWS ) // create shortcuts for c99 complex types - #ifdef _DEBUG + #if (_MSC_VER < 1600) && defined(_DEBUG) // Workaround for the problem of _DebugHeapTag unresolved external. // This problem prevented to use our static debug library for C tests // compiled with /MDd option (the library itself built with /MTd), diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp new file mode 100644 index 00000000000..cbd1edd9e7c --- /dev/null +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -0,0 +1,1607 @@ +/* + * kmp_barrier.cpp + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + + +#include "kmp.h" +#include "kmp_wait_release.h" +#include "kmp_stats.h" +#include "kmp_itt.h" + +#if KMP_MIC +#include <immintrin.h> +#define USE_NGO_STORES 1 +#endif // KMP_MIC + +#if KMP_MIC && USE_NGO_STORES +// ICV copying +#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src)) +#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt) +#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory") +#else +#define ngo_load(src) ((void)0) +#define ngo_store_icvs(dst, src) copy_icvs((dst), (src)) +#define ngo_store_go(dst, src) memcpy((dst), (src), CACHE_LINE) +#define ngo_sync() ((void)0) +#endif /* KMP_MIC && USE_NGO_STORES */ + +void __kmp_print_structure(void); // Forward declaration + +// ---------------------------- Barrier Algorithms ---------------------------- + +// Linear Barrier +static void +__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_linear_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads = team->t.t_threads; + + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + 
this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); + } +#endif + // We now perform a linear reduction to signal that all of the threads have arrived. + if (!KMP_MASTER_TID(tid)) { + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" + "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived, + thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to master thread + /* After performing this write, a worker thread may not assume that the team is valid + any more - it could be deallocated by the master thread at any time. */ + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]); + flag.release(); + } else { + register kmp_balign_team_t *team_bar = &team->t.t_bar[bt]; + register int nproc = this_thr->th.th_team_nproc; + register int i; + // Don't have to worry about sleep bit here or atomic since team setting + register kmp_uint new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP; + + // Collect all the worker team member threads. + for (i=1; i<nproc; ++i) { +#if KMP_CACHE_MANAGE + // Prefetch next thread's arrived count + if (i+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(i, team), team->t.t_id, i, + &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state)); + + // Wait for worker thread to arrive + kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and the other thread time to the thread. 
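+ // (After the loop over all workers below, the master's th_bar_min_time holds the earliest arrival time seen in the team.)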
+ if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + other_threads[i]->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid, + team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); + (*reduce)(this_thr->th.th_local.reduce_data, + other_threads[i]->th.th_local.reduce_data); + } + } + // Don't have to worry about sleep bit here or atomic since team setting + team_bar->b_arrived = new_state; + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n", + gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state)); + } + KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void +__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_linear_release); + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_team_t *team; + + if (KMP_MASTER_TID(tid)) { + register unsigned int i; + register kmp_uint32 nproc = this_thr->th.th_team_nproc; + register kmp_info_t **other_threads; + + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + other_threads = team->t.t_threads; + + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + + if (nproc > 1) { +#if KMP_BARRIER_ICV_PUSH + KMP_START_EXPLICIT_TIMER(USER_icv_copy); + if (propagate_icvs) { + ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs); + for (i=1; i<nproc; ++i) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + ngo_sync(); + } + KMP_STOP_EXPLICIT_TIMER(USER_icv_copy); +#endif // KMP_BARRIER_ICV_PUSH + + // Now, release all of the worker threads + for (i=1; i<nproc; ++i) { +#if KMP_CACHE_MANAGE + // Prefetch next thread's go flag + if (i+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " + "go(%p): %u => %u\n", gtid, team->t.t_id, tid, + other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i, + &other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go, + other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP)); + kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]); + flag.release(); + } + } + } else { // Wait for the MASTER thread to release us + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... 
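+ // (The -1 delta above fetches the previous region's barrier object; the call below cancels the wait that was reported on it.)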
+ __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) + return; + // The worker thread may now assume that the team is valid. +#ifdef KMP_DEBUG + tid = __kmp_tid_from_gtid(gtid); + team = __kmp_threads[gtid]->th.th_team; +#endif + KMP_DEBUG_ASSERT(team != NULL); + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Tree barrier +static void +__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_tree_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads = team->t.t_threads; + register kmp_uint32 nproc = this_thr->th.th_team_nproc; + register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 child; + register kmp_uint32 child_tid; + register kmp_uint new_state; + + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); + } +#endif + // Perform tree gather to wait until all threads have arrived; reduce any required data as we go + child_tid = (tid << branch_bits) + 1; + if (child_tid < nproc) { + // Parent threads wait for all their children to arrive + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + child = 1; + do { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + // Prefetch next thread's arrived count + if (child+1 <= branch_factor && child_tid+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, + &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and a child time to the thread. 
+ if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + child++; + child_tid++; + } + while (child <= branch_factor && child_tid < nproc); + } + + if (!KMP_MASTER_TID(tid)) { // Worker threads + register kmp_int32 parent_tid = (tid - 1) >> branch_bits; + + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, + &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + + // Mark arrival to parent thread + /* After performing this write, a worker thread may not assume that the team is valid + any more - it could be deallocated by the master thread at any time. */ + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]); + flag.release(); + } else { + // Need to update the team arrived pointer if we are the master thread + if (nproc > 1) // New value was already computed above + team->t.t_bar[bt].b_arrived = new_state; + else + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void +__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_tree_release); + register kmp_team_t *team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_uint32 nproc; + register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 child; + register kmp_uint32 child_tid; + + // Perform a tree release for all of the threads that have been gathered + if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... 
+ __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } else { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } + nproc = this_thr->th.th_team_nproc; + child_tid = (tid << branch_bits) + 1; + + if (child_tid < nproc) { + register kmp_info_t **other_threads = team->t.t_threads; + child = 1; + // Parent threads release all their children + do { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + // Prefetch next thread's go count + if (child+1 <= branch_factor && child_tid+1 < nproc) + KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + +#if KMP_BARRIER_ICV_PUSH + KMP_START_EXPLICIT_TIMER(USER_icv_copy); + if (propagate_icvs) { + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid], + team, child_tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, + &team->t.t_implicit_task_taskdata[0].td_icvs); + } + KMP_STOP_EXPLICIT_TIMER(USER_icv_copy); +#endif // KMP_BARRIER_ICV_PUSH + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + child++; + child_tid++; + } + while (child <= branch_factor && child_tid < nproc); + } + KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + + +// Hyper Barrier +static void +__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + void (*reduce)(void *, void *) + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_hyper_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_info_t **other_threads = team->t.t_threads; + register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE; + register kmp_uint32 num_threads = this_thr->th.th_team_nproc; + register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 offset; + register kmp_uint32 level; + + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, 
bt)); + + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); + } +#endif + /* Perform a hypercube-embedded tree gather to wait until all of the threads have + arrived, and reduce any required data as we go. */ + kmp_flag_64 p_flag(&thr_bar->b_arrived); + for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) + { + register kmp_uint32 child; + register kmp_uint32 child_tid; + + if (((tid >> level) & (branch_factor - 1)) != 0) { + register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1); + + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid, + &thr_bar->b_arrived, thr_bar->b_arrived, + thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP)); + // Mark arrival to parent thread + /* After performing this write (in the last iteration of the enclosing for loop), + a worker thread may not assume that the team is valid any more - it could be + deallocated by the master thread at any time. */ + p_flag.set_waiter(other_threads[parent_tid]); + p_flag.release(); + break; + } + + // Parent threads wait for children to arrive + if (new_state == KMP_BARRIER_UNUSED_STATE) + new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads; + child++, child_tid+=(1 << level)) + { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + register kmp_uint32 next_child_tid = child_tid + (1 << level); + // Prefetch next thread's arrived count + if (child+1 < branch_factor && next_child_tid < num_threads) + KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived); +#endif /* KMP_CACHE_MANAGE */ + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " + "arrived(%p) == %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid, + &child_bar->b_arrived, new_state)); + // Wait for child to arrive + kmp_flag_64 c_flag(&child_bar->b_arrived, new_state); + c_flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - write min of the thread time and a child time to the thread. 
+ if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, + child_thr->th.th_bar_min_time); + } +#endif + if (reduce) { + KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + } + } + + if (KMP_MASTER_TID(tid)) { + // Need to update the team arrived pointer if we are the master thread + if (new_state == KMP_BARRIER_UNUSED_STATE) + team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP; + else + team->t.t_bar[bt].b_arrived = new_state; + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n", + gtid, team->t.t_id, tid, team->t.t_id, + &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// The reverse versions seem to beat the forward versions overall +#define KMP_REVERSE_HYPER_BAR +static void +__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void *itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_hyper_release); + register kmp_team_t *team; + register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; + register kmp_info_t **other_threads; + register kmp_uint32 num_threads; + register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ]; + register kmp_uint32 branch_factor = 1 << branch_bits; + register kmp_uint32 child; + register kmp_uint32 child_tid; + register kmp_uint32 offset; + register kmp_uint32 level; + + /* Perform a hypercube-embedded tree release for all of the threads that have been gathered. + If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse + order of the corresponding gather, otherwise threads are released in the same order. */ + if (KMP_MASTER_TID(tid)) { // master + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { // master already has ICVs in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); + } +#endif + } + else { // Handle fork barrier workers who aren't part of a team yet + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", + gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP)); + // Wait for parent thread to release us + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) { + // In fork barrier where we could not get the object reliably + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1); + // Cancel wait on previous parallel region... 
+ __kmp_itt_task_starting(itt_sync_obj); + + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj != NULL) + // Call prepare as early as possible for "new" barrier + __kmp_itt_task_finished(itt_sync_obj); + } else +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + + // The worker thread may now assume that the team is valid. + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + num_threads = this_thr->th.th_team_nproc; + other_threads = team->t.t_threads; + +#ifdef KMP_REVERSE_HYPER_BAR + // Count up to correct level for parent + for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0); + level+=branch_bits, offset<<=branch_bits); + + // Now go down from there + for (level-=branch_bits, offset>>=branch_bits; offset != 0; + level-=branch_bits, offset>>=branch_bits) +#else + // Go down the tree, level by level + for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits) +#endif // KMP_REVERSE_HYPER_BAR + { +#ifdef KMP_REVERSE_HYPER_BAR + /* Now go in reverse order through the children, highest to lowest. + Initial setting of child is conservative here. */ + child = num_threads >> ((level==0)?level:level-1); + for (child=(child<branch_factor-1) ? 
child : branch_factor-1, child_tid=tid+(child<<level); + child>=1; child--, child_tid-=(1<<level)) +#else + if (((tid >> level) & (branch_factor - 1)) != 0) + // No need to go lower than this, since this is the level parent would be notified + break; + // Iterate through children on this level of the tree + for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads; + child++, child_tid+=(1<<level)) +#endif // KMP_REVERSE_HYPER_BAR + { + if (child_tid >= num_threads) continue; // Child doesn't exist so keep going + else { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; +#if KMP_CACHE_MANAGE + register kmp_uint32 next_child_tid = child_tid - (1 << level); + // Prefetch next thread's go count +# ifdef KMP_REVERSE_HYPER_BAR + if (child-1 >= 1 && next_child_tid < num_threads) +# else + if (child+1 < branch_factor && next_child_tid < num_threads) +# endif // KMP_REVERSE_HYPER_BAR + KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go); +#endif /* KMP_CACHE_MANAGE */ + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) // push my fixed ICVs to my child + copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); +#endif // KMP_BARRIER_ICV_PUSH + + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" + "go(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(child_tid, team), team->t.t_id, + child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child from barrier + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + } +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); + } +#endif + KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// Hierarchical Barrier + +// Initialize thread barrier data +/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the + minimum amount of initialization required based on how the team has changed. Returns true if + leaf children will require both on-core and traditional wake-up mechanisms. For example, if the + team size increases, threads already in the team will respond to on-core wakeup on their parent + thread, but threads newly added to the team will only be listening on the their local b_go. 
*/ +static bool +__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc, + int gtid, int tid, kmp_team_t *team) +{ + // Checks to determine if (re-)initialization is needed + bool uninitialized = thr_bar->team == NULL; + bool team_changed = team != thr_bar->team; + bool team_sz_changed = nproc != thr_bar->nproc; + bool tid_changed = tid != thr_bar->old_tid; + bool retval = false; + + if (uninitialized || team_sz_changed) { + __kmp_get_hierarchy(nproc, thr_bar); + } + + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->my_level = thr_bar->depth-1; // default for master + thr_bar->parent_tid = -1; // default for master + if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy + kmp_uint32 d=0; + while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level + kmp_uint32 rem; + if (d == thr_bar->depth-2) { // reached level right below the master + thr_bar->parent_tid = 0; + thr_bar->my_level = d; + break; + } + else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster? + // thread is not a subtree root at next level, so this is max + thr_bar->parent_tid = tid - rem; + thr_bar->my_level = d; + break; + } + ++d; + } + } + thr_bar->offset = 7-(tid-thr_bar->parent_tid-1); + thr_bar->old_tid = tid; + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + } + if (uninitialized || team_changed || tid_changed) { + thr_bar->team = team; + thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb; + retval = true; + } + if (uninitialized || team_sz_changed || tid_changed) { + thr_bar->nproc = nproc; + thr_bar->leaf_kids = thr_bar->base_leaf_kids; + if (thr_bar->my_level == 0) thr_bar->leaf_kids=0; + if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc) + thr_bar->leaf_kids = nproc - tid - 1; + thr_bar->leaf_state = 0; + for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1; + } + return retval; +} + +static void +__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, + int gtid, int tid, void (*reduce) (void *, void *) + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_hier_gather); + register kmp_team_t *team = this_thr->th.th_team; + register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb; + register kmp_uint32 nproc = this_thr->th.th_team_nproc; + register kmp_info_t **other_threads = team->t.t_threads; + register kmp_uint64 new_state; + + if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1; + else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested + + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); + + (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); + + if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) + register kmp_int32 child_tid; + new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { + if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag + kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? 
thr_bar->b_arrived | thr_bar->leaf_state : (kmp_uint64)team->t.t_bar[bt].b_arrived | thr_bar->leaf_state; + kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (reduce) { + for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); + } + } + (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits + } + // Next, wait for higher level children on each child's b_arrived flag + for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0 + kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; + if (last > nproc) last = nproc; + for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + } + } + } + else { // Blocktime is not infinite + for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first + kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d]; + if (last > nproc) last = nproc; + for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { + register kmp_info_t *child_thr = other_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " + "arrived(%p) == %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_arrived, new_state)); + kmp_flag_64 flag(&child_bar->b_arrived, new_state); + flag.wait(this_thr, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (reduce) { + KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid)); + (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + } + } + } + } + } + // All subordinates are gathered; now release parent if not master thread + + if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " + "arrived(%p): %u => %u\n", gtid, team->t.t_id, tid, + __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid, + &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP)); + /* Mark arrival to parent: After performing this write, a worker thread may not assume that + the team is valid any more 
- it could be deallocated by the master thread at any time. */ + if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME + || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it + kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]); + flag.release(); + } + else { // Leaf does special release on the "offset" bits of parent's b_arrived flag + thr_bar->b_arrived = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset); + flag.set_waiter(other_threads[thr_bar->parent_tid]); + flag.release(); + } + } else { // Master thread needs to update the team's b_arrived value + team->t.t_bar[bt].b_arrived = (kmp_uint32)new_state; + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n", + gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived)); + } + // Is the team access below unsafe or just technically invalid? + KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +static void +__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, + int propagate_icvs + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + KMP_TIME_BLOCK(KMP_hier_release); + register kmp_team_t *team; + register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb; + register kmp_uint32 nproc; + bool team_change = false; // indicates on-core barrier shouldn't be used + + if (KMP_MASTER_TID(tid)) { + team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n", + gtid, team->t.t_id, tid, bt)); + } + else { // Worker threads + // Wait for parent thread to release me + if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME + || thr_bar->my_level != 0 || thr_bar->team == NULL) { + // Use traditional method of waiting on my own b_go flag + thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG; + kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } + else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested + // Wait on my "offset" bits on parent's b_go flag + thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG; + kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset, + bt, this_thr + USE_ITT_BUILD_ARG(itt_sync_obj) ); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go + TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + } + else { // Reset my bits on parent's b_go flag + ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0; + } + } + thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING; + // Early exit for reaping threads releasing forkjoin barrier + if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done)) + return; + // The worker thread may now assume that the team is valid. 
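/* A minimal standalone sketch of the per-leaf byte encoding used by the on-core
   paths above, with hypothetical local variables standing in for the kmp_bstate_t
   fields; the real runtime updates b_arrived atomically, this is a single-threaded
   illustration only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t b_arrived  = 0;   /* stands in for the parent's 64-bit flag word */
    uint64_t leaf_state = 0;   /* expected pattern once all leaves check in   */
    int      leaf_kids  = 3;   /* assume three on-core children               */

    /* Build the expected pattern: one byte per leaf, highest bytes first,
       mirroring ((char *)&(thr_bar->leaf_state))[7-i] = 1 above. */
    for (int i = 0; i < leaf_kids; ++i)
        ((unsigned char *)&leaf_state)[7 - i] = 1;

    /* Each leaf "arrives" by writing its own byte (its offset) into the word. */
    for (int i = 0; i < leaf_kids; ++i)
        ((unsigned char *)&b_arrived)[7 - i] = 1;

    /* The parent treats the gather as complete once every leaf byte is set... */
    printf("all leaves arrived: %d\n", (int)((b_arrived & leaf_state) == leaf_state));

    /* ...and then clears the leaf bytes for the next barrier, as done above
       with KMP_TEST_THEN_AND64(&b_arrived, ~leaf_state). */
    b_arrived &= ~leaf_state;
    printf("after clear: %llu\n", (unsigned long long)b_arrived);
    return 0;
}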
+ team = __kmp_threads[gtid]->th.th_team; + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", + gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE)); + KMP_MB(); // Flush all pending memory write invalidates. + } + + if (this_thr->th.th_team->t.t_level == 1) thr_bar->use_oncore_barrier = 1; + else thr_bar->use_oncore_barrier = 0; + nproc = this_thr->th.th_team_nproc; + + // If the team size has increased, we still communicate with old leaves via oncore barrier. + unsigned short int old_leaf_kids = thr_bar->leaf_kids; + kmp_uint64 old_leaf_state = thr_bar->leaf_state; + team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); + // But if the entire team changes, we won't use oncore barrier at all + if (team_change) old_leaf_kids = 0; + +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs) { + if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy + copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs); + } + else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime + if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0) + // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + // non-leaves will get ICVs piggybacked with b_go via NGO store + } + else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs + if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access + copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs); + else // leaves copy parent's fixed ICVs directly to local ICV store + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &thr_bar->parent_bar->th_fixed_icvs); + } + } +#endif // KMP_BARRIER_ICV_PUSH + + // Now, release my children + if (thr_bar->my_level) { // not a leaf + register kmp_int32 child_tid; + kmp_uint32 last; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { + if (KMP_MASTER_TID(tid)) { // do a flat release + // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go. + thr_bar->b_go = KMP_BARRIER_STATE_BUMP; + // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line + ngo_load(&thr_bar->th_fixed_icvs); + // This loops over all the threads skipping only the leaf nodes in the hierarchy + for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) { + register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" + " go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Use ngo store (if available) to both store ICVs and release child via child's b_go + ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs); + } + ngo_sync(); + } + TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time + // Now, release leaf children + if (thr_bar->leaf_kids) { // if there are any + // We test team_change on the off-chance that the level 1 team changed. 
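/* The flat release above relies on b_go living in the last 8 bytes of the cache
   line that holds th_fixed_icvs, so one 64-byte streaming store publishes the
   ICVs and bumps the child's go flag together. A schematic of that layout with
   a hypothetical stand-in struct; the 56-byte payload size (and the omitted
   cache-line alignment) are assumptions made only for illustration. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct release_line {
    char     payload[56];   /* plays the role of th_fixed_icvs                  */
    uint64_t go;            /* plays the role of b_go, last 8 bytes of the line */
} release_line_t;

int main(void)
{
    release_line_t parent, child;
    memset(&parent, 0, sizeof(parent));
    memset(&child,  0, sizeof(child));
    parent.go = 1;          /* parent sets its local go value before the copy   */

    /* ngo_store_go() conceptually moves the whole line with one non-temporal
       store; a plain struct copy models the visible effect. */
    child = parent;

    printf("sizeof(release_line_t) = %zu, child.go = %llu\n",
           sizeof(release_line_t), (unsigned long long)child.go);
    return 0;
}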
+ if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new + if (old_leaf_kids) { // release old leaf kids + thr_bar->b_go |= old_leaf_state; + } + // Release new leaf kids + last = tid+thr_bar->skip_per_level[1]; + if (last > nproc) last = nproc; + for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1 + register kmp_info_t *child_thr = team->t.t_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing" + " T#%d(%d:%d) go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child using child's b_go flag + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + else { // Release all children at once with leaf_state bits on my own b_go flag + thr_bar->b_go |= thr_bar->leaf_state; + } + } + } + else { // Blocktime is not infinite; do a simple hierarchical release + for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first + last = tid+thr_bar->skip_per_level[d+1]; + kmp_uint32 skip = thr_bar->skip_per_level[d]; + if (last > nproc) last = nproc; + for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) { + register kmp_info_t *child_thr = team->t.t_threads[child_tid]; + register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb; + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)" + " go(%p): %u => %u\n", + gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), + team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go, + child_bar->b_go + KMP_BARRIER_STATE_BUMP)); + // Release child using child's b_go flag + kmp_flag_64 flag(&child_bar->b_go, child_thr); + flag.release(); + } + } + } +#if KMP_BARRIER_ICV_PUSH + if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs); +#endif // KMP_BARRIER_ICV_PUSH + } + KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", + gtid, team->t.t_id, tid, bt)); +} + +// ---------------------------- End of Barrier Algorithms ---------------------------- + +// Internal function to do a barrier. +/* If is_split is true, do a split barrier, otherwise, do a plain barrier + If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier + Returns 0 if master thread, 1 if worker thread. */ +int +__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size, + void *reduce_data, void (*reduce)(void *, void *)) +{ + KMP_TIME_BLOCK(KMP_barrier); + register int tid = __kmp_tid_from_gtid(gtid); + register kmp_info_t *this_thr = __kmp_threads[gtid]; + register kmp_team_t *team = this_thr->th.th_team; + register int status = 0; + ident_t *loc = __kmp_threads[gtid]->th.th_ident; + + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); + + if (! team->t.t_serialized) { +#if USE_ITT_BUILD + // This value will be used in itt notify events below. 
+ void *itt_sync_obj = NULL; +# if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1); +# endif +#endif /* USE_ITT_BUILD */ + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid))); + } + + /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when + the team struct is not guaranteed to exist. */ + // See note about the corresponding code in __kmp_join_barrier() being performance-critical. + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + } + +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + + if (reduce != NULL) { + //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 + this_thr->th.th_local.reduce_data = reduce_data; + } + switch (__kmp_barrier_gather_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear + __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear + __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + + KMP_MB(); + + if (KMP_MASTER_TID(tid)) { + status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_wait(this_thr, team + USE_ITT_BUILD_ARG(itt_sync_obj) ); + __kmp_task_team_setup(this_thr, team); + } + + +#if USE_ITT_BUILD + /* TODO: In case of split reduction barrier, master thread may send acquired event early, + before the final summation into the shared variable is done (final summation can be a + long operation for array reductions). 
*/ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier - report frame end + if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) { + kmp_uint64 cur_time = __itt_get_timestamp(); + kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + switch(__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + case 2: + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); + break; + case 3: + if( __itt_metadata_add_ptr ) { + for (i=1; i<nproc; ++i) { + delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL)); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } +#endif /* USE_ITT_BUILD */ + } else { + status = 1; +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } + if (status == 1 || ! is_split) { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } + } + +#if USE_ITT_BUILD + /* GEH: TODO: Move this under if-condition above and also include in + __kmp_end_split_barrier(). This will more accurately represent the actual release time + of the threads for split barriers. */ + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + } else { // Team is serialized. 
+ status = 0; + if (__kmp_tasking_mode != tskm_immediate_exec) { + // The task team should be NULL for serialized code (tasks will be executed immediately) + KMP_DEBUG_ASSERT(team->t.t_task_team == NULL); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL); + } + } + KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", + gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status)); + return status; +} + + +void +__kmp_end_split_barrier(enum barrier_type bt, int gtid) +{ + KMP_TIME_BLOCK(KMP_end_split_barrier); + int tid = __kmp_tid_from_gtid(gtid); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = this_thr->th.th_team; + + if (!team->t.t_serialized) { + if (KMP_MASTER_GTID(gtid)) { + switch (__kmp_barrier_release_pattern[bt]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL)); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]); + __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL) ); + break; + } + default: { + __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE + USE_ITT_BUILD_ARG(NULL) ); + } + } + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } // if + } + } +} + + +void +__kmp_join_barrier(int gtid) +{ + KMP_TIME_BLOCK(KMP_join_barrier); + register kmp_info_t *this_thr = __kmp_threads[gtid]; + register kmp_team_t *team; + register kmp_uint nproc; + kmp_info_t *master_thread; + int tid; +#ifdef KMP_DEBUG + int team_id; +#endif /* KMP_DEBUG */ +#if USE_ITT_BUILD + void *itt_sync_obj = NULL; +# if USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need + // Get object created at fork_barrier + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); +# endif +#endif /* USE_ITT_BUILD */ + KMP_MB(); + + // Get current info + team = this_thr->th.th_team; + nproc = this_thr->th.th_team_nproc; + KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc); + tid = __kmp_tid_from_gtid(gtid); +#ifdef KMP_DEBUG + team_id = team->t.t_id; +#endif /* KMP_DEBUG */ + master_thread = this_thr->th.th_team_master; +#ifdef KMP_DEBUG + if (master_thread != team->t.t_threads[0]) { + __kmp_print_structure(); + } +#endif /* KMP_DEBUG */ + KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]); + KMP_MB(); + + // Verify state + KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team)); + KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root)); + KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid)); + + if (__kmp_tasking_mode == tskm_extra_barrier) { + __kmp_tasking_barrier(team, this_thr, gtid); + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid, team_id, tid)); + } +# ifdef KMP_DEBUG + if (__kmp_tasking_mode != tskm_immediate_exec) { + KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n", + __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team, + this_thr->th.th_task_team)); + KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team); + } +# endif /* KMP_DEBUG */ + + /* Copy the blocktime info to the thread, where 
__kmp_wait_template() can access it when the + team struct is not guaranteed to exist. Doing these loads causes a cache miss slows + down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite, + since the values are not used by __kmp_wait_template() in that case. */ + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + } + +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_starting(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + + switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + + /* From this point on, the team data structure may be deallocated at any time by the + master thread - it is unsafe to reference it in any of the worker threads. Any per-team + data items that need to be referenced before the end of the barrier should be moved to + the kmp_task_team_t structs. */ + if (KMP_MASTER_TID(tid)) { + if (__kmp_tasking_mode != tskm_immediate_exec) { + // Master shouldn't call decrease_load(). // TODO: enable master threads. + // Master should have th_may_decrease_load == 0. // TODO: enable master threads. 
+ __kmp_task_team_wait(this_thr, team + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } +#if USE_ITT_BUILD + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); +#endif /* USE_ITT_BUILD */ + +# if USE_ITT_BUILD && USE_ITT_NOTIFY + // Join barrier - report frame end + if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) { + kmp_uint64 cur_time = __itt_get_timestamp(); + ident_t * loc = team->t.t_ident; + kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads; + int nproc = this_thr->th.th_team_nproc; + int i; + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; + switch(__kmp_forkjoin_frames_mode) { + case 1: + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + break; + case 2: + __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); + break; + case 3: + if( __itt_metadata_add_ptr ) { + for (i=1; i<nproc; ++i) { + delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); + } + __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0); + } + __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); + this_thr->th.th_frame_time = cur_time; + break; + } + } +# endif /* USE_ITT_BUILD */ + } +#if USE_ITT_BUILD + else { + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) + __kmp_itt_barrier_middle(gtid, itt_sync_obj); + } +#endif /* USE_ITT_BUILD */ + +#if KMP_DEBUG + if (KMP_MASTER_TID(tid)) { + KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", + gtid, team_id, tid, nproc)); + } +#endif /* KMP_DEBUG */ + + // TODO now, mark worker threads as done so they may be disbanded + KMP_MB(); // Flush all pending memory write invalidates. + KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid)); +} + + +// TODO release worker threads' fork barriers as we are ready instead of all at once +void +__kmp_fork_barrier(int gtid, int tid) +{ + KMP_TIME_BLOCK(KMP_fork_barrier); + kmp_info_t *this_thr = __kmp_threads[gtid]; + kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL; +#if USE_ITT_BUILD + void * itt_sync_obj = NULL; +#endif /* USE_ITT_BUILD */ + + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", + gtid, (team != NULL) ? 
team->t.t_id : -1, tid)); + + // th_team pointer only valid for master thread here + if (KMP_MASTER_TID(tid)) { +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + // Create itt barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1); + __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + +#ifdef KMP_DEBUG + register kmp_info_t **other_threads = team->t.t_threads; + register int i; + + // Verify state + KMP_MB(); + + for(i=1; i<team->t.t_nproc; ++i) { + KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n", + gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, + team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, + other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)); + KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) + & ~(KMP_BARRIER_SLEEP_STATE)) + == KMP_INIT_BARRIER_STATE); + KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team); + } +#endif + + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_setup(this_thr, team); + } + + /* The master thread may have changed its blocktime between the join barrier and the + fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can + access it when the team struct is not guaranteed to exist. */ + // See note about the corresponding code in __kmp_join_barrier() being performance-critical + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; + this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set; + } + } // master + + switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) { + case bp_hyper_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_hierarchical_bar: { + __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + case bp_tree_bar: { + KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]); + __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + break; + } + default: { + __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj) ); + } + } + + // Early exit for reaping threads releasing forkjoin barrier + if (TCR_4(__kmp_global.g.g_done)) { + if (this_thr->th.th_task_team != NULL) { + if (KMP_MASTER_TID(tid)) { + TCW_PTR(this_thr->th.th_task_team, NULL); + } + else { + __kmp_unref_task_team(this_thr->th.th_task_team, this_thr); + } + } + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + if (itt_sync_obj) + __kmp_itt_barrier_finished(gtid, itt_sync_obj); + } + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid)); + return; + } + + /* We can now assume that a valid team structure has been allocated by the master and + propagated to all worker threads. The current thread, however, may not be part of the + team, so we can't blindly assume that the team pointer is non-null. 
*/ + team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); + KMP_DEBUG_ASSERT(team != NULL); + tid = __kmp_tid_from_gtid(gtid); + + +#if KMP_BARRIER_ICV_PULL + /* Master thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has + this data before this function is called. We cannot modify __kmp_fork_call() to look at + the fixed ICVs in the master's thread struct, because it is not always the case that the + threads arrays have been allocated when __kmp_fork_call() is executed. */ + KMP_START_EXPLICIT_TIMER(USER_icv_copy); + if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs + // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid)); + __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE); + copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, + &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs); + } + KMP_STOP_EXPLICIT_TIMER(USER_icv_copy); +#endif // KMP_BARRIER_ICV_PULL + + if (__kmp_tasking_mode != tskm_immediate_exec) { + __kmp_task_team_sync(this_thr, team); + } + +#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED + kmp_proc_bind_t proc_bind = team->t.t_proc_bind; + if (proc_bind == proc_bind_intel) { +#endif +#if KMP_MIC + // Call dynamic affinity settings + if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) { + __kmp_balanced_affinity(tid, team->t.t_nproc); + } +#endif +#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED + } + else if ((proc_bind != proc_bind_false) + && (proc_bind != proc_bind_disabled)) { + if (this_thr->th.th_new_place == this_thr->th.th_current_place) { + KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n", + __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place)); + } + else { + __kmp_affinity_set_place(gtid); + } + } +#endif + +#if USE_ITT_BUILD && USE_ITT_NOTIFY + if (__itt_sync_create_ptr || KMP_ITT_DEBUG) { + if (!KMP_MASTER_TID(tid)) { + // Get correct barrier object + itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier); + __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired + } // (prepare called inside barrier_release) + } +#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid)); +} + + +void +__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc ) +{ + KMP_TIME_BLOCK(KMP_setup_icv_copy); + int f; + + KMP_DEBUG_ASSERT(team && new_nproc && new_icvs); + KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); + + /* Master thread's copy of the ICVs was set up on the implicit taskdata in + __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has + this data before this function is called. */ +#if KMP_BARRIER_ICV_PULL + /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where + all of the worker threads can access them and make their own copies after the barrier. 
*/ + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point + copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", + 0, team->t.t_threads[0], team)); +#elif KMP_BARRIER_ICV_PUSH + // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. + KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", + 0, team->t.t_threads[0], team)); +#else + // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. + ngo_load(new_icvs); + KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point + for (f=1; f<new_nproc; ++f) { // Skip the master thread + // TODO: GEH - pass in better source location info since usually NULL here + KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", + f, team->t.t_threads[f], team)); + __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE); + ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); + KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", + f, team->t.t_threads[f], team)); + } + ngo_sync(); +#endif // KMP_BARRIER_ICV_PULL +} diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c index 780bd41cb78..af5c6144c2d 100644 --- a/openmp/runtime/src/kmp_csupport.c +++ b/openmp/runtime/src/kmp_csupport.c @@ -1,7 +1,7 @@ /* * kmp_csupport.c -- kfront linkage support for OpenMP. - * $Revision: 42826 $ - * $Date: 2013-11-20 03:39:45 -0600 (Wed, 20 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -20,6 +20,7 @@ #include "kmp_i18n.h" #include "kmp_itt.h" #include "kmp_error.h" +#include "kmp_stats.h" #define MAX_MESSAGE 512 @@ -35,7 +36,7 @@ * @param flags in for future use (currently ignored) * * Initialize the runtime library. This call is optional; if it is not made then - * it will be implicilty called by attempts to use other library functions. + * it will be implicitly called by attempts to use other library functions. * */ void @@ -276,13 +277,18 @@ Do the actual fork and call the microtask in the relevant number of threads. void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) { + KMP_STOP_EXPLICIT_TIMER(OMP_serial); + KMP_COUNT_BLOCK(OMP_PARALLEL); int gtid = __kmp_entry_gtid(); // maybe to save thr_state is enough here { va_list ap; va_start( ap, microtask ); - __kmp_fork_call( loc, gtid, TRUE, +#if INCLUDE_SSC_MARKS + SSC_MARK_FORKING(); +#endif + __kmp_fork_call( loc, gtid, fork_context_intel, argc, VOLATILE_CAST(microtask_t) microtask, VOLATILE_CAST(launch_t) __kmp_invoke_task_func, @@ -293,10 +299,14 @@ __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) ap #endif ); +#if INCLUDE_SSC_MARKS + SSC_MARK_JOINING(); +#endif __kmp_join_call( loc, gtid ); va_end( ap ); } + KMP_START_EXPLICIT_TIMER(OMP_serial); } #if OMP_40_ENABLED @@ -337,17 +347,18 @@ __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) 
va_start( ap, microtask ); // remember teams entry point and nesting level - this_thr->th.th_team_microtask = microtask; + this_thr->th.th_teams_microtask = microtask; this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host // check if __kmpc_push_num_teams called, set default number of teams otherwise - if ( this_thr->th.th_set_nth_teams == 0 ) { + if ( this_thr->th.th_teams_size.nteams == 0 ) { __kmp_push_num_teams( loc, gtid, 0, 0 ); } KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1); - KMP_DEBUG_ASSERT(this_thr->th.th_set_nth_teams >= 1); + KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1); + KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1); - __kmp_fork_call( loc, gtid, TRUE, + __kmp_fork_call( loc, gtid, fork_context_intel, argc, VOLATILE_CAST(microtask_t) __kmp_teams_master, VOLATILE_CAST(launch_t) __kmp_invoke_teams_master, @@ -358,9 +369,9 @@ __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...) #endif ); __kmp_join_call( loc, gtid ); - this_thr->th.th_team_microtask = NULL; + this_thr->th.th_teams_microtask = NULL; this_thr->th.th_teams_level = 0; - + *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L; va_end( ap ); } #endif /* OMP_40_ENABLED */ @@ -393,252 +404,9 @@ when the condition is false. void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { - kmp_info_t *this_thr; - kmp_team_t *serial_team; - - KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) ); - - /* Skip all this code for autopar serialized loops since it results in - unacceptable overhead */ - if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) ) - return; - - if( ! TCR_4( __kmp_init_parallel ) ) - __kmp_parallel_initialize(); - - this_thr = __kmp_threads[ global_tid ]; - serial_team = this_thr -> th.th_serial_team; - - /* utilize the serialized team held by this thread */ - KMP_DEBUG_ASSERT( serial_team ); - KMP_MB(); - -#if OMP_30_ENABLED - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - KMP_DEBUG_ASSERT( this_thr -> th.th_task_team == this_thr -> th.th_team -> t.t_task_team ); - KMP_DEBUG_ASSERT( serial_team -> t.t_task_team == NULL ); - KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n", - global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) ); - this_thr -> th.th_task_team = NULL; - } -#endif // OMP_30_ENABLED - -#if OMP_40_ENABLED - kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; - if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) { - proc_bind = proc_bind_false; - } - else if ( proc_bind == proc_bind_default ) { - // - // No proc_bind clause was specified, so use the current value - // of proc-bind-var for this parallel region. 
- // - proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; - } - // - // Reset for next parallel region - // - this_thr->th.th_set_proc_bind = proc_bind_default; -#endif /* OMP_3_ENABLED */ - - if( this_thr -> th.th_team != serial_team ) { -#if OMP_30_ENABLED - // Nested level will be an index in the nested nthreads array - int level = this_thr->th.th_team->t.t_level; -#endif - if( serial_team -> t.t_serialized ) { - /* this serial team was already used - * TODO increase performance by making this locks more specific */ - kmp_team_t *new_team; - int tid = this_thr->th.th_info.ds.ds_tid; - - __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock ); - - new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1, -#if OMP_40_ENABLED - proc_bind, -#endif -#if OMP_30_ENABLED - & this_thr->th.th_current_task->td_icvs, -#else - this_thr->th.th_team->t.t_set_nproc[tid], - this_thr->th.th_team->t.t_set_dynamic[tid], - this_thr->th.th_team->t.t_set_nested[tid], - this_thr->th.th_team->t.t_set_blocktime[tid], - this_thr->th.th_team->t.t_set_bt_intervals[tid], - this_thr->th.th_team->t.t_set_bt_set[tid], -#endif // OMP_30_ENABLED - 0); - __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); - KMP_ASSERT( new_team ); - - /* setup new serialized team and install it */ - new_team -> t.t_threads[0] = this_thr; - new_team -> t.t_parent = this_thr -> th.th_team; - serial_team = new_team; - this_thr -> th.th_serial_team = serial_team; - - KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", - global_tid, serial_team ) ); - - - /* TODO the above breaks the requirement that if we run out of - * resources, then we can still guarantee that serialized teams - * are ok, since we may need to allocate a new one */ - } else { - KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", - global_tid, serial_team ) ); - } - - /* we have to initialize this serial team */ - KMP_DEBUG_ASSERT( serial_team->t.t_threads ); - KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr ); - KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team ); - serial_team -> t.t_ident = loc; - serial_team -> t.t_serialized = 1; - serial_team -> t.t_nproc = 1; - serial_team -> t.t_parent = this_thr->th.th_team; -#if OMP_30_ENABLED - serial_team -> t.t_sched = this_thr->th.th_team->t.t_sched; -#endif // OMP_30_ENABLED - this_thr -> th.th_team = serial_team; - serial_team -> t.t_master_tid = this_thr->th.th_info.ds.ds_tid; - -#if OMP_30_ENABLED - KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#d curtask=%p\n", - global_tid, this_thr->th.th_current_task ) ); - KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 ); - this_thr->th.th_current_task->td_flags.executing = 0; - - __kmp_push_current_task_to_thread( this_thr, serial_team, 0 ); - - /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for - each serialized task represented by team->t.t_serialized? 
*/ - copy_icvs( - & this_thr->th.th_current_task->td_icvs, - & this_thr->th.th_current_task->td_parent->td_icvs ); - - // Thread value exists in the nested nthreads array for the next nested level - if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) { - this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ]; - } - -#if OMP_40_ENABLED - if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) { - this_thr->th.th_current_task->td_icvs.proc_bind - = __kmp_nested_proc_bind.bind_types[ level + 1 ]; - } -#endif /* OMP_40_ENABLED */ - -#else /* pre-3.0 icv's */ - serial_team -> t.t_set_nproc[0] = serial_team->t.t_parent-> - t.t_set_nproc[serial_team-> - t.t_master_tid]; - serial_team -> t.t_set_dynamic[0] = serial_team->t.t_parent-> - t.t_set_dynamic[serial_team-> - t.t_master_tid]; - serial_team -> t.t_set_nested[0] = serial_team->t.t_parent-> - t.t_set_nested[serial_team-> - t.t_master_tid]; - serial_team -> t.t_set_blocktime[0] = serial_team->t.t_parent-> - t.t_set_blocktime[serial_team-> - t.t_master_tid]; - serial_team -> t.t_set_bt_intervals[0] = serial_team->t.t_parent-> - t.t_set_bt_intervals[serial_team-> - t.t_master_tid]; - serial_team -> t.t_set_bt_set[0] = serial_team->t.t_parent-> - t.t_set_bt_set[serial_team-> - t.t_master_tid]; -#endif // OMP_30_ENABLED - this_thr -> th.th_info.ds.ds_tid = 0; - - /* set thread cache values */ - this_thr -> th.th_team_nproc = 1; - this_thr -> th.th_team_master = this_thr; - this_thr -> th.th_team_serialized = 1; - -#if OMP_30_ENABLED - serial_team -> t.t_level = serial_team -> t.t_parent -> t.t_level + 1; - serial_team -> t.t_active_level = serial_team -> t.t_parent -> t.t_active_level; -#endif // OMP_30_ENABLED - -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - if ( __kmp_inherit_fp_control ) { - __kmp_store_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word ); - __kmp_store_mxcsr( &serial_team->t.t_mxcsr ); - serial_team->t.t_mxcsr &= KMP_X86_MXCSR_MASK; - serial_team->t.t_fp_control_saved = TRUE; - } else { - serial_team->t.t_fp_control_saved = FALSE; - } -#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ - /* check if we need to allocate dispatch buffers stack */ - KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); - if ( !serial_team->t.t_dispatch->th_disp_buffer ) { - serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *) - __kmp_allocate( sizeof( dispatch_private_info_t ) ); - } - this_thr -> th.th_dispatch = serial_team->t.t_dispatch; - - KMP_MB(); - - } else { - /* this serialized team is already being used, - * that's fine, just add another nested level */ - KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team ); - KMP_DEBUG_ASSERT( serial_team -> t.t_threads ); - KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr ); - ++ serial_team -> t.t_serialized; - this_thr -> th.th_team_serialized = serial_team -> t.t_serialized; - -#if OMP_30_ENABLED - // Nested level will be an index in the nested nthreads array - int level = this_thr->th.th_team->t.t_level; - // Thread value exists in the nested nthreads array for the next nested level - if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) { - this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ]; - } - serial_team -> t.t_level++; - KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n", - global_tid, serial_team, serial_team -> t.t_level ) ); -#else - KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing team %p for 
nested serialized parallel region\n", - global_tid, serial_team ) ); -#endif // OMP_30_ENABLED - - /* allocate/push dispatch buffers stack */ - KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); - { - dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *) - __kmp_allocate( sizeof( dispatch_private_info_t ) ); - disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; - serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; - } - this_thr -> th.th_dispatch = serial_team->t.t_dispatch; - - KMP_MB(); - } - - if ( __kmp_env_consistency_check ) - __kmp_push_parallel( global_tid, NULL ); - -// t_level is not available in 2.5 build, so check for OMP_30_ENABLED -#if USE_ITT_BUILD && OMP_30_ENABLED - // Mark the start of the "parallel" region for VTune. Only use one of frame notification scheme at the moment. - if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) - { - __kmp_itt_region_forking( global_tid, 1 ); - } - if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) - { -#if USE_ITT_NOTIFY - if( this_thr->th.th_team->t.t_level == 1 ) { - this_thr->th.th_frame_time_serialized = __itt_get_timestamp(); - } -#endif - } -#endif /* USE_ITT_BUILD */ - + __kmp_serialized_parallel(loc, global_tid); /* The implementation is now in kmp_runtime.c so that it can share static functions with + * kmp_fork_call since the tasks to be done are similar in each case. + */ } /*! @@ -680,26 +448,13 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) /* If necessary, pop the internal control stack values and replace the team values */ top = serial_team -> t.t_control_stack_top; if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) { -#if OMP_30_ENABLED - copy_icvs( - &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, - top ); -#else - serial_team -> t.t_set_nproc[0] = top -> nproc; - serial_team -> t.t_set_dynamic[0] = top -> dynamic; - serial_team -> t.t_set_nested[0] = top -> nested; - serial_team -> t.t_set_blocktime[0] = top -> blocktime; - serial_team -> t.t_set_bt_intervals[0] = top -> bt_intervals; - serial_team -> t.t_set_bt_set[0] = top -> bt_set; -#endif // OMP_30_ENABLED + copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top ); serial_team -> t.t_control_stack_top = top -> next; __kmp_free(top); } -#if OMP_30_ENABLED //if( serial_team -> t.t_serialized > 1 ) serial_team -> t.t_level--; -#endif // OMP_30_ENABLED /* pop dispatch buffers stack */ KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer); @@ -735,7 +490,6 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) this_thr -> th.th_dispatch = & this_thr -> th.th_team -> t.t_dispatch[ serial_team -> t.t_master_tid ]; -#if OMP_30_ENABLED __kmp_pop_current_task_from_thread( this_thr ); KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 ); @@ -752,32 +506,37 @@ __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n", global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) ); } -#endif // OMP_30_ENABLED - - } - else { - -#if OMP_30_ENABLED + } else { if ( __kmp_tasking_mode != tskm_immediate_exec ) { KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n", global_tid, serial_team, serial_team -> t.t_serialized ) ); } -#endif // 
OMP_30_ENABLED - } -// t_level is not available in 2.5 build, so check for OMP_30_ENABLED -#if USE_ITT_BUILD && OMP_30_ENABLED +#if USE_ITT_BUILD + kmp_uint64 cur_time = 0; +#if USE_ITT_NOTIFY + if( __itt_get_timestamp_ptr ) { + cur_time = __itt_get_timestamp(); + } +#endif /* USE_ITT_NOTIFY */ + // Report the barrier + if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr ) { + if( this_thr->th.th_team->t.t_level == 0 ) { + __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, cur_time, 0, loc, this_thr->th.th_team_nproc, 0 ); + } + } // Mark the end of the "parallel" region for VTune. Only use one of frame notification scheme at the moment. if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) { this_thr->th.th_ident = loc; __kmp_itt_region_joined( global_tid, 1 ); } - if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr ) { - if( this_thr->th.th_team->t.t_level == 0 ) { - __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, __itt_timestamp_none, 0, loc ); - } + if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG ) + { + this_thr->th.th_ident = loc; + // Since barrier frame for serialized region is equal to the region we use the same begin timestamp as for the barrier. + __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time, cur_time, 0, loc, this_thr->th.th_team_nproc, 2 ); } #endif /* USE_ITT_BUILD */ @@ -805,55 +564,50 @@ __kmpc_flush(ident_t *loc, ...) /* need explicit __mf() here since use volatile instead in library */ KMP_MB(); /* Flush all pending memory write invalidates. */ - // This is not an OMP 3.0 feature. - // This macro is used here just not to let the change go to 10.1. - // This change will go to the mainline first. - #if OMP_30_ENABLED - #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) - #if KMP_MIC - // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. - // We shouldn't need it, though, since the ABI rules require that - // * If the compiler generates NGO stores it also generates the fence - // * If users hand-code NGO stores they should insert the fence - // therefore no incomplete unordered stores should be visible. - #else - // C74404 - // This is to address non-temporal store instructions (sfence needed). - // The clflush instruction is addressed either (mfence needed). - // Probably the non-temporal load monvtdqa instruction should also be addressed. - // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2. - if ( ! __kmp_cpuinfo.initialized ) { - __kmp_query_cpuid( & __kmp_cpuinfo ); - }; // if - if ( ! __kmp_cpuinfo.sse2 ) { - // CPU cannot execute SSE2 instructions. - } else { - #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC - _mm_mfence(); - #else - __sync_synchronize(); - #endif // KMP_COMPILER_ICC - }; // if - #endif // KMP_MIC - #elif KMP_ARCH_ARM - // Nothing yet - #elif KMP_ARCH_PPC64 - // Nothing needed here (we have a real MB above). - #if KMP_OS_CNK - // The flushing thread needs to yield here; this prevents a - // busy-waiting thread from saturating the pipeline. flush is - // often used in loops like this: - // while (!flag) { - // #pragma omp flush(flag) - // } - // and adding the yield here is good for at least a 10x speedup - // when running >2 threads per core (on the NAS LU benchmark). 
- __kmp_yield(TRUE); - #endif + #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) + #if KMP_MIC + // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used. + // We shouldn't need it, though, since the ABI rules require that + // * If the compiler generates NGO stores it also generates the fence + // * If users hand-code NGO stores they should insert the fence + // therefore no incomplete unordered stores should be visible. #else - #error Unknown or unsupported architecture + // C74404 + // This is to address non-temporal store instructions (sfence needed). + // The clflush instruction is addressed either (mfence needed). + // Probably the non-temporal load monvtdqa instruction should also be addressed. + // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2. + if ( ! __kmp_cpuinfo.initialized ) { + __kmp_query_cpuid( & __kmp_cpuinfo ); + }; // if + if ( ! __kmp_cpuinfo.sse2 ) { + // CPU cannot execute SSE2 instructions. + } else { + #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC + _mm_mfence(); + #else + __sync_synchronize(); + #endif // KMP_COMPILER_ICC + }; // if + #endif // KMP_MIC + #elif KMP_ARCH_ARM + // Nothing yet + #elif KMP_ARCH_PPC64 + // Nothing needed here (we have a real MB above). + #if KMP_OS_CNK + // The flushing thread needs to yield here; this prevents a + // busy-waiting thread from saturating the pipeline. flush is + // often used in loops like this: + // while (!flag) { + // #pragma omp flush(flag) + // } + // and adding the yield here is good for at least a 10x speedup + // when running >2 threads per core (on the NAS LU benchmark). + __kmp_yield(TRUE); #endif - #endif // OMP_30_ENABLED + #else + #error Unknown or unsupported architecture + #endif } @@ -871,6 +625,8 @@ Execute a barrier. void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) { + KMP_COUNT_BLOCK(OMP_BARRIER); + KMP_TIME_BLOCK(OMP_barrier); int explicit_barrier_flag; KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) ); @@ -906,6 +662,7 @@ __kmpc_barrier(ident_t *loc, kmp_int32 global_tid) kmp_int32 __kmpc_master(ident_t *loc, kmp_int32 global_tid) { + KMP_COUNT_BLOCK(OMP_MASTER); int status = 0; KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) ); @@ -1014,11 +771,6 @@ __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid ) __kmp_parallel_dxo( & gtid, & cid, loc ); } -inline void -__kmp_static_yield( int arg ) { // AC: needed in macro __kmp_acquire_user_lock_with_checks - __kmp_yield( arg ); -} - static kmp_user_lock_p __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid ) { @@ -1082,6 +834,7 @@ This function blocks until the executing thread can enter the critical section. */ void __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) { + KMP_COUNT_BLOCK(OMP_CRITICAL); kmp_user_lock_p lck; @@ -1194,6 +947,9 @@ __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid) if ( __kmp_env_consistency_check ) __kmp_check_barrier( global_tid, ct_barrier, loc ); +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL ); return (status != 0) ? 
0 : 1; @@ -1243,6 +999,9 @@ __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid ) __kmp_check_barrier( global_tid, ct_barrier, loc ); } +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); ret = __kmpc_master (loc, global_tid); @@ -1280,6 +1039,7 @@ introduce an explicit barrier if it is required. kmp_int32 __kmpc_single(ident_t *loc, kmp_int32 global_tid) { + KMP_COUNT_BLOCK(OMP_SINGLE); kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE ); return rc; } @@ -1353,8 +1113,6 @@ ompc_set_nested( int flag ) set__nested( thread, flag ? TRUE : FALSE ); } -#if OMP_30_ENABLED - void ompc_set_max_active_levels( int max_active_levels ) { @@ -1384,8 +1142,6 @@ ompc_get_team_size( int level ) return __kmp_get_team_size( __kmp_entry_gtid(), level ); } -#endif // OMP_30_ENABLED - void kmpc_set_stacksize( int arg ) { @@ -1427,8 +1183,6 @@ kmpc_set_defaults( char const * str ) __kmp_aux_set_defaults( str, strlen( str ) ); } -#ifdef OMP_30_ENABLED - int kmpc_set_affinity_mask_proc( int proc, void **mask ) { @@ -1468,7 +1222,6 @@ kmpc_get_affinity_mask_proc( int proc, void **mask ) #endif } -#endif /* OMP_30_ENABLED */ /* -------------------------------------------------------------------------- */ /*! @@ -1533,6 +1286,9 @@ __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_dat if (didit) *data_ptr = cpy_data; /* This barrier is not a barrier region boundary */ +#if USE_ITT_NOTIFY + __kmp_threads[gtid]->th.th_ident = loc; +#endif __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL ); if (! didit) (*cpy_func)( cpy_data, *data_ptr ); @@ -1540,6 +1296,9 @@ __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_dat /* Consider next barrier the user-visible barrier for barrier region boundaries */ /* Nesting checks are already handled by the single construct checks */ +#if USE_ITT_NOTIFY + __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location) +#endif __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL ); } @@ -1722,6 +1481,7 @@ __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { void __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) { + KMP_COUNT_BLOCK(OMP_set_lock); kmp_user_lock_p lck; if ( ( __kmp_user_lock_kind == lk_tas ) @@ -1866,6 +1626,8 @@ __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) int __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock ) { + KMP_COUNT_BLOCK(OMP_test_lock); + KMP_TIME_BLOCK(OMP_test_lock); kmp_user_lock_p lck; int rc; @@ -2028,9 +1790,14 @@ __kmpc_reduce_nowait( kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck ) { + KMP_COUNT_BLOCK(REDUCE_nowait); int retval; PACKED_REDUCTION_METHOD_T packed_reduction_method; - +#if OMP_40_ENABLED + kmp_team_t *team; + kmp_info_t *th; + int teams_swapped = 0, task_state; +#endif KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) ); // why do we need this initialization here at all? 
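For orientation, the reduction entry points modified in the next hunks are what a compiler targeting this runtime typically calls for an OpenMP reduction clause; the OMP_40_ENABLED block added below extends the same path to reductions attached to a teams construct, which is why the thread temporarily re-registers with its parent team around the reduction barrier. A minimal user-level sketch (illustrative only, not part of the patch; the exact lowering, and whether the waiting or nowait variant is chosen, is compiler-dependent):

/* Illustrative only, not part of the patch: a reduction that a compiler
 * targeting this runtime typically lowers to __kmpc_reduce_nowait(), a
 * combine step, and __kmpc_end_reduce_nowait().  Exact lowering is
 * compiler-dependent. */
#include <stdio.h>

int main(void)
{
    int sum = 0;
    #pragma omp parallel for reduction(+: sum)
    for (int i = 0; i < 1000; ++i)
        sum += i;
    printf("sum = %d\n", sum);   /* 499500 regardless of thread count */
    return 0;
}
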
@@ -2045,7 +1812,25 @@ __kmpc_reduce_nowait( if ( __kmp_env_consistency_check ) __kmp_push_sync( global_tid, ct_reduce, loc, NULL ); - // it's better to check an assertion ASSERT( thr_state == THR_WORK_STATE ) +#if OMP_40_ENABLED + th = __kmp_thread_from_gtid(global_tid); + if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct? + team = th->th.th_team; + if( team->t.t_level == th->th.th_teams_level ) { + // this is reduction at teams construct + KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0 + // Let's swap teams temporarily for the reduction barrier + teams_swapped = 1; + th->th.th_info.ds.ds_tid = team->t.t_master_tid; + th->th.th_team = team->t.t_parent; + th->th.th_task_team = th->th.th_team->t.t_task_team; + th->th.th_team_nproc = th->th.th_team->t.t_nproc; + task_state = th->th.th_task_state; + if( th->th.th_task_team ) + th->th.th_task_state = th->th.th_task_team->tt.tt_state; + } + } +#endif // OMP_40_ENABLED // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable // the variable should be either a construct-specific or thread-specific property, not a team specific property @@ -2091,6 +1876,9 @@ __kmpc_reduce_nowait( // this barrier should be invisible to a customer and to the thread profiler // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose) +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func ); retval = ( retval != 0 ) ? ( 0 ) : ( 1 ); @@ -2108,7 +1896,16 @@ __kmpc_reduce_nowait( KMP_ASSERT( 0 ); // "unexpected method" } - +#if OMP_40_ENABLED + if( teams_swapped ) { + // Restore thread structure + th->th.th_info.ds.ds_tid = 0; + th->th.th_team = team; + th->th.th_task_team = team->t.t_task_team; + th->th.th_team_nproc = team->t.t_nproc; + th->th.th_task_state = task_state; + } +#endif KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) ); return retval; @@ -2187,6 +1984,7 @@ __kmpc_reduce( void (*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck ) { + KMP_COUNT_BLOCK(REDUCE_wait); int retval; PACKED_REDUCTION_METHOD_T packed_reduction_method; @@ -2204,8 +2002,6 @@ __kmpc_reduce( if ( __kmp_env_consistency_check ) __kmp_push_sync( global_tid, ct_reduce, loc, NULL ); - // it's better to check an assertion ASSERT( thr_state == THR_WORK_STATE ) - packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck ); __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method ); @@ -2228,6 +2024,9 @@ __kmpc_reduce( //case tree_reduce_block: // this barrier should be visible to a customer and to the thread profiler // (it's a terminating barrier on constructs if NOWAIT not specified) +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames +#endif retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func ); retval = ( retval != 0 ) ? 
( 0 ) : ( 1 ); @@ -2277,6 +2076,9 @@ __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) __kmp_end_critical_section_reduce_block( loc, global_tid, lck ); // TODO: implicit barrier: should be exposed +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); } else if( packed_reduction_method == empty_reduce_block ) { @@ -2284,11 +2086,17 @@ __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) // usage: if team size == 1, no synchronization is required ( Intel platforms only ) // TODO: implicit barrier: should be exposed +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); } else if( packed_reduction_method == atomic_reduce_block ) { // TODO: implicit barrier: should be exposed +#if USE_ITT_NOTIFY + __kmp_threads[global_tid]->th.th_ident = loc; +#endif __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL ); } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) { @@ -2319,23 +2127,15 @@ __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) kmp_uint64 __kmpc_get_taskid() { - #if OMP_30_ENABLED - - kmp_int32 gtid; - kmp_info_t * thread; - - gtid = __kmp_get_gtid(); - if ( gtid < 0 ) { - return 0; - }; // if - thread = __kmp_thread_from_gtid( gtid ); - return thread->th.th_current_task->td_task_id; - - #else + kmp_int32 gtid; + kmp_info_t * thread; + gtid = __kmp_get_gtid(); + if ( gtid < 0 ) { return 0; - - #endif + }; // if + thread = __kmp_thread_from_gtid( gtid ); + return thread->th.th_current_task->td_task_id; } // __kmpc_get_taskid @@ -2343,25 +2143,17 @@ __kmpc_get_taskid() { kmp_uint64 __kmpc_get_parent_taskid() { - #if OMP_30_ENABLED - - kmp_int32 gtid; - kmp_info_t * thread; - kmp_taskdata_t * parent_task; - - gtid = __kmp_get_gtid(); - if ( gtid < 0 ) { - return 0; - }; // if - thread = __kmp_thread_from_gtid( gtid ); - parent_task = thread->th.th_current_task->td_parent; - return ( parent_task == NULL ? 0 : parent_task->td_task_id ); - - #else + kmp_int32 gtid; + kmp_info_t * thread; + kmp_taskdata_t * parent_task; + gtid = __kmp_get_gtid(); + if ( gtid < 0 ) { return 0; - - #endif + }; // if + thread = __kmp_thread_from_gtid( gtid ); + parent_task = thread->th.th_current_task->td_parent; + return ( parent_task == NULL ? 
0 : parent_task->td_task_id ); } // __kmpc_get_parent_taskid diff --git a/openmp/runtime/src/kmp_debug.c b/openmp/runtime/src/kmp_debug.c index d4e4302bb41..ad049d1f25a 100644 --- a/openmp/runtime/src/kmp_debug.c +++ b/openmp/runtime/src/kmp_debug.c @@ -1,7 +1,7 @@ /* * kmp_debug.c -- debug utilities for the Guide library - * $Revision: 42150 $ - * $Date: 2013-03-15 15:40:38 -0500 (Fri, 15 Mar 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_debug.h b/openmp/runtime/src/kmp_debug.h index ac706ff1ba1..f4288bf5ce2 100644 --- a/openmp/runtime/src/kmp_debug.h +++ b/openmp/runtime/src/kmp_debug.h @@ -1,7 +1,7 @@ /* * kmp_debug.h -- debug / assertion code for Assure library - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp index 02253053032..cc58f493a69 100644 --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -1,7 +1,7 @@ /* * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch. - * $Revision: 42674 $ - * $Date: 2013-09-18 11:12:49 -0500 (Wed, 18 Sep 2013) $ + * $Revision: 43457 $ + * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $ */ @@ -32,6 +32,7 @@ #include "kmp_itt.h" #include "kmp_str.h" #include "kmp_error.h" +#include "kmp_stats.h" #if KMP_OS_WINDOWS && KMP_ARCH_X86 #include <float.h> #endif @@ -39,6 +40,34 @@ /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ +// template for type limits +template< typename T > +struct i_maxmin { + static const T mx; + static const T mn; +}; +template<> +struct i_maxmin< int > { + static const int mx = 0x7fffffff; + static const int mn = 0x80000000; +}; +template<> +struct i_maxmin< unsigned int > { + static const unsigned int mx = 0xffffffff; + static const unsigned int mn = 0x00000000; +}; +template<> +struct i_maxmin< long long > { + static const long long mx = 0x7fffffffffffffffLL; + static const long long mn = 0x8000000000000000LL; +}; +template<> +struct i_maxmin< unsigned long long > { + static const unsigned long long mx = 0xffffffffffffffffLL; + static const unsigned long long mn = 0x0000000000000000LL; +}; +//------------------------------------------------------------------------- + #ifdef KMP_STATIC_STEAL_ENABLED // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types @@ -148,22 +177,6 @@ struct dispatch_shared_info_template { /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ -static void -__kmp_static_delay( int arg ) -{ - /* Work around weird code-gen bug that causes assert to trip */ - #if KMP_ARCH_X86_64 && KMP_OS_LINUX - #else - KMP_ASSERT( arg >= 0 ); - #endif -} - -static void -__kmp_static_yield( int arg ) -{ - __kmp_yield( arg ); -} - #undef USE_TEST_LOCKS // test_then_add template (general template should NOT be used) @@ -294,8 +307,6 @@ __kmp_wait_yield( volatile UT * spinner, /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ - __kmp_static_delay(TRUE); - // if we are oversubscribed, // or have waited a bit (and KMP_LIBRARY=throughput, then yield // pause is in the following code @@ -589,6 +600,9 @@ 
__kmp_dispatch_init( if ( ! TCR_4( __kmp_init_parallel ) ) __kmp_parallel_initialize(); +#if INCLUDE_SSC_MARKS + SSC_MARK_DISPATCH_INIT(); +#endif #ifdef KMP_DEBUG { const char * buff; @@ -606,6 +620,9 @@ __kmp_dispatch_init( active = ! team -> t.t_serialized; th->th.th_ident = loc; +#if USE_ITT_BUILD + kmp_uint64 cur_chunk = chunk; +#endif if ( ! active ) { pr = reinterpret_cast< dispatch_private_info_template< T >* > ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */ @@ -640,23 +657,16 @@ __kmp_dispatch_init( schedule = __kmp_static; } else { if ( schedule == kmp_sch_runtime ) { - #if OMP_30_ENABLED - // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) - schedule = team -> t.t_sched.r_sched_type; - // Detail the schedule if needed (global controls are differentiated appropriately) - if ( schedule == kmp_sch_guided_chunked ) { - schedule = __kmp_guided; - } else if ( schedule == kmp_sch_static ) { - schedule = __kmp_static; - } - // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) - chunk = team -> t.t_sched.chunk; - #else - kmp_r_sched_t r_sched = __kmp_get_schedule_global(); - // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default - schedule = r_sched.r_sched_type; - chunk = r_sched.chunk; - #endif + // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified) + schedule = team -> t.t_sched.r_sched_type; + // Detail the schedule if needed (global controls are differentiated appropriately) + if ( schedule == kmp_sch_guided_chunked ) { + schedule = __kmp_guided; + } else if ( schedule == kmp_sch_static ) { + schedule = __kmp_static; + } + // Use the chunk size specified by OMP_SCHEDULE (or default if not specified) + chunk = team -> t.t_sched.chunk; #ifdef KMP_DEBUG { @@ -678,7 +688,6 @@ __kmp_dispatch_init( } } - #if OMP_30_ENABLED if ( schedule == kmp_sch_auto ) { // mapping and differentiation: in the __kmp_do_serial_initialize() schedule = __kmp_auto; @@ -694,7 +703,6 @@ __kmp_dispatch_init( } #endif } - #endif // OMP_30_ENABLED /* guided analytical not safe for too many threads */ if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) { @@ -848,6 +856,12 @@ __kmp_dispatch_init( break; } } +#if USE_ITT_BUILD + // Calculate chunk for metadata report + if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { + cur_chunk = limit - init + 1; + } +#endif if ( st == 1 ) { pr->u.p.lb = lb + init; pr->u.p.ub = lb + limit; @@ -1101,6 +1115,39 @@ __kmp_dispatch_init( }; // if #endif /* USE_ITT_BUILD */ }; // if + +#if USE_ITT_BUILD + // Report loop metadata + if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { + kmp_uint32 tid = __kmp_tid_from_gtid( gtid ); + if (KMP_MASTER_TID(tid)) { + kmp_uint64 schedtype = 0; + + switch ( schedule ) { + case kmp_sch_static_chunked: + case kmp_sch_static_balanced:// Chunk is calculated in the switch above + break; + case kmp_sch_static_greedy: + cur_chunk = pr->u.p.parm1; + break; + case kmp_sch_dynamic_chunked: + schedtype = 1; + break; + case kmp_sch_guided_iterative_chunked: + case kmp_sch_guided_analytical_chunked: + schedtype = 2; + break; + default: +// Should we put this case under "static"? 
+// case kmp_sch_static_steal: + schedtype = 3; + break; + } + __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); + } + } +#endif /* USE_ITT_BUILD */ + #ifdef KMP_DEBUG { const char * buff; @@ -1302,6 +1349,7 @@ __kmp_dispatch_next( kmp_info_t * th = __kmp_threads[ gtid ]; kmp_team_t * team = th -> th.th_team; + KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL #ifdef KMP_DEBUG { const char * buff; @@ -1323,9 +1371,10 @@ __kmp_dispatch_next( if ( (status = (pr->u.p.tc != 0)) == 0 ) { *p_lb = 0; *p_ub = 0; - if ( p_st != 0 ) { +// if ( p_last != NULL ) +// *p_last = 0; + if ( p_st != NULL ) *p_st = 0; - } if ( __kmp_env_consistency_check ) { if ( pr->pushed_ws != ct_none ) { pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); @@ -1346,7 +1395,10 @@ __kmp_dispatch_next( if ( (status = (init <= trip)) == 0 ) { *p_lb = 0; *p_ub = 0; - if ( p_st != 0 ) *p_st = 0; +// if ( p_last != NULL ) +// *p_last = 0; + if ( p_st != NULL ) + *p_st = 0; if ( __kmp_env_consistency_check ) { if ( pr->pushed_ws != ct_none ) { pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc ); @@ -1363,12 +1415,10 @@ __kmp_dispatch_next( pr->u.p.last_upper = pr->u.p.ub; #endif /* KMP_OS_WINDOWS */ } - if ( p_last ) { + if ( p_last != NULL ) *p_last = last; - } - if ( p_st != 0 ) { + if ( p_st != NULL ) *p_st = incr; - } if ( incr == 1 ) { *p_lb = start + init; *p_ub = start + limit; @@ -1395,19 +1445,15 @@ __kmp_dispatch_next( } // if } else { pr->u.p.tc = 0; - *p_lb = pr->u.p.lb; *p_ub = pr->u.p.ub; #if KMP_OS_WINDOWS pr->u.p.last_upper = *p_ub; #endif /* KMP_OS_WINDOWS */ - - if ( p_st != 0 ) { - *p_st = pr->u.p.st; - } - if ( p_last ) { + if ( p_last != NULL ) *p_last = TRUE; - } + if ( p_st != NULL ) + *p_st = pr->u.p.st; } // if #ifdef KMP_DEBUG { @@ -1415,12 +1461,15 @@ __kmp_dispatch_next( // create format specifiers before the debug output buff = __kmp_str_format( "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \ - "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n", + "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n", traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec ); - KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) ); + KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) ); __kmp_str_free( &buff ); } #endif +#if INCLUDE_SSC_MARKS + SSC_MARK_DISPATCH_NEXT(); +#endif return status; } else { kmp_int32 last = 0; @@ -1572,7 +1621,7 @@ __kmp_dispatch_next( if ( !status ) { *p_lb = 0; *p_ub = 0; - if ( p_st != 0 ) *p_st = 0; + if ( p_st != NULL ) *p_st = 0; } else { start = pr->u.p.parm2; init *= chunk; @@ -1582,10 +1631,7 @@ __kmp_dispatch_next( KMP_DEBUG_ASSERT(init <= trip); if ( (last = (limit >= trip)) != 0 ) limit = trip; - if ( p_last ) { - *p_last = last; - } - if ( p_st != 0 ) *p_st = incr; + if ( p_st != NULL ) *p_st = incr; if ( incr == 1 ) { *p_lb = start + init; @@ -1622,10 +1668,7 @@ __kmp_dispatch_next( *p_lb = pr->u.p.lb; *p_ub = pr->u.p.ub; last = pr->u.p.parm1; - if ( p_last ) { - *p_last = last; - } - if ( p_st ) + if ( p_st != NULL ) *p_st = pr->u.p.st; } else { /* no iterations to do */ pr->u.p.lb = pr->u.p.ub + pr->u.p.st; @@ -1665,10 +1708,7 @@ __kmp_dispatch_next( if ( (last = (limit >= trip)) != 0 ) limit = trip; - if ( p_last ) { - *p_last = last; - } - if ( p_st != 0 ) *p_st = incr; + if ( p_st != NULL ) *p_st = incr; pr->u.p.count += team->t.t_nproc; @@ -1713,7 +1753,7 @@ __kmp_dispatch_next( if ( (status = (init <= trip)) == 0 ) { *p_lb = 0; *p_ub = 0; - if ( p_st != 0 ) 
*p_st = 0; + if ( p_st != NULL ) *p_st = 0; } else { start = pr->u.p.lb; limit = chunk + init - 1; @@ -1721,10 +1761,8 @@ __kmp_dispatch_next( if ( (last = (limit >= trip)) != 0 ) limit = trip; - if ( p_last ) { - *p_last = last; - } - if ( p_st != 0 ) *p_st = incr; + + if ( p_st != NULL ) *p_st = incr; if ( incr == 1 ) { *p_lb = start + init; @@ -1801,8 +1839,6 @@ __kmp_dispatch_next( incr = pr->u.p.st; if ( p_st != NULL ) *p_st = incr; - if ( p_last != NULL ) - *p_last = last; *p_lb = start + init * incr; *p_ub = start + limit * incr; if ( pr->ordered ) { @@ -1906,8 +1942,6 @@ __kmp_dispatch_next( incr = pr->u.p.st; if ( p_st != NULL ) *p_st = incr; - if ( p_last != NULL ) - *p_last = last; *p_lb = start + init * incr; *p_ub = start + limit * incr; if ( pr->ordered ) { @@ -1951,7 +1985,7 @@ __kmp_dispatch_next( if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) { *p_lb = 0; *p_ub = 0; - if ( p_st != 0 ) *p_st = 0; + if ( p_st != NULL ) *p_st = 0; } else { start = pr->u.p.lb; limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1; @@ -1960,10 +1994,7 @@ __kmp_dispatch_next( if ( (last = (limit >= trip)) != 0 ) limit = trip; - if ( p_last != 0 ) { - *p_last = last; - } - if ( p_st != 0 ) *p_st = incr; + if ( p_st != NULL ) *p_st = incr; if ( incr == 1 ) { *p_lb = start + init; @@ -1991,6 +2022,17 @@ __kmp_dispatch_next( } // if } // case break; + default: + { + status = 0; // to avoid complaints on uninitialized variable use + __kmp_msg( + kmp_ms_fatal, // Severity + KMP_MSG( UnknownSchedTypeDetected ), // Primary message + KMP_HNT( GetNewerLibrary ), // Hint + __kmp_msg_null // Variadic argument list terminator + ); + } + break; } // switch } // if tc == 0; @@ -2010,7 +2052,7 @@ __kmp_dispatch_next( } #endif - if ( num_done == team->t.t_nproc-1 ) { + if ( (ST)num_done == team->t.t_nproc-1 ) { /* NOTE: release this buffer to be reused */ KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -2048,6 +2090,8 @@ __kmp_dispatch_next( pr->u.p.last_upper = pr->u.p.ub; } #endif /* KMP_OS_WINDOWS */ + if ( p_last != NULL && status != 0 ) + *p_last = last; } // if #ifdef KMP_DEBUG @@ -2062,9 +2106,129 @@ __kmp_dispatch_next( __kmp_str_free( &buff ); } #endif +#if INCLUDE_SSC_MARKS + SSC_MARK_DISPATCH_NEXT(); +#endif return status; } +template< typename T > +static void +__kmp_dist_get_bounds( + ident_t *loc, + kmp_int32 gtid, + kmp_int32 *plastiter, + T *plower, + T *pupper, + typename traits_t< T >::signed_t incr +) { + KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic); + typedef typename traits_t< T >::unsigned_t UT; + typedef typename traits_t< T >::signed_t ST; + register kmp_uint32 team_id; + register kmp_uint32 nteams; + register UT trip_count; + register kmp_team_t *team; + kmp_info_t * th; + + KMP_DEBUG_ASSERT( plastiter && plower && pupper ); + KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid)); + #ifdef KMP_DEBUG + { + const char * buff; + // create format specifiers before the debug output + buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\ + "iter=(%%%s, %%%s, %%%s) signed?<%s>\n", + traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, + traits_t< T >::spec ); + KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) ); + __kmp_str_free( &buff ); + } + #endif + + if( __kmp_env_consistency_check ) { + if( incr == 0 ) { + __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); + } + if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { + // The loop is illegal. 
+ // Some zero-trip loops maintained by compiler, e.g.: + // for(i=10;i<0;++i) // lower >= upper - run-time check + // for(i=0;i>10;--i) // lower <= upper - run-time check + // for(i=0;i>10;++i) // incr > 0 - compile-time check + // for(i=10;i<0;--i) // incr < 0 - compile-time check + // Compiler does not check the following illegal loops: + // for(i=0;i<10;i+=incr) // where incr<0 + // for(i=10;i>0;i-=incr) // where incr<0 + __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); + } + } + th = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct + team = th->th.th_team; + #if OMP_40_ENABLED + nteams = th->th.th_teams_size.nteams; + #endif + team_id = team->t.t_master_tid; + KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); + + // compute global trip count + if( incr == 1 ) { + trip_count = *pupper - *plower + 1; + } else if(incr == -1) { + trip_count = *plower - *pupper + 1; + } else { + trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case + } + if( trip_count <= nteams ) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || \ + __kmp_static == kmp_sch_static_balanced + ); // Unknown static scheduling type. + // only some teams get single iteration, others get nothing + if( team_id < trip_count ) { + *pupper = *plower = *plower + team_id * incr; + } else { + *plower = *pupper + incr; // zero-trip loop + } + if( plastiter != NULL ) + *plastiter = ( team_id == trip_count - 1 ); + } else { + if( __kmp_static == kmp_sch_static_balanced ) { + register UT chunk = trip_count / nteams; + register UT extras = trip_count % nteams; + *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) ); + *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr ); + if( plastiter != NULL ) + *plastiter = ( team_id == nteams - 1 ); + } else { + register T chunk_inc_count = + ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; + register T upper = *pupper; + KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); + // Unknown static scheduling type. 
+ *plower += team_id * chunk_inc_count; + *pupper = *plower + chunk_inc_count - incr; + // Check/correct bounds if needed + if( incr > 0 ) { + if( *pupper < *plower ) + *pupper = i_maxmin< T >::mx; + if( plastiter != NULL ) + *plastiter = *plower <= upper && *pupper > upper - incr; + if( *pupper > upper ) + *pupper = upper; // tracker C73258 + } else { + if( *pupper > *plower ) + *pupper = i_maxmin< T >::mn; + if( plastiter != NULL ) + *plastiter = *plower >= upper && *pupper < upper - incr; + if( *pupper < upper ) + *pupper = upper; // tracker C73258 + } + } + } +} + //----------------------------------------------------------------------------------------- // Dispatch routines // Transfer call to template< type T > @@ -2091,6 +2255,7 @@ void __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) { + KMP_COUNT_BLOCK(OMP_FOR_dynamic); KMP_DEBUG_ASSERT( __kmp_init_serial ); __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); } @@ -2101,6 +2266,7 @@ void __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) { + KMP_COUNT_BLOCK(OMP_FOR_dynamic); KMP_DEBUG_ASSERT( __kmp_init_serial ); __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); } @@ -2113,6 +2279,7 @@ __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) { + KMP_COUNT_BLOCK(OMP_FOR_dynamic); KMP_DEBUG_ASSERT( __kmp_init_serial ); __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); } @@ -2125,11 +2292,61 @@ __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) { + KMP_COUNT_BLOCK(OMP_FOR_dynamic); KMP_DEBUG_ASSERT( __kmp_init_serial ); __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); } /*! +See @ref __kmpc_dispatch_init_4 + +Difference from __kmpc_dispatch_init set of functions is these functions +are called for composite distribute parallel for construct. Thus before +regular iterations dispatching we need to calc per-team iteration space. + +These functions are all identical apart from the types of the arguments. 
+*/ +void +__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, + kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk ) +{ + KMP_COUNT_BLOCK(OMP_FOR_dynamic); + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st ); + __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); +} + +void +__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, + kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk ) +{ + KMP_COUNT_BLOCK(OMP_FOR_dynamic); + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st ); + __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true ); +} + +void +__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, + kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk ) +{ + KMP_COUNT_BLOCK(OMP_FOR_dynamic); + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st ); + __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); +} + +void +__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule, + kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk ) +{ + KMP_COUNT_BLOCK(OMP_FOR_dynamic); + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st ); + __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true ); +} + +/*! @param loc Source code location @param gtid Global thread id @param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise @@ -2284,8 +2501,6 @@ __kmp_wait_yield_4(volatile kmp_uint32 * spinner, /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ - __kmp_static_delay(TRUE); - /* if we have waited a bit, or are oversubscribed, yield */ /* pause is in the following code */ KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); @@ -2320,8 +2535,6 @@ __kmp_wait_yield_8( volatile kmp_uint64 * spinner, /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort) __kmp_abort_thread(); */ - __kmp_static_delay(TRUE); - // if we are oversubscribed, // or have waited a bit (and KMP_LIBARRY=throughput, then yield // pause is in the following code diff --git a/openmp/runtime/src/kmp_environment.c b/openmp/runtime/src/kmp_environment.c index 0be9dc90c6e..2d69ce71646 100644 --- a/openmp/runtime/src/kmp_environment.c +++ b/openmp/runtime/src/kmp_environment.c @@ -1,7 +1,7 @@ /* * kmp_environment.c -- Handle environment variables OS-independently. - * $Revision: 42263 $ - * $Date: 2013-04-04 11:03:19 -0500 (Thu, 04 Apr 2013) $ + * $Revision: 43084 $ + * $Date: 2014-04-15 09:15:14 -0500 (Tue, 15 Apr 2014) $ */ diff --git a/openmp/runtime/src/kmp_environment.h b/openmp/runtime/src/kmp_environment.h index 06c2ec40cd7..8655277de93 100644 --- a/openmp/runtime/src/kmp_environment.h +++ b/openmp/runtime/src/kmp_environment.h @@ -1,7 +1,7 @@ /* * kmp_environment.h -- Handle environment varoiables OS-independently. 
- * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_error.c b/openmp/runtime/src/kmp_error.c index 99a1728f068..bb69b2e2eeb 100644 --- a/openmp/runtime/src/kmp_error.c +++ b/openmp/runtime/src/kmp_error.c @@ -1,7 +1,7 @@ /* * kmp_error.c -- KPTS functions for error checking at runtime - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_error.h b/openmp/runtime/src/kmp_error.h index f400b2a9b23..09817c88ae9 100644 --- a/openmp/runtime/src/kmp_error.h +++ b/openmp/runtime/src/kmp_error.h @@ -1,7 +1,7 @@ /* * kmp_error.h -- PTS functions for error checking at runtime. - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_ftn_cdecl.c b/openmp/runtime/src/kmp_ftn_cdecl.c index 135a7cb7eb3..e503e6b0eec 100644 --- a/openmp/runtime/src/kmp_ftn_cdecl.c +++ b/openmp/runtime/src/kmp_ftn_cdecl.c @@ -1,7 +1,7 @@ /* * kmp_ftn_cdecl.c -- Fortran __cdecl linkage support for OpenMP. - * $Revision: 42757 $ - * $Date: 2013-10-18 08:20:57 -0500 (Fri, 18 Oct 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index 280d76fe405..8263a657877 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -1,7 +1,7 @@ /* * kmp_ftn_entry.h -- Fortran entry linkage support for OpenMP. - * $Revision: 42798 $ - * $Date: 2013-10-30 16:39:54 -0500 (Wed, 30 Oct 2013) $ + * $Revision: 43435 $ + * $Date: 2014-09-04 15:16:08 -0500 (Thu, 04 Sep 2014) $ */ @@ -217,8 +217,6 @@ FTN_GET_LIBRARY (void) #endif } -#if OMP_30_ENABLED - int FTN_STDCALL FTN_SET_AFFINITY( void **mask ) { @@ -348,8 +346,6 @@ FTN_GET_AFFINITY_MASK_PROC( int KMP_DEREF proc, void **mask ) #endif } -#endif /* OMP_30_ENABLED */ - /* ------------------------------------------------------------------------ */ @@ -391,12 +387,8 @@ xexpand(FTN_GET_MAX_THREADS)( void ) } gtid = __kmp_entry_gtid(); thread = __kmp_threads[ gtid ]; - #if OMP_30_ENABLED //return thread -> th.th_team -> t.t_current_task[ thread->th.th_info.ds.ds_tid ] -> icvs.nproc; return thread -> th.th_current_task -> td_icvs.nproc; - #else - return thread -> th.th_team -> t.t_set_nproc[ thread->th.th_info.ds.ds_tid ]; - #endif #endif } @@ -533,7 +525,7 @@ xexpand(FTN_IN_PARALLEL)( void ) #else kmp_info_t *th = __kmp_entry_thread(); #if OMP_40_ENABLED - if ( th->th.th_team_microtask ) { + if ( th->th.th_teams_microtask ) { // AC: r_in_parallel does not work inside teams construct // where real parallel is inactive, but all threads have same root, // so setting it in one team affects other teams. 
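For orientation, the user-visible behavior behind the entry points touched above: omp_get_max_threads() reads the nthreads-var ICV kept on the current task (td_icvs.nproc), and omp_in_parallel() has to answer per team once a teams construct is active, which is what the th_teams_microtask check guards. A minimal sketch using only standard omp.h calls (illustrative only, not part of the patch):

/* Illustrative only, not part of the patch. */
#include <omp.h>
#include <stdio.h>

int main(void)
{
    omp_set_num_threads(3);          /* sets the nthreads-var ICV */
    printf("max_threads = %d, in_parallel = %d\n",
           omp_get_max_threads(),    /* 3 with default limits */
           omp_in_parallel());       /* 0: not inside a parallel region */

    #pragma omp parallel num_threads(2)
    #pragma omp single
    printf("inside: num_threads = %d, in_parallel = %d\n",
           omp_get_num_threads(),    /* 2 unless the region is serialized */
           omp_in_parallel());       /* nonzero for an active region */
    return 0;
}
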
@@ -546,8 +538,6 @@ xexpand(FTN_IN_PARALLEL)( void ) #endif } -#if OMP_30_ENABLED - void FTN_STDCALL xexpand(FTN_SET_SCHEDULE)( kmp_sched_t KMP_DEREF kind, int KMP_DEREF modifier ) { @@ -667,8 +657,6 @@ xexpand(FTN_IN_FINAL)( void ) #endif } -#endif // OMP_30_ENABLED - #if OMP_40_ENABLED @@ -689,7 +677,7 @@ xexpand(FTN_GET_NUM_TEAMS)( void ) return 1; #else kmp_info_t *thr = __kmp_entry_thread(); - if ( thr->th.th_team_microtask ) { + if ( thr->th.th_teams_microtask ) { kmp_team_t *team = thr->th.th_team; int tlevel = thr->th.th_teams_level; int ii = team->t.t_level; // the level of the teams construct @@ -728,7 +716,7 @@ xexpand(FTN_GET_TEAM_NUM)( void ) return 0; #else kmp_info_t *thr = __kmp_entry_thread(); - if ( thr->th.th_team_microtask ) { + if ( thr->th.th_teams_microtask ) { kmp_team_t *team = thr->th.th_team; int tlevel = thr->th.th_teams_level; // the level of the teams construct int ii = team->t.t_level; @@ -1048,19 +1036,19 @@ FTN_GET_CANCELLATION_STATUS(int cancel_kind) { #endif // OMP_40_ENABLED // GCC compatibility (versioned symbols) -#if KMP_OS_LINUX +#ifdef KMP_USE_VERSION_SYMBOLS /* These following sections create function aliases (dummy symbols) for the omp_* routines. - These aliases will then be versioned according to how libgomp ``versions'' its - symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...) while also retaining the + These aliases will then be versioned according to how libgomp ``versions'' its + symbols (OMP_1.0, OMP_2.0, OMP_3.0, ...) while also retaining the default version which libiomp5 uses: VERSION (defined in exports_so.txt) - If you want to see the versioned symbols for libgomp.so.1 then just type: + If you want to see the versioned symbols for libgomp.so.1 then just type: objdump -T /path/to/libgomp.so.1 | grep omp_ Example: - Step 1) Create __kmp_api_omp_set_num_threads_10_alias + Step 1) Create __kmp_api_omp_set_num_threads_10_alias which is alias of __kmp_api_omp_set_num_threads Step 2) Set __kmp_api_omp_set_num_threads_10_alias to version: omp_set_num_threads@OMP_1.0 Step 2B) Set __kmp_api_omp_set_num_threads to default version : omp_set_num_threads@@VERSION @@ -1092,7 +1080,6 @@ xaliasify(FTN_TEST_NEST_LOCK, 10); xaliasify(FTN_GET_WTICK, 20); xaliasify(FTN_GET_WTIME, 20); -#if OMP_30_ENABLED // OMP_3.0 aliases xaliasify(FTN_SET_SCHEDULE, 30); xaliasify(FTN_GET_SCHEDULE, 30); @@ -1116,7 +1103,6 @@ xaliasify(FTN_TEST_NEST_LOCK, 30); // OMP_3.1 aliases xaliasify(FTN_IN_FINAL, 31); -#endif /* OMP_30_ENABLED */ #if OMP_40_ENABLED // OMP_4.0 aliases @@ -1160,7 +1146,6 @@ xversionify(FTN_TEST_NEST_LOCK, 10, "OMP_1.0"); xversionify(FTN_GET_WTICK, 20, "OMP_2.0"); xversionify(FTN_GET_WTIME, 20, "OMP_2.0"); -#if OMP_30_ENABLED // OMP_3.0 versioned symbols xversionify(FTN_SET_SCHEDULE, 30, "OMP_3.0"); xversionify(FTN_GET_SCHEDULE, 30, "OMP_3.0"); @@ -1186,7 +1171,6 @@ xversionify(FTN_TEST_NEST_LOCK, 30, "OMP_3.0"); // OMP_3.1 versioned symbol xversionify(FTN_IN_FINAL, 31, "OMP_3.1"); -#endif /* OMP_30_ENABLED */ #if OMP_40_ENABLED // OMP_4.0 versioned symbols @@ -1204,7 +1188,7 @@ xversionify(FTN_GET_CANCELLATION, 40, "OMP_4.0"); // OMP_5.0 versioned symbols #endif -#endif /* KMP_OS_LINUX */ +#endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus } //extern "C" diff --git a/openmp/runtime/src/kmp_ftn_extra.c b/openmp/runtime/src/kmp_ftn_extra.c index 6777e01ba98..f3e0aa55675 100644 --- a/openmp/runtime/src/kmp_ftn_extra.c +++ b/openmp/runtime/src/kmp_ftn_extra.c @@ -1,7 +1,7 @@ /* * kmp_ftn_extra.c -- Fortran 'extra' linkage support for OpenMP. 
- * $Revision: 42757 $ - * $Date: 2013-10-18 08:20:57 -0500 (Fri, 18 Oct 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h index d78d846d136..4be6ae84820 100644 --- a/openmp/runtime/src/kmp_ftn_os.h +++ b/openmp/runtime/src/kmp_ftn_os.h @@ -1,7 +1,7 @@ /* * kmp_ftn_os.h -- KPTS Fortran defines header file. - * $Revision: 42745 $ - * $Date: 2013-10-14 17:02:04 -0500 (Mon, 14 Oct 2013) $ + * $Revision: 43354 $ + * $Date: 2014-07-22 17:15:02 -0500 (Tue, 22 Jul 2014) $ */ @@ -472,14 +472,14 @@ #define KMP_API_NAME_GOMP_TASKGROUP_START GOMP_taskgroup_start #define KMP_API_NAME_GOMP_TASKGROUP_END GOMP_taskgroup_end /* Target functions should be taken care of by liboffload */ -//#define KMP_API_NAME_GOMP_TARGET GOMP_target -//#define KMP_API_NAME_GOMP_TARGET_DATA GOMP_target_data -//#define KMP_API_NAME_GOMP_TARGET_END_DATA GOMP_target_end_data -//#define KMP_API_NAME_GOMP_TARGET_UPDATE GOMP_target_update +#define KMP_API_NAME_GOMP_TARGET GOMP_target +#define KMP_API_NAME_GOMP_TARGET_DATA GOMP_target_data +#define KMP_API_NAME_GOMP_TARGET_END_DATA GOMP_target_end_data +#define KMP_API_NAME_GOMP_TARGET_UPDATE GOMP_target_update #define KMP_API_NAME_GOMP_TEAMS GOMP_teams -#if KMP_OS_LINUX && !KMP_OS_CNK && !KMP_ARCH_PPC64 - #define xstr(x) str(x) +#ifdef KMP_USE_VERSION_SYMBOLS + #define xstr(x) str(x) #define str(x) #x // If Linux, xexpand prepends __kmp_api_ to the real API name @@ -494,7 +494,7 @@ __asm__(".symver " xstr(__kmp_api_##api_name##_##version_num##_alias) "," xstr(api_name) "@" version_str "\n\t"); \ __asm__(".symver " xstr(__kmp_api_##api_name) "," xstr(api_name) "@@" default_ver "\n\t") -#else /* KMP_OS_LINUX */ +#else // KMP_USE_VERSION_SYMBOLS #define xstr(x) /* Nothing */ #define str(x) /* Nothing */ @@ -508,7 +508,7 @@ #define xversionify(api_name, version_num, version_str) /* Nothing */ #define versionify(api_name, version_num, version_str, default_ver) /* Nothing */ -#endif /* KMP_OS_LINUX */ +#endif // KMP_USE_VERSION_SYMBOLS #endif /* KMP_FTN_OS_H */ diff --git a/openmp/runtime/src/kmp_ftn_stdcall.c b/openmp/runtime/src/kmp_ftn_stdcall.c index 671bc063f26..07c074816ab 100644 --- a/openmp/runtime/src/kmp_ftn_stdcall.c +++ b/openmp/runtime/src/kmp_ftn_stdcall.c @@ -1,7 +1,7 @@ /* * kmp_ftn_stdcall.c -- Fortran __stdcall linkage support for OpenMP. - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_global.c b/openmp/runtime/src/kmp_global.c index d3c31952d0f..5f188d03c5d 100644 --- a/openmp/runtime/src/kmp_global.c +++ b/openmp/runtime/src/kmp_global.c @@ -1,7 +1,7 @@ /* * kmp_global.c -- KPTS global variables for runtime support library - * $Revision: 42816 $ - * $Date: 2013-11-11 15:33:37 -0600 (Mon, 11 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -25,6 +25,20 @@ kmp_key_t __kmp_gtid_threadprivate_key; kmp_cpuinfo_t __kmp_cpuinfo = { 0 }; // Not initialized +#if KMP_STATS_ENABLED +#include "kmp_stats.h" +// lock for modifying the global __kmp_stats_list +kmp_tas_lock_t __kmp_stats_lock = KMP_TAS_LOCK_INITIALIZER(__kmp_stats_lock); + +// global list of per thread stats, the head is a sentinel node which accumulates all stats produced before __kmp_create_worker is called. 
+kmp_stats_list __kmp_stats_list; + +// thread local pointer to stats node within list +__thread kmp_stats_list* __kmp_stats_thread_ptr = &__kmp_stats_list; + +// gives reference tick for all events (considered the 0 tick) +tsc_tick_count __kmp_stats_start_time; +#endif /* ----------------------------------------------------- */ /* INITIALIZATION VARIABLES */ @@ -53,6 +67,7 @@ unsigned int __kmp_next_wait = KMP_DEFAULT_NEXT_WAIT; /* susequent number of s size_t __kmp_stksize = KMP_DEFAULT_STKSIZE; size_t __kmp_monitor_stksize = 0; // auto adjust size_t __kmp_stkoffset = KMP_DEFAULT_STKOFFSET; +int __kmp_stkpadding = KMP_MIN_STKPADDING; size_t __kmp_malloc_pool_incr = KMP_DEFAULT_MALLOC_POOL_INCR; @@ -94,7 +109,7 @@ char const *__kmp_barrier_type_name [ bs_last_barrier ] = , "reduction" #endif // KMP_FAST_REDUCTION_BARRIER }; -char const *__kmp_barrier_pattern_name [ bp_last_bar ] = { "linear", "tree", "hyper" }; +char const *__kmp_barrier_pattern_name [ bp_last_bar ] = { "linear", "tree", "hyper", "hierarchical" }; int __kmp_allThreadsSpecified = 0; @@ -114,16 +129,17 @@ int __kmp_dflt_team_nth_ub = 0; int __kmp_tp_capacity = 0; int __kmp_tp_cached = 0; int __kmp_dflt_nested = FALSE; -#if OMP_30_ENABLED int __kmp_dflt_max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; /* max_active_levels limit */ -#endif // OMP_30_ENABLED +#if KMP_NESTED_HOT_TEAMS +int __kmp_hot_teams_mode = 0; /* 0 - free extra threads when reduced */ + /* 1 - keep extra threads when reduced */ +int __kmp_hot_teams_max_level = 1; /* nesting level of hot teams */ +#endif enum library_type __kmp_library = library_none; enum sched_type __kmp_sched = kmp_sch_default; /* scheduling method for runtime scheduling */ enum sched_type __kmp_static = kmp_sch_static_greedy; /* default static scheduling method */ enum sched_type __kmp_guided = kmp_sch_guided_iterative_chunked; /* default guided scheduling method */ -#if OMP_30_ENABLED enum sched_type __kmp_auto = kmp_sch_guided_analytical_chunked; /* default auto scheduling method */ -#endif // OMP_30_ENABLED int __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; int __kmp_monitor_wakeups = KMP_MIN_MONITOR_WAKEUPS; int __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( KMP_DEFAULT_BLOCKTIME, KMP_MIN_MONITOR_WAKEUPS ); @@ -242,7 +258,6 @@ unsigned int __kmp_place_num_threads_per_core = 0; unsigned int __kmp_place_core_offset = 0; #endif -#if OMP_30_ENABLED kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams; /* This check ensures that the compiler is passing the correct data type @@ -255,8 +270,6 @@ KMP_BUILD_ASSERT( sizeof(kmp_tasking_flags_t) == 4 ); kmp_int32 __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */ -#endif /* OMP_30_ENABLED */ - #ifdef DEBUG_SUSPEND int __kmp_suspend_count = 0; #endif @@ -364,6 +377,29 @@ kmp_global_t __kmp_global = {{ 0 }}; /* ----------------------------------------------- */ /* GLOBAL SYNCHRONIZATION LOCKS */ /* TODO verify the need for these locks and if they need to be global */ + +#if KMP_USE_INTERNODE_ALIGNMENT +/* Multinode systems have larger cache line granularity which can cause + * false sharing if the alignment is not large enough for these locks */ +KMP_ALIGN_CACHE_INTERNODE + +kmp_bootstrap_lock_t __kmp_initz_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_initz_lock ); /* Control initializations */ +KMP_ALIGN_CACHE_INTERNODE +kmp_bootstrap_lock_t __kmp_forkjoin_lock; /* control fork/join access */ +KMP_ALIGN_CACHE_INTERNODE +kmp_bootstrap_lock_t __kmp_exit_lock; /* exit() is not always thread-safe */ 
+KMP_ALIGN_CACHE_INTERNODE +kmp_bootstrap_lock_t __kmp_monitor_lock; /* control monitor thread creation */ +KMP_ALIGN_CACHE_INTERNODE +kmp_bootstrap_lock_t __kmp_tp_cached_lock; /* used for the hack to allow threadprivate cache and __kmp_threads expansion to co-exist */ + +KMP_ALIGN_CACHE_INTERNODE +kmp_lock_t __kmp_global_lock; /* Control OS/global access */ +KMP_ALIGN_CACHE_INTERNODE +kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ +KMP_ALIGN_CACHE_INTERNODE +kmp_lock_t __kmp_debug_lock; /* Control I/O access for KMP_DEBUG */ +#else KMP_ALIGN_CACHE kmp_bootstrap_lock_t __kmp_initz_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_initz_lock ); /* Control initializations */ @@ -378,6 +414,7 @@ KMP_ALIGN(128) kmp_queuing_lock_t __kmp_dispatch_lock; /* Control dispatch access */ KMP_ALIGN(128) kmp_lock_t __kmp_debug_lock; /* Control I/O access for KMP_DEBUG */ +#endif /* ----------------------------------------------- */ diff --git a/openmp/runtime/src/kmp_gsupport.c b/openmp/runtime/src/kmp_gsupport.c index aa520249674..3cce67c742c 100644 --- a/openmp/runtime/src/kmp_gsupport.c +++ b/openmp/runtime/src/kmp_gsupport.c @@ -1,7 +1,7 @@ /* * kmp_gsupport.c - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -244,7 +244,7 @@ xexpand(KMP_API_NAME_GOMP_ORDERED_END)(void) // The parallel contruct // -#ifdef KMP_DEBUG +#ifndef KMP_DEBUG static #endif /* KMP_DEBUG */ void @@ -255,7 +255,7 @@ __kmp_GOMP_microtask_wrapper(int *gtid, int *npr, void (*task)(void *), } -#ifdef KMP_DEBUG +#ifndef KMP_DEBUG static #endif /* KMP_DEBUG */ void @@ -276,7 +276,7 @@ __kmp_GOMP_parallel_microtask_wrapper(int *gtid, int *npr, } -#ifdef KMP_DEBUG +#ifndef KMP_DEBUG static #endif /* KMP_DEBUG */ void @@ -287,7 +287,7 @@ __kmp_GOMP_fork_call(ident_t *loc, int gtid, microtask_t wrapper, int argc,...) va_list ap; va_start(ap, argc); - rc = __kmp_fork_call(loc, gtid, FALSE, argc, wrapper, __kmp_invoke_task_func, + rc = __kmp_fork_call(loc, gtid, fork_context_gnu, argc, wrapper, __kmp_invoke_task_func, #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX &ap #else @@ -563,7 +563,7 @@ xexpand(KMP_API_NAME_GOMP_LOOP_END_NOWAIT)(void) status = KMP_DISPATCH_NEXT_ULL(&loc, gtid, NULL, \ (kmp_uint64 *)p_lb, (kmp_uint64 *)p_ub, (kmp_int64 *)&stride); \ if (status) { \ - KMP_DEBUG_ASSERT(stride == str2); \ + KMP_DEBUG_ASSERT((long long)stride == str2); \ *p_ub += (str > 0) ? 
1 : -1; \ } \ } \ @@ -666,9 +666,6 @@ PARALLEL_LOOP_START(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED_START), kmp_s PARALLEL_LOOP_START(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME_START), kmp_sch_runtime) -#if OMP_30_ENABLED - - /**/ // // Tasking constructs @@ -742,9 +739,6 @@ xexpand(KMP_API_NAME_GOMP_TASKWAIT)(void) } -#endif /* OMP_30_ENABLED */ - - /**/ // // Sections worksharing constructs @@ -861,9 +855,268 @@ xexpand(KMP_API_NAME_GOMP_SECTIONS_END_NOWAIT)(void) void xexpand(KMP_API_NAME_GOMP_TASKYIELD)(void) { + KA_TRACE(20, ("GOMP_taskyield: T#%d\n", __kmp_get_gtid())) + return; +} + +#if OMP_40_ENABLED // these are new GOMP_4.0 entry points + +void +xexpand(KMP_API_NAME_GOMP_PARALLEL)(void (*task)(void *), void *data, unsigned num_threads, unsigned int flags) +{ + int gtid = __kmp_entry_gtid(); + MKLOC(loc, "GOMP_parallel"); + KA_TRACE(20, ("GOMP_parallel: T#%d\n", gtid)); + + if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { + if (num_threads != 0) { + __kmp_push_num_threads(&loc, gtid, num_threads); + } + if(flags != 0) { + __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); + } + __kmp_GOMP_fork_call(&loc, gtid, + (microtask_t)__kmp_GOMP_microtask_wrapper, 2, task, data); + } + else { + __kmpc_serialized_parallel(&loc, gtid); + } + task(data); + xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(); +} + +void +xexpand(KMP_API_NAME_GOMP_PARALLEL_SECTIONS)(void (*task) (void *), void *data, + unsigned num_threads, unsigned count, unsigned flags) +{ + int gtid = __kmp_entry_gtid(); + int last = FALSE; + MKLOC(loc, "GOMP_parallel_sections"); + KA_TRACE(20, ("GOMP_parallel_sections: T#%d\n", gtid)); + + if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { + if (num_threads != 0) { + __kmp_push_num_threads(&loc, gtid, num_threads); + } + if(flags != 0) { + __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); + } + __kmp_GOMP_fork_call(&loc, gtid, + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, task, data, + num_threads, &loc, kmp_nm_dynamic_chunked, (kmp_int)1, + (kmp_int)count, (kmp_int)1, (kmp_int)1); + } + else { + __kmpc_serialized_parallel(&loc, gtid); + } + + KMP_DISPATCH_INIT(&loc, gtid, kmp_nm_dynamic_chunked, 1, count, 1, 1, TRUE); + + task(data); + xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(); + KA_TRACE(20, ("GOMP_parallel_sections exit: T#%d\n", gtid)); +} + +#define PARALLEL_LOOP(func, schedule) \ + void func (void (*task) (void *), void *data, unsigned num_threads, \ + long lb, long ub, long str, long chunk_sz, unsigned flags) \ + { \ + int gtid = __kmp_entry_gtid(); \ + int last = FALSE; \ + MKLOC(loc, #func); \ + KA_TRACE(20, ( #func ": T#%d, lb 0x%lx, ub 0x%lx, str 0x%lx, chunk_sz 0x%lx\n", \ + gtid, lb, ub, str, chunk_sz )); \ + \ + if (__kmpc_ok_to_fork(&loc) && (num_threads != 1)) { \ + if (num_threads != 0) { \ + __kmp_push_num_threads(&loc, gtid, num_threads); \ + } \ + if (flags != 0) { \ + __kmp_push_proc_bind(&loc, gtid, (kmp_proc_bind_t)flags); \ + } \ + __kmp_GOMP_fork_call(&loc, gtid, \ + (microtask_t)__kmp_GOMP_parallel_microtask_wrapper, 9, \ + task, data, num_threads, &loc, (schedule), lb, \ + (str > 0) ? (ub - 1) : (ub + 1), str, chunk_sz); \ + } \ + else { \ + __kmpc_serialized_parallel(&loc, gtid); \ + } \ + \ + KMP_DISPATCH_INIT(&loc, gtid, (schedule), lb, \ + (str > 0) ? 
(ub - 1) : (ub + 1), str, chunk_sz, \ + (schedule) != kmp_sch_static); \ + task(data); \ + xexpand(KMP_API_NAME_GOMP_PARALLEL_END)(); \ + \ + KA_TRACE(20, ( #func " exit: T#%d\n", gtid)); \ + } + +PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC), kmp_sch_static) +PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC), kmp_sch_dynamic_chunked) +PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED), kmp_sch_guided_chunked) +PARALLEL_LOOP(xexpand(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME), kmp_sch_runtime) + + +void +xexpand(KMP_API_NAME_GOMP_TASKGROUP_START)(void) +{ + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_taskgroup_start"); + KA_TRACE(20, ("GOMP_taskgroup_start: T#%d\n", gtid)); + + __kmpc_taskgroup(&loc, gtid); + + return; +} + +void +xexpand(KMP_API_NAME_GOMP_TASKGROUP_END)(void) +{ + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_taskgroup_end"); + KA_TRACE(20, ("GOMP_taskgroup_end: T#%d\n", gtid)); + + __kmpc_end_taskgroup(&loc, gtid); + + return; +} + +#ifndef KMP_DEBUG +static +#endif /* KMP_DEBUG */ +kmp_int32 __kmp_gomp_to_iomp_cancellation_kind(int gomp_kind) { + kmp_int32 cncl_kind = 0; + switch(gomp_kind) { + case 1: + cncl_kind = cancel_parallel; + break; + case 2: + cncl_kind = cancel_loop; + break; + case 4: + cncl_kind = cancel_sections; + break; + case 8: + cncl_kind = cancel_taskgroup; + break; + } + return cncl_kind; +} + +bool +xexpand(KMP_API_NAME_GOMP_CANCELLATION_POINT)(int which) +{ + if(__kmp_omp_cancellation) { + KMP_FATAL(NoGompCancellation); + } + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_cancellation_point"); + KA_TRACE(20, ("GOMP_cancellation_point: T#%d\n", gtid)); + + kmp_int32 cncl_kind = __kmp_gomp_to_iomp_cancellation_kind(which); + + return __kmpc_cancellationpoint(&loc, gtid, cncl_kind); +} + +bool +xexpand(KMP_API_NAME_GOMP_BARRIER_CANCEL)(void) +{ + if(__kmp_omp_cancellation) { + KMP_FATAL(NoGompCancellation); + } + KMP_FATAL(NoGompCancellation); + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_barrier_cancel"); + KA_TRACE(20, ("GOMP_barrier_cancel: T#%d\n", gtid)); + + return __kmpc_cancel_barrier(&loc, gtid); +} + +bool +xexpand(KMP_API_NAME_GOMP_CANCEL)(int which, bool do_cancel) +{ + if(__kmp_omp_cancellation) { + KMP_FATAL(NoGompCancellation); + } else { + return FALSE; + } + + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_cancel"); + KA_TRACE(20, ("GOMP_cancel: T#%d\n", gtid)); + kmp_int32 cncl_kind = __kmp_gomp_to_iomp_cancellation_kind(which); + + if(do_cancel == FALSE) { + return xexpand(KMP_API_NAME_GOMP_CANCELLATION_POINT)(which); + } else { + return __kmpc_cancel(&loc, gtid, cncl_kind); + } } +bool +xexpand(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL)(void) +{ + if(__kmp_omp_cancellation) { + KMP_FATAL(NoGompCancellation); + } + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_sections_end_cancel"); + KA_TRACE(20, ("GOMP_sections_end_cancel: T#%d\n", gtid)); + + return __kmpc_cancel_barrier(&loc, gtid); +} + +bool +xexpand(KMP_API_NAME_GOMP_LOOP_END_CANCEL)(void) +{ + if(__kmp_omp_cancellation) { + KMP_FATAL(NoGompCancellation); + } + int gtid = __kmp_get_gtid(); + MKLOC(loc, "GOMP_loop_end_cancel"); + KA_TRACE(20, ("GOMP_loop_end_cancel: T#%d\n", gtid)); + + return __kmpc_cancel_barrier(&loc, gtid); +} + +// All target functions are empty as of 2014-05-29 +void +xexpand(KMP_API_NAME_GOMP_TARGET)(int device, void (*fn) (void *), const void *openmp_target, + size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) +{ + return; +} + +void +xexpand(KMP_API_NAME_GOMP_TARGET_DATA)(int device, 
const void *openmp_target, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned char *kinds) +{ + return; +} + +void +xexpand(KMP_API_NAME_GOMP_TARGET_END_DATA)(void) +{ + return; +} + +void +xexpand(KMP_API_NAME_GOMP_TARGET_UPDATE)(int device, const void *openmp_target, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned char *kinds) +{ + return; +} + +void +xexpand(KMP_API_NAME_GOMP_TEAMS)(unsigned int num_teams, unsigned int thread_limit) +{ + return; +} +#endif // OMP_40_ENABLED + + /* The following sections of code create aliases for the GOMP_* functions, then create versioned symbols using the assembler directive .symver. @@ -871,7 +1124,7 @@ xexpand(KMP_API_NAME_GOMP_TASKYIELD)(void) xaliasify and xversionify are defined in kmp_ftn_os.h */ -#if KMP_OS_LINUX +#ifdef KMP_USE_VERSION_SYMBOLS // GOMP_1.0 aliases xaliasify(KMP_API_NAME_GOMP_ATOMIC_END, 10); @@ -917,10 +1170,8 @@ xaliasify(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10); xaliasify(KMP_API_NAME_GOMP_SINGLE_START, 10); // GOMP_2.0 aliases -#if OMP_30_ENABLED xaliasify(KMP_API_NAME_GOMP_TASK, 20); xaliasify(KMP_API_NAME_GOMP_TASKWAIT, 20); -#endif xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20); xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20); xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20); @@ -942,9 +1193,27 @@ xaliasify(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20); xaliasify(KMP_API_NAME_GOMP_TASKYIELD, 30); // GOMP_4.0 aliases -/* TODO: add GOMP_4.0 aliases when corresponding - GOMP_* functions are implemented -*/ +// The GOMP_parallel* entry points below aren't OpenMP 4.0 related. +#if OMP_40_ENABLED +xaliasify(KMP_API_NAME_GOMP_PARALLEL, 40); +xaliasify(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40); +xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40); +xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40); +xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40); +xaliasify(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40); +xaliasify(KMP_API_NAME_GOMP_TASKGROUP_START, 40); +xaliasify(KMP_API_NAME_GOMP_TASKGROUP_END, 40); +xaliasify(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40); +xaliasify(KMP_API_NAME_GOMP_CANCEL, 40); +xaliasify(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40); +xaliasify(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40); +xaliasify(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40); +xaliasify(KMP_API_NAME_GOMP_TARGET, 40); +xaliasify(KMP_API_NAME_GOMP_TARGET_DATA, 40); +xaliasify(KMP_API_NAME_GOMP_TARGET_END_DATA, 40); +xaliasify(KMP_API_NAME_GOMP_TARGET_UPDATE, 40); +xaliasify(KMP_API_NAME_GOMP_TEAMS, 40); +#endif // GOMP_1.0 versioned symbols xversionify(KMP_API_NAME_GOMP_ATOMIC_END, 10, "GOMP_1.0"); @@ -990,10 +1259,8 @@ xversionify(KMP_API_NAME_GOMP_SINGLE_COPY_START, 10, "GOMP_1.0"); xversionify(KMP_API_NAME_GOMP_SINGLE_START, 10, "GOMP_1.0"); // GOMP_2.0 versioned symbols -#if OMP_30_ENABLED xversionify(KMP_API_NAME_GOMP_TASK, 20, "GOMP_2.0"); xversionify(KMP_API_NAME_GOMP_TASKWAIT, 20, "GOMP_2.0"); -#endif xversionify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_NEXT, 20, "GOMP_2.0"); xversionify(KMP_API_NAME_GOMP_LOOP_ULL_DYNAMIC_START, 20, "GOMP_2.0"); xversionify(KMP_API_NAME_GOMP_LOOP_ULL_GUIDED_NEXT, 20, "GOMP_2.0"); @@ -1015,11 +1282,28 @@ xversionify(KMP_API_NAME_GOMP_LOOP_ULL_STATIC_START, 20, "GOMP_2.0"); xversionify(KMP_API_NAME_GOMP_TASKYIELD, 30, "GOMP_3.0"); // GOMP_4.0 versioned symbols -/* TODO: add GOMP_4.0 versioned symbols when corresponding - GOMP_* functions are implemented -*/ +#if OMP_40_ENABLED +xversionify(KMP_API_NAME_GOMP_PARALLEL, 40, "GOMP_4.0"); 
+xversionify(KMP_API_NAME_GOMP_PARALLEL_SECTIONS, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_DYNAMIC, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_GUIDED, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_RUNTIME, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_PARALLEL_LOOP_STATIC, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TASKGROUP_START, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TASKGROUP_END, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_BARRIER_CANCEL, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_CANCEL, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_CANCELLATION_POINT, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_LOOP_END_CANCEL, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_SECTIONS_END_CANCEL, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TARGET, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TARGET_DATA, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TARGET_END_DATA, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TARGET_UPDATE, 40, "GOMP_4.0"); +xversionify(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0"); +#endif -#endif /* KMP_OS_LINUX */ +#endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus } //extern "C" diff --git a/openmp/runtime/src/kmp_i18n.c b/openmp/runtime/src/kmp_i18n.c index d50102ae839..b26d5e714ab 100644 --- a/openmp/runtime/src/kmp_i18n.c +++ b/openmp/runtime/src/kmp_i18n.c @@ -1,7 +1,7 @@ /* * kmp_i18n.c - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43084 $ + * $Date: 2014-04-15 09:15:14 -0500 (Tue, 15 Apr 2014) $ */ @@ -815,7 +815,7 @@ sys_error( // not issue warning if strerror_r() returns `int' instead of expected `char *'. message = __kmp_str_format( "%s", err_msg ); - #else // OS X*, FreeBSD etc. + #else // OS X*, FreeBSD* etc. // XSI version of strerror_r. 
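For readers unfamiliar with the alias/version machinery used in the two blocks above: xaliasify and xversionify (defined in kmp_ftn_os.h) expand to symbol aliases plus GNU assembler .symver directives, so one implementation can be exported under the matching GOMP_* version node. The fragment below is a minimal, hypothetical illustration of the .symver technique with invented names (my_impl_*, FOO_*); it is not the runtime's actual macro expansion, and it only takes effect when the object is linked into a shared library whose version script defines those nodes.

/* Hypothetical sketch of ELF symbol versioning with .symver (GCC only). */
#include <stdio.h>

void my_impl_v1(void) { puts("foo@FOO_1.0"); }
void my_impl_v2(void) { puts("foo@@FOO_2.0"); }

/* Bind the two implementations to version nodes of the public name "foo".
   "@@" marks the default version; the FOO_1.0 and FOO_2.0 nodes must also
   appear in the linker version script used to build the shared object. */
__asm__(".symver my_impl_v1, foo@FOO_1.0");
__asm__(".symver my_impl_v2, foo@@FOO_2.0");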
diff --git a/openmp/runtime/src/kmp_i18n.h b/openmp/runtime/src/kmp_i18n.h index fea8de42648..deb464e19eb 100644 --- a/openmp/runtime/src/kmp_i18n.h +++ b/openmp/runtime/src/kmp_i18n.h @@ -1,7 +1,7 @@ /* * kmp_i18n.h - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_import.c b/openmp/runtime/src/kmp_import.c index d549a2f0a9f..95e8211d2df 100644 --- a/openmp/runtime/src/kmp_import.c +++ b/openmp/runtime/src/kmp_import.c @@ -1,7 +1,7 @@ /* * kmp_import.c - * $Revision: 42286 $ - * $Date: 2013-04-18 10:53:26 -0500 (Thu, 18 Apr 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_io.c b/openmp/runtime/src/kmp_io.c index 23b60816317..03966345744 100644 --- a/openmp/runtime/src/kmp_io.c +++ b/openmp/runtime/src/kmp_io.c @@ -1,7 +1,7 @@ /* * kmp_io.c -- RTL IO - * $Revision: 42150 $ - * $Date: 2013-03-15 15:40:38 -0500 (Fri, 15 Mar 2013) $ + * $Revision: 43236 $ + * $Date: 2014-06-04 16:42:35 -0500 (Wed, 04 Jun 2014) $ */ @@ -171,7 +171,7 @@ __kmp_vprintf( enum kmp_io __kmp_io, char const * format, va_list ap ) int chars = 0; #ifdef KMP_DEBUG_PIDS - chars = sprintf( db, "pid=%d: ", getpid() ); + chars = sprintf( db, "pid=%d: ", (kmp_int32)getpid() ); #endif chars += vsprintf( db, format, ap ); @@ -200,7 +200,8 @@ __kmp_vprintf( enum kmp_io __kmp_io, char const * format, va_list ap ) #if KMP_OS_WINDOWS DWORD count; #ifdef KMP_DEBUG_PIDS - __kmp_str_buf_print( &__kmp_console_buf, "pid=%d: ", getpid() ); + __kmp_str_buf_print( &__kmp_console_buf, "pid=%d: ", + (kmp_int32)getpid() ); #endif __kmp_str_buf_vprint( &__kmp_console_buf, format, ap ); WriteFile( @@ -213,7 +214,7 @@ __kmp_vprintf( enum kmp_io __kmp_io, char const * format, va_list ap ) __kmp_str_buf_clear( &__kmp_console_buf ); #else #ifdef KMP_DEBUG_PIDS - fprintf( __kmp_stderr, "pid=%d: ", getpid() ); + fprintf( __kmp_stderr, "pid=%d: ", (kmp_int32)getpid() ); #endif vfprintf( __kmp_stderr, format, ap ); fflush( __kmp_stderr ); diff --git a/openmp/runtime/src/kmp_io.h b/openmp/runtime/src/kmp_io.h index a81f459ba1b..26db4dad5cc 100644 --- a/openmp/runtime/src/kmp_io.h +++ b/openmp/runtime/src/kmp_io.h @@ -1,7 +1,7 @@ /* * kmp_io.h -- RTL IO header file. - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_itt.c b/openmp/runtime/src/kmp_itt.c index b2ebb04b1eb..bff97f8680a 100644 --- a/openmp/runtime/src/kmp_itt.c +++ b/openmp/runtime/src/kmp_itt.c @@ -1,8 +1,8 @@ #if USE_ITT_BUILD /* * kmp_itt.c -- ITT Notify interface. 
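The kmp_io.c hunks above cast getpid() before formatting it with "%d": pid_t has no guaranteed width across the supported operating systems, so the runtime pins the value to kmp_int32 to keep the format string and its argument in agreement. A tiny standalone illustration of the same idea, using plain int instead of kmp_int32:

/* Why the cast: pid_t's width is implementation-defined; "%d" expects int. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Casting to a known type documents the assumption and avoids a
       format/argument mismatch on platforms where pid_t is not int. */
    printf("pid=%d\n", (int)getpid());
    return 0;
}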
- * $Revision: 42489 $ - * $Date: 2013-07-08 11:00:09 -0500 (Mon, 08 Jul 2013) $ + * $Revision: 43457 $ + * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $ */ @@ -25,8 +25,13 @@ #if USE_ITT_NOTIFY - kmp_int32 __kmp_frame_domain_count = 0; - __itt_domain* __kmp_itt_domains[KMP_MAX_FRAME_DOMAINS]; + kmp_int32 __kmp_barrier_domain_count; + kmp_int32 __kmp_region_domain_count; + __itt_domain* __kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS]; + __itt_domain* __kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS]; + __itt_domain* __kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS]; + kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS]; + __itt_domain * metadata_domain = NULL; #include "kmp_version.h" #include "kmp_i18n.h" diff --git a/openmp/runtime/src/kmp_itt.h b/openmp/runtime/src/kmp_itt.h index 0ee79b6fe5d..c5ddbc645c2 100644 --- a/openmp/runtime/src/kmp_itt.h +++ b/openmp/runtime/src/kmp_itt.h @@ -1,8 +1,8 @@ #if USE_ITT_BUILD /* * kmp_itt.h -- ITT Notify interface. - * $Revision: 42829 $ - * $Date: 2013-11-21 05:44:01 -0600 (Thu, 21 Nov 2013) $ + * $Revision: 43457 $ + * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $ */ @@ -55,12 +55,20 @@ void __kmp_itt_destroy(); // __kmp_itt_xxxed() function should be called after action. // --- Parallel region reporting --- -__kmp_inline void __kmp_itt_region_forking( int gtid, int serialized = 0 ); // Master only, before forking threads. +__kmp_inline void __kmp_itt_region_forking( int gtid, int team_size, int barriers, int serialized = 0 ); // Master only, before forking threads. __kmp_inline void __kmp_itt_region_joined( int gtid, int serialized = 0 ); // Master only, after joining threads. // (*) Note: A thread may execute tasks after this point, though. // --- Frame reporting --- -__kmp_inline void __kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t *loc ); +// region = 0 - no regions, region = 1 - parallel, region = 2 - serialized parallel +__kmp_inline void __kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t *loc, int team_size, int region = 0 ); + +// --- Metadata reporting --- +// begin/end - begin/end timestamps of a barrier frame, imbalance - aggregated wait time value, reduction -if this is a reduction barrier +__kmp_inline void __kmp_itt_metadata_imbalance( int gtid, kmp_uint64 begin, kmp_uint64 end, kmp_uint64 imbalance, kmp_uint64 reduction ); +// sched_type: 0 - static, 1 - dynamic, 2 - guided, 3 - custom (all others); iterations - loop trip count, chunk - chunk size +__kmp_inline void __kmp_itt_metadata_loop( ident_t * loc, kmp_uint64 sched_type, kmp_uint64 iterations, kmp_uint64 chunk ); +__kmp_inline void __kmp_itt_metadata_single(); // --- Barrier reporting --- __kmp_inline void * __kmp_itt_barrier_object( int gtid, int bt, int set_name = 0, int delta = 0 ); @@ -135,8 +143,12 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller); #if (INCLUDE_SSC_MARKS && KMP_OS_LINUX && KMP_ARCH_X86_64) // Portable (at least for gcc and icc) code to insert the necessary instructions // to set %ebx and execute the unlikely no-op. 
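The hunk just below switches INSERT_SSC_MARK to the Intel compiler's __SSC_MARK intrinsic when available and keeps the inline-asm form otherwise: the tag is loaded into %ebx and a distinctively prefixed NOP (0x64 0x67 0x90) is executed, which an architecture simulator can recognize while real hardware treats the whole sequence as a no-op. A self-contained sketch of that inline-asm pattern (x86-64, GCC-style asm; the tag values here are arbitrary, not the runtime's):

/* Simulator markers: harmless on real silicon, recognizable in a simulator. */
#include <stdio.h>

#if defined(__x86_64__) && defined(__GNUC__)
# define INSERT_MARK(tag) \
    __asm__ __volatile__("movl %0, %%ebx; .byte 0x64, 0x67, 0x90" \
                         :: "i"(tag) : "%ebx")
#else
# define INSERT_MARK(tag) ((void)0)
#endif

int main(void)
{
    INSERT_MARK(0x4376);          /* e.g. "spin start" in the runtime */
    puts("work between markers");
    INSERT_MARK(0x4377);          /* e.g. "spin end" */
    return 0;
}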
- # define INSERT_SSC_MARK(tag) \ - __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(tag):"%ebx") + #if defined( __INTEL_COMPILER ) + # define INSERT_SSC_MARK(tag) __SSC_MARK(tag) + #else + # define INSERT_SSC_MARK(tag) \ + __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(tag):"%ebx") + #endif #else # define INSERT_SSC_MARK(tag) ((void)0) #endif @@ -150,6 +162,18 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller); #define SSC_MARK_SPIN_START() INSERT_SSC_MARK(0x4376) #define SSC_MARK_SPIN_END() INSERT_SSC_MARK(0x4377) + // Markers for architecture simulation. + // FORKING : Before the master thread forks. + // JOINING : At the start of the join. + // INVOKING : Before the threads invoke microtasks. + // DISPATCH_INIT: At the start of dynamically scheduled loop. + // DISPATCH_NEXT: After claming next iteration of dynamically scheduled loop. + #define SSC_MARK_FORKING() INSERT_SSC_MARK(0xd693) + #define SSC_MARK_JOINING() INSERT_SSC_MARK(0xd694) + #define SSC_MARK_INVOKING() INSERT_SSC_MARK(0xd695) + #define SSC_MARK_DISPATCH_INIT() INSERT_SSC_MARK(0xd696) + #define SSC_MARK_DISPATCH_NEXT() INSERT_SSC_MARK(0xd697) + // The object is an address that associates a specific set of the prepare, acquire, release, // and cancel operations. @@ -227,8 +251,14 @@ __kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller); const int KMP_MAX_FRAME_DOMAINS = 512; // Maximum number of frame domains to use (maps to // different OpenMP regions in the user source code). - extern kmp_int32 __kmp_frame_domain_count; - extern __itt_domain* __kmp_itt_domains[KMP_MAX_FRAME_DOMAINS]; + extern kmp_int32 __kmp_barrier_domain_count; + extern kmp_int32 __kmp_region_domain_count; + extern __itt_domain* __kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS]; + extern __itt_domain* __kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS]; + extern __itt_domain* __kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS]; + extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS]; + extern __itt_domain * metadata_domain; + #else // Null definitions of the synchronization tracing functions. diff --git a/openmp/runtime/src/kmp_itt.inl b/openmp/runtime/src/kmp_itt.inl index 56953d02b7a..70aafcce7d1 100644 --- a/openmp/runtime/src/kmp_itt.inl +++ b/openmp/runtime/src/kmp_itt.inl @@ -1,8 +1,8 @@ #if USE_ITT_BUILD /* * kmp_itt.inl -- Inline functions of ITT Notify. - * $Revision: 42866 $ - * $Date: 2013-12-10 15:15:58 -0600 (Tue, 10 Dec 2013) $ + * $Revision: 43457 $ + * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $ */ @@ -63,6 +63,8 @@ #endif #endif +static kmp_bootstrap_lock_t metadata_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( metadata_lock ); + /* ------------------------------------------------------------------------------------------------ Parallel region reporting. @@ -89,12 +91,10 @@ // ------------------------------------------------------------------------------------------------- LINKAGE void -__kmp_itt_region_forking( int gtid, int serialized ) { +__kmp_itt_region_forking( int gtid, int team_size, int barriers, int serialized ) { #if USE_ITT_NOTIFY kmp_team_t * team = __kmp_team_from_gtid( gtid ); -#if OMP_30_ENABLED if (team->t.t_active_level + serialized > 1) -#endif { // The frame notifications are only supported for the outermost teams. return; @@ -105,40 +105,81 @@ __kmp_itt_region_forking( int gtid, int serialized ) { // Assume that reserved_2 contains zero initially. Since zero is special // value here, store the index into domain array increased by 1. 
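As the surrounding comments explain, __kmp_itt_region_forking (continued below) claims a slot in the new region/barrier domain tables with an atomic fetch-and-increment, reverts the counter if the 512-entry cap is exceeded, and caches the 1-based slot number in ident_t::reserved_2 (region index in the low 16 bits, barrier index in the high 16 bits). The sketch below restates that claim-and-pack pattern with C11 atomics and invented names; the runtime itself uses KMP_TEST_THEN_INC32 / KMP_TEST_THEN_DEC32.

/* Claim a slot in a bounded table atomically; pack the 1-based index. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DOMAINS 512                    /* fits easily in 16 bits */

static atomic_int domain_count;            /* next free slot */

/* Returns a 0-based slot, or -1 if the table is full. */
static int claim_slot(void)
{
    int idx = atomic_fetch_add(&domain_count, 1);   /* "old" value */
    if (idx >= MAX_DOMAINS) {
        atomic_fetch_sub(&domain_count, 1);         /* revert the count */
        return -1;
    }
    return idx;
}

int main(void)
{
    uint32_t reserved = 0;                 /* zero means "nothing cached yet" */
    int region = claim_slot();
    int barrier = claim_slot();
    if (region >= 0)
        reserved |= (uint32_t)(region + 1);         /* low 16 bits  */
    if (barrier >= 0)
        reserved |= (uint32_t)(barrier + 1) << 16;  /* high 16 bits */
    printf("region slot %d, barrier slot %d, packed 0x%08x\n",
           (int)(reserved & 0xFFFF) - 1, (int)(reserved >> 16) - 1, reserved);
    return 0;
}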
if (loc->reserved_2 == 0) { - if (__kmp_frame_domain_count < KMP_MAX_FRAME_DOMAINS) { - int frm = KMP_TEST_THEN_INC32( & __kmp_frame_domain_count ); // get "old" value + if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = KMP_TEST_THEN_INC32( & __kmp_region_domain_count ); // get "old" value if (frm >= KMP_MAX_FRAME_DOMAINS) { - KMP_TEST_THEN_DEC32( & __kmp_frame_domain_count ); // revert the count + KMP_TEST_THEN_DEC32( & __kmp_region_domain_count ); // revert the count return; // loc->reserved_2 is still 0 } //if (!KMP_COMPARE_AND_STORE_ACQ32( &loc->reserved_2, 0, frm + 1 )) { // frm = loc->reserved_2 - 1; // get value saved by other thread for same loc //} // AC: this block is to replace next unsynchronized line - loc->reserved_2 = frm + 1; // save "new" value + + // We need to save indexes for both region and barrier frames. We'll use loc->reserved_2 + // field but put region index to the low two bytes and barrier indexes to the high + // two bytes. It is OK because KMP_MAX_FRAME_DOMAINS = 512. + loc->reserved_2 |= (frm + 1); // save "new" value // Transform compiler-generated region location into the format // that the tools more or less standardized on: // "<func>$omp$parallel@[file:]<line>[:<col>]" const char * buff = NULL; kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); - buff = __kmp_str_format("%s$omp$parallel@%s:%d:%d", - str_loc.func, str_loc.file, + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", + str_loc.func, team_size, str_loc.file, str_loc.line, str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff ); + __itt_suppress_pop(); + + __kmp_str_free( &buff ); + if( barriers ) { + if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = KMP_TEST_THEN_INC32( & __kmp_barrier_domain_count ); // get "old" value + if (frm >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32( & __kmp_barrier_domain_count ); // revert the count + return; // loc->reserved_2 is still 0 + } + const char * buff = NULL; + buff = __kmp_str_format("%s$omp$barrier@%s:%d", + str_loc.func, str_loc.file, str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_barrier_domains[ frm ] = __itt_domain_create( buff ); + __itt_suppress_pop(); + __kmp_str_free( &buff ); + // Save the barrier frame index to the high two bytes. + loc->reserved_2 |= (frm + 1) << 16; + } + } __kmp_str_loc_free( &str_loc ); + __itt_frame_begin_v3(__kmp_itt_region_domains[ frm ], NULL); + } + } else { // Region domain exists for this location + // Check if team size was changed. Then create new region domain for this location + int frm = (loc->reserved_2 & 0x0000FFFF) - 1; + if( __kmp_itt_region_team_size[frm] != team_size ) { + const char * buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", + str_loc.func, team_size, str_loc.file, + str_loc.line, str_loc.col); __itt_suppress_push(__itt_suppress_memory_errors); - __kmp_itt_domains[ frm ] = __itt_domain_create( buff ); + __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff ); __itt_suppress_pop(); __kmp_str_free( &buff ); - __itt_frame_begin_v3(__kmp_itt_domains[ frm ], NULL); + __kmp_str_loc_free( &str_loc ); + __kmp_itt_region_team_size[frm] = team_size; + __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL); + } else { // Team size was not changed. Use existing domain. 
+ __itt_frame_begin_v3(__kmp_itt_region_domains[frm], NULL); } - } else { // if it is not 0 then it should be <= KMP_MAX_FRAME_DOMAINS - __itt_frame_begin_v3(__kmp_itt_domains[loc->reserved_2 - 1], NULL); } KMP_ITT_DEBUG_LOCK(); - KMP_ITT_DEBUG_PRINT( "[frm beg] gtid=%d, idx=%d, serialized:%d, loc:%p\n", - gtid, loc->reserved_2 - 1, serialized, loc ); + KMP_ITT_DEBUG_PRINT( "[frm beg] gtid=%d, idx=%x, serialized:%d, loc:%p\n", + gtid, loc->reserved_2, serialized, loc ); } #endif } // __kmp_itt_region_forking @@ -146,51 +187,208 @@ __kmp_itt_region_forking( int gtid, int serialized ) { // ------------------------------------------------------------------------------------------------- LINKAGE void -__kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t * loc ) { +__kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t * loc, int team_size, int region ) { #if USE_ITT_NOTIFY + if( region ) { + kmp_team_t * team = __kmp_team_from_gtid( gtid ); + int serialized = ( region == 2 ? 1 : 0 ); + if (team->t.t_active_level + serialized > 1) + { + // The frame notifications are only supported for the outermost teams. + return; + } + //Check region domain has not been created before. It's index is saved in the low two bytes. + if ((loc->reserved_2 & 0x0000FFFF) == 0) { + if (__kmp_region_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = KMP_TEST_THEN_INC32( & __kmp_region_domain_count ); // get "old" value + if (frm >= KMP_MAX_FRAME_DOMAINS) { + KMP_TEST_THEN_DEC32( & __kmp_region_domain_count ); // revert the count + return; // loc->reserved_2 is still 0 + } + + // We need to save indexes for both region and barrier frames. We'll use loc->reserved_2 + // field but put region index to the low two bytes and barrier indexes to the high + // two bytes. It is OK because KMP_MAX_FRAME_DOMAINS = 512. + loc->reserved_2 |= (frm + 1); // save "new" value + + // Transform compiler-generated region location into the format + // that the tools more or less standardized on: + // "<func>$omp$parallel:team_size@[file:]<line>[:<col>]" + const char * buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", + str_loc.func, team_size, str_loc.file, + str_loc.line, str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff ); + __itt_suppress_pop(); + + __kmp_str_free( &buff ); + __kmp_str_loc_free( &str_loc ); + __kmp_itt_region_team_size[frm] = team_size; + __itt_frame_submit_v3(__kmp_itt_region_domains[ frm ], NULL, begin, end ); + } + } else { // Region domain exists for this location + // Check if team size was changed. Then create new region domain for this location + int frm = (loc->reserved_2 & 0x0000FFFF) - 1; + if( __kmp_itt_region_team_size[frm] != team_size ) { + const char * buff = NULL; + kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); + buff = __kmp_str_format("%s$omp$parallel:%d@%s:%d:%d", + str_loc.func, team_size, str_loc.file, + str_loc.line, str_loc.col); + + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_region_domains[ frm ] = __itt_domain_create( buff ); + __itt_suppress_pop(); + + __kmp_str_free( &buff ); + __kmp_str_loc_free( &str_loc ); + __kmp_itt_region_team_size[frm] = team_size; + __itt_frame_submit_v3(__kmp_itt_region_domains[ frm ], NULL, begin, end ); + } else { // Team size was not changed. Use existing domain. 
+ __itt_frame_submit_v3(__kmp_itt_region_domains[ frm ], NULL, begin, end ); + } + } + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT( "[reg sub] gtid=%d, idx=%x, region:%d, loc:%p, beg:%llu, end:%llu\n", + gtid, loc->reserved_2, region, loc, begin, end ); + return; + } else { // called for barrier reporting if (loc) { - if (loc->reserved_2 == 0) { - if (__kmp_frame_domain_count < KMP_MAX_FRAME_DOMAINS) { - int frm = KMP_TEST_THEN_INC32( & __kmp_frame_domain_count ); // get "old" value + if ((loc->reserved_2 & 0xFFFF0000) == 0) { + if (__kmp_barrier_domain_count < KMP_MAX_FRAME_DOMAINS) { + int frm = KMP_TEST_THEN_INC32( & __kmp_barrier_domain_count ); // get "old" value if (frm >= KMP_MAX_FRAME_DOMAINS) { - KMP_TEST_THEN_DEC32( & __kmp_frame_domain_count ); // revert the count + KMP_TEST_THEN_DEC32( & __kmp_barrier_domain_count ); // revert the count return; // loc->reserved_2 is still 0 } - // Should it be synchronized? See the comment in __kmp_itt_region_forking - loc->reserved_2 = frm + 1; // save "new" value + // Save the barrier frame index to the high two bytes. + loc->reserved_2 |= (frm + 1) << 16; // save "new" value // Transform compiler-generated region location into the format // that the tools more or less standardized on: // "<func>$omp$frame@[file:]<line>[:<col>]" - const char * buff = NULL; kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); if( imbalance ) { - buff = __kmp_str_format("%s$omp$barrier-imbalance@%s:%d", - str_loc.func, str_loc.file, str_loc.col); + const char * buff_imb = NULL; + buff_imb = __kmp_str_format("%s$omp$barrier-imbalance:%d@%s:%d", + str_loc.func, team_size, str_loc.file, str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_imbalance_domains[ frm ] = __itt_domain_create( buff_imb ); + __itt_suppress_pop(); + __itt_frame_submit_v3(__kmp_itt_imbalance_domains[ frm ], NULL, begin, end ); + __kmp_str_free( &buff_imb ); } else { + const char * buff = NULL; buff = __kmp_str_format("%s$omp$barrier@%s:%d", str_loc.func, str_loc.file, str_loc.col); + __itt_suppress_push(__itt_suppress_memory_errors); + __kmp_itt_barrier_domains[ frm ] = __itt_domain_create( buff ); + __itt_suppress_pop(); + __itt_frame_submit_v3(__kmp_itt_barrier_domains[ frm ], NULL, begin, end ); + __kmp_str_free( &buff ); } __kmp_str_loc_free( &str_loc ); - - __itt_suppress_push(__itt_suppress_memory_errors); - __kmp_itt_domains[ frm ] = __itt_domain_create( buff ); - __itt_suppress_pop(); - - __kmp_str_free( &buff ); - __itt_frame_submit_v3(__kmp_itt_domains[ frm ], NULL, begin, end ); } } else { // if it is not 0 then it should be <= KMP_MAX_FRAME_DOMAINS - __itt_frame_submit_v3(__kmp_itt_domains[loc->reserved_2 - 1], NULL, begin, end ); + if( imbalance ) { + __itt_frame_submit_v3(__kmp_itt_imbalance_domains[ (loc->reserved_2 >> 16) - 1 ], NULL, begin, end ); + } else { + __itt_frame_submit_v3(__kmp_itt_barrier_domains[(loc->reserved_2 >> 16) - 1], NULL, begin, end ); + } } + KMP_ITT_DEBUG_LOCK(); + KMP_ITT_DEBUG_PRINT( "[frm sub] gtid=%d, idx=%x, loc:%p, beg:%llu, end:%llu\n", + gtid, loc->reserved_2, loc, begin, end ); + } } - #endif } // __kmp_itt_frame_submit // ------------------------------------------------------------------------------------------------- LINKAGE void +__kmp_itt_metadata_imbalance( int gtid, kmp_uint64 begin, kmp_uint64 end, kmp_uint64 imbalance, kmp_uint64 reduction ) { +#if USE_ITT_NOTIFY + if( metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock( & metadata_lock ); + if( metadata_domain == NULL) { + 
__itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create( "OMP Metadata" ); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock( & metadata_lock ); + } + + __itt_string_handle * string_handle = __itt_string_handle_create( "omp_metadata_imbalance"); + + kmp_uint64 imbalance_data[ 4 ]; + imbalance_data[ 0 ] = begin; + imbalance_data[ 1 ] = end; + imbalance_data[ 2 ] = imbalance; + imbalance_data[ 3 ] = reduction; + + __itt_metadata_add(metadata_domain, __itt_null, string_handle, __itt_metadata_u64, 4, imbalance_data); +#endif +} // __kmp_itt_metadata_imbalance + +// ------------------------------------------------------------------------------------------------- + +LINKAGE void +__kmp_itt_metadata_loop( ident_t * loc, kmp_uint64 sched_type, kmp_uint64 iterations, kmp_uint64 chunk ) { +#if USE_ITT_NOTIFY + if( metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock( & metadata_lock ); + if( metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create( "OMP Metadata" ); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock( & metadata_lock ); + } + + __itt_string_handle * string_handle = __itt_string_handle_create( "omp_metadata_loop"); + kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 ); + + kmp_uint64 loop_data[ 5 ]; + loop_data[ 0 ] = str_loc.line; + loop_data[ 1 ] = str_loc.col; + loop_data[ 2 ] = sched_type; + loop_data[ 3 ] = iterations; + loop_data[ 4 ] = chunk; + + __kmp_str_loc_free( &str_loc ); + + __itt_metadata_add(metadata_domain, __itt_null, string_handle, __itt_metadata_u64, 5, loop_data); +#endif +} // __kmp_itt_metadata_loop + +// ------------------------------------------------------------------------------------------------- + +LINKAGE void +__kmp_itt_metadata_single( ) { +#if USE_ITT_NOTIFY + if( metadata_domain == NULL) { + __kmp_acquire_bootstrap_lock( & metadata_lock ); + if( metadata_domain == NULL) { + __itt_suppress_push(__itt_suppress_memory_errors); + metadata_domain = __itt_domain_create( "OMP Metadata" ); + __itt_suppress_pop(); + } + __kmp_release_bootstrap_lock( & metadata_lock ); + } + + __itt_string_handle * string_handle = __itt_string_handle_create( "omp_metadata_single"); + + __itt_metadata_add(metadata_domain, __itt_null, string_handle, __itt_metadata_u64, 0, NULL); +#endif +} // __kmp_itt_metadata_single + +// ------------------------------------------------------------------------------------------------- + +LINKAGE void __kmp_itt_region_starting( int gtid ) { #if USE_ITT_NOTIFY #endif @@ -210,19 +408,21 @@ LINKAGE void __kmp_itt_region_joined( int gtid, int serialized ) { #if USE_ITT_NOTIFY kmp_team_t * team = __kmp_team_from_gtid( gtid ); -#if OMP_30_ENABLED if (team->t.t_active_level + serialized > 1) -#endif { // The frame notifications are only supported for the outermost teams. 
return; } ident_t * loc = __kmp_thread_from_gtid( gtid )->th.th_ident; - if (loc && loc->reserved_2 && loc->reserved_2 <= KMP_MAX_FRAME_DOMAINS) { - KMP_ITT_DEBUG_LOCK(); - __itt_frame_end_v3(__kmp_itt_domains[loc->reserved_2 - 1], NULL); - KMP_ITT_DEBUG_PRINT( "[frm end] gtid=%d, idx=%d, serialized:%d, loc:%p\n", - gtid, loc->reserved_2 - 1, serialized, loc ); + if (loc && loc->reserved_2) + { + int frm = (loc->reserved_2 & 0x0000FFFF) - 1; + if(frm < KMP_MAX_FRAME_DOMAINS) { + KMP_ITT_DEBUG_LOCK(); + __itt_frame_end_v3(__kmp_itt_region_domains[frm], NULL); + KMP_ITT_DEBUG_PRINT( "[frm end] gtid=%d, idx=%x, serialized:%d, loc:%p\n", + gtid, loc->reserved_2, serialized, loc ); + } } #endif } // __kmp_itt_region_joined @@ -409,8 +609,6 @@ __kmp_itt_barrier_finished( int gtid, void * object ) { #endif } // __kmp_itt_barrier_finished -#if OMP_30_ENABLED - /* ------------------------------------------------------------------------------------------------ Taskwait reporting. @@ -507,8 +705,6 @@ __kmp_itt_task_finished( // ------------------------------------------------------------------------------------------------- -#endif /* OMP_30_ENABLED */ - /* ------------------------------------------------------------------------------------------------ Lock reporting. @@ -757,7 +953,11 @@ __kmp_itt_thread_name( int gtid ) { if ( __itt_thr_name_set_ptr ) { kmp_str_buf_t name; __kmp_str_buf_init( & name ); - __kmp_str_buf_print( & name, "OMP Worker Thread #%d", gtid ); + if( KMP_MASTER_GTID(gtid) ) { + __kmp_str_buf_print( & name, "OMP Master Thread #%d", gtid ); + } else { + __kmp_str_buf_print( & name, "OMP Worker Thread #%d", gtid ); + } KMP_ITT_DEBUG_LOCK(); __itt_thr_name_set( name.str, name.used ); KMP_ITT_DEBUG_PRINT( "[thr nam] name( \"%s\")\n", name.str ); diff --git a/openmp/runtime/src/kmp_lock.cpp b/openmp/runtime/src/kmp_lock.cpp index 4c924635ecd..679d7e935c5 100644 --- a/openmp/runtime/src/kmp_lock.cpp +++ b/openmp/runtime/src/kmp_lock.cpp @@ -1,7 +1,7 @@ /* * kmp_lock.cpp -- lock-related functions - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43389 $ + * $Date: 2014-08-11 10:54:01 -0500 (Mon, 11 Aug 2014) $ */ @@ -39,29 +39,6 @@ # endif #endif - -#ifndef KMP_DEBUG -# define __kmp_static_delay( arg ) /* nothing to do */ -#else - -static void -__kmp_static_delay( int arg ) -{ -/* Work around weird code-gen bug that causes assert to trip */ -# if KMP_ARCH_X86_64 && KMP_OS_LINUX - KMP_ASSERT( arg != 0 ); -# else - KMP_ASSERT( arg >= 0 ); -# endif -} -#endif /* KMP_DEBUG */ - -static void -__kmp_static_yield( int arg ) -{ - __kmp_yield( arg ); -} - /* Implement spin locks for internal library use. */ /* The algorithm implemented is Lamport's bakery lock [1974]. 
*/ @@ -163,15 +140,13 @@ __kmp_acquire_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_lock"; - if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_tas_lock_owner( lck ) == gtid ) ) { - KMP_FATAL( LockIsAlreadyOwned, func ); - } + char const * const func = "omp_set_lock"; + if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_tas_lock_owner( lck ) == gtid ) ) { + KMP_FATAL( LockIsAlreadyOwned, func ); } __kmp_acquire_tas_lock( lck, gtid ); } @@ -190,12 +165,10 @@ __kmp_test_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_lock"; - if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } + char const * const func = "omp_test_lock"; + if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); } return __kmp_test_tas_lock( lck, gtid ); } @@ -217,20 +190,18 @@ __kmp_release_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_tas_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_tas_lock_owner( lck ) >= 0 ) - && ( __kmp_get_tas_lock_owner( lck ) != gtid ) ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_tas_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_tas_lock_owner( lck ) >= 0 ) + && ( __kmp_get_tas_lock_owner( lck ) != gtid ) ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_tas_lock( lck, gtid ); } @@ -256,15 +227,13 @@ __kmp_destroy_tas_lock( kmp_tas_lock_t *lck ) static void __kmp_destroy_tas_lock_with_checks( kmp_tas_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_lock"; - if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_tas_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_lock"; + if ( ( sizeof ( kmp_tas_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_tas_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } 
__kmp_destroy_tas_lock( lck ); } @@ -291,11 +260,9 @@ __kmp_acquire_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_nested_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_nest_lock"; - if ( ! __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_set_nest_lock"; + if ( ! __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } __kmp_acquire_nested_tas_lock( lck, gtid ); } @@ -323,11 +290,9 @@ __kmp_test_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_nested_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_nest_lock"; - if ( ! __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_test_nest_lock"; + if ( ! __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } return __kmp_test_nested_tas_lock( lck, gtid ); } @@ -346,18 +311,16 @@ __kmp_release_nested_tas_lock( kmp_tas_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_nested_tas_lock_with_checks( kmp_tas_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_nest_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( ! __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_tas_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_tas_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( ! __kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_tas_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_tas_lock_owner( lck ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_nested_tas_lock( lck, gtid ); } @@ -385,14 +348,12 @@ __kmp_destroy_nested_tas_lock( kmp_tas_lock_t *lck ) static void __kmp_destroy_nested_tas_lock_with_checks( kmp_tas_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_nest_lock"; - if ( ! __kmp_is_tas_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_tas_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_nest_lock"; + if ( ! 
__kmp_is_tas_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_tas_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_nested_tas_lock( lck ); } @@ -506,15 +467,13 @@ __kmp_acquire_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_lock"; - if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_futex_lock_owner( lck ) == gtid ) ) { - KMP_FATAL( LockIsAlreadyOwned, func ); - } + char const * const func = "omp_set_lock"; + if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_futex_lock_owner( lck ) == gtid ) ) { + KMP_FATAL( LockIsAlreadyOwned, func ); } __kmp_acquire_futex_lock( lck, gtid ); } @@ -532,12 +491,10 @@ __kmp_test_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_lock"; - if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } + char const * const func = "omp_test_lock"; + if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); } return __kmp_test_futex_lock( lck, gtid ); } @@ -575,20 +532,18 @@ __kmp_release_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_futex_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_futex_lock_owner( lck ) >= 0 ) - && ( __kmp_get_futex_lock_owner( lck ) != gtid ) ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_futex_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_futex_lock_owner( lck ) >= 0 ) + && ( __kmp_get_futex_lock_owner( lck ) != gtid ) ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_futex_lock( lck, gtid ); } @@ -614,15 +569,13 @@ __kmp_destroy_futex_lock( kmp_futex_lock_t *lck ) static void __kmp_destroy_futex_lock_with_checks( kmp_futex_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_lock"; - if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) - && __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_futex_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func 
); - } + char const * const func = "omp_destroy_lock"; + if ( ( sizeof ( kmp_futex_lock_t ) <= OMP_LOCK_T_SIZE ) + && __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_futex_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_futex_lock( lck ); } @@ -649,11 +602,9 @@ __kmp_acquire_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_nested_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_nest_lock"; - if ( ! __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_set_nest_lock"; + if ( ! __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } __kmp_acquire_nested_futex_lock( lck, gtid ); } @@ -681,11 +632,9 @@ __kmp_test_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_nested_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_nest_lock"; - if ( ! __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_test_nest_lock"; + if ( ! __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } return __kmp_test_nested_futex_lock( lck, gtid ); } @@ -704,18 +653,16 @@ __kmp_release_nested_futex_lock( kmp_futex_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_nested_futex_lock_with_checks( kmp_futex_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_nest_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( ! __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_futex_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_futex_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( ! __kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_futex_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_futex_lock_owner( lck ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_nested_futex_lock( lck, gtid ); } @@ -743,14 +690,12 @@ __kmp_destroy_nested_futex_lock( kmp_futex_lock_t *lck ) static void __kmp_destroy_nested_futex_lock_with_checks( kmp_futex_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_nest_lock"; - if ( ! __kmp_is_futex_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_futex_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_nest_lock"; + if ( ! 
__kmp_is_futex_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_futex_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_nested_futex_lock( lck ); } @@ -781,9 +726,7 @@ __kmp_bakery_check(kmp_uint value, kmp_uint checker) if (value == checker) { return TRUE; } - for (pause = checker - value; pause != 0; --pause) { - __kmp_static_delay(TRUE); - } + for (pause = checker - value; pause != 0; --pause); return FALSE; } @@ -818,24 +761,20 @@ __kmp_acquire_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_ticket_lock_owner( lck ) == gtid ) ) { - KMP_FATAL( LockIsAlreadyOwned, func ); - } + char const * const func = "omp_set_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_ticket_lock_owner( lck ) == gtid ) ) { + KMP_FATAL( LockIsAlreadyOwned, func ); } __kmp_acquire_ticket_lock( lck, gtid ); - if ( __kmp_env_consistency_check ) { - lck->lk.owner_id = gtid + 1; - } + lck->lk.owner_id = gtid + 1; } int @@ -856,19 +795,17 @@ __kmp_test_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } + char const * const func = "omp_test_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); } int retval = __kmp_test_ticket_lock( lck, gtid ); - if ( __kmp_env_consistency_check && retval ) { + if ( retval ) { lck->lk.owner_id = gtid + 1; } return retval; @@ -895,24 +832,22 @@ __kmp_release_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_ticket_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_ticket_lock_owner( lck ) >= 0 ) - && ( __kmp_get_ticket_lock_owner( lck ) != gtid ) ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } - lck->lk.owner_id = 0; + char const * const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); } + if ( __kmp_get_ticket_lock_owner( lck ) == -1 ) { + KMP_FATAL( 
LockUnsettingFree, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_ticket_lock_owner( lck ) >= 0 ) + && ( __kmp_get_ticket_lock_owner( lck ) != gtid ) ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); + } + lck->lk.owner_id = 0; __kmp_release_ticket_lock( lck, gtid ); } @@ -947,17 +882,15 @@ __kmp_destroy_ticket_lock( kmp_ticket_lock_t *lck ) static void __kmp_destroy_ticket_lock_with_checks( kmp_ticket_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_ticket_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_ticket_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_ticket_lock( lck ); } @@ -987,14 +920,12 @@ __kmp_acquire_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_set_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } __kmp_acquire_nested_ticket_lock( lck, gtid ); } @@ -1025,14 +956,12 @@ static int __kmp_test_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_test_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } return __kmp_test_nested_ticket_lock( lck, gtid ); } @@ -1053,21 +982,19 @@ __kmp_release_nested_ticket_lock( kmp_ticket_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_nest_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_ticket_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_ticket_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! 
__kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_ticket_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_ticket_lock_owner( lck ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_nested_ticket_lock( lck, gtid ); } @@ -1095,17 +1022,15 @@ __kmp_destroy_nested_ticket_lock( kmp_ticket_lock_t *lck ) static void __kmp_destroy_nested_ticket_lock_with_checks( kmp_ticket_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_ticket_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_ticket_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_ticket_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_nested_ticket_lock( lck ); } @@ -1443,24 +1368,20 @@ static void __kmp_acquire_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) == gtid ) { - KMP_FATAL( LockIsAlreadyOwned, func ); - } + char const * const func = "omp_set_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) == gtid ) { + KMP_FATAL( LockIsAlreadyOwned, func ); } __kmp_acquire_queuing_lock( lck, gtid ); - if ( __kmp_env_consistency_check ) { - lck->lk.owner_id = gtid + 1; - } + lck->lk.owner_id = gtid + 1; } int @@ -1500,19 +1421,17 @@ __kmp_test_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } + char const * const func = "omp_test_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); } int retval = __kmp_test_queuing_lock( lck, gtid ); - if ( __kmp_env_consistency_check && retval ) { + if ( retval ) { lck->lk.owner_id = gtid + 1; } return retval; @@ -1655,23 +1574,21 @@ static void __kmp_release_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) == 
-1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } - lck->lk.owner_id = 0; + char const * const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } + lck->lk.owner_id = 0; __kmp_release_queuing_lock( lck, gtid ); } @@ -1712,17 +1629,15 @@ __kmp_destroy_queuing_lock( kmp_queuing_lock_t *lck ) static void __kmp_destroy_queuing_lock_with_checks( kmp_queuing_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_queuing_lock( lck ); } @@ -1752,14 +1667,12 @@ __kmp_acquire_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_set_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } __kmp_acquire_nested_queuing_lock( lck, gtid ); } @@ -1790,14 +1703,12 @@ static int __kmp_test_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_test_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } return __kmp_test_nested_queuing_lock( lck, gtid ); } @@ -1818,21 +1729,19 @@ __kmp_release_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_nest_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! 
__kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_nested_queuing_lock( lck, gtid ); } @@ -1860,17 +1769,15 @@ __kmp_destroy_nested_queuing_lock( kmp_queuing_lock_t *lck ) static void __kmp_destroy_nested_queuing_lock_with_checks( kmp_queuing_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_queuing_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_nested_queuing_lock( lck ); } @@ -2032,7 +1939,7 @@ static __inline void _xend() static kmp_adaptive_lock_statistics_t destroyedStats; // To hold the list of live locks. -static kmp_adaptive_lock_t liveLocks; +static kmp_adaptive_lock_info_t liveLocks; // A lock so we can safely update the list of locks. 
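A note on the long series of *_with_checks rewrites running through kmp_lock.cpp above: the "if ( __kmp_env_consistency_check )" guard disappears from each wrapper because the decision between checked and unchecked variants is now made once, when the lock function tables are populated, instead of testing the flag on every acquire, test, release and destroy. The toy sketch below shows that select-at-init idea in isolation; the table layout and names are invented for illustration and are not the runtime's actual kmp_lock dispatch tables.

/* Choosing checked vs. unchecked lock entry points once, at init time. */
#include <stdio.h>

typedef struct { int owner; } toy_lock_t;

static void acquire_fast(toy_lock_t *l, int gtid)        { l->owner = gtid; }

static void acquire_with_checks(toy_lock_t *l, int gtid)
{
    if (l->owner == gtid) {                    /* consistency check */
        fprintf(stderr, "lock already owned by %d\n", gtid);
        return;
    }
    acquire_fast(l, gtid);
}

/* "Function table" selected once, instead of branching on every call. */
static void (*acquire_fn)(toy_lock_t *, int);

static void lock_table_init(int consistency_check)
{
    acquire_fn = consistency_check ? acquire_with_checks : acquire_fast;
}

int main(void)
{
    toy_lock_t l = { .owner = -1 };
    lock_table_init(1);                        /* checks requested */
    acquire_fn(&l, 0);
    acquire_fn(&l, 0);                         /* triggers the check */
    return 0;
}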
static kmp_bootstrap_lock_t chain_lock; @@ -2041,7 +1948,7 @@ static kmp_bootstrap_lock_t chain_lock; void __kmp_init_speculative_stats() { - kmp_adaptive_lock *lck = &liveLocks; + kmp_adaptive_lock_info_t *lck = &liveLocks; memset( ( void * ) & ( lck->stats ), 0, sizeof( lck->stats ) ); lck->stats.next = lck; @@ -2056,7 +1963,7 @@ __kmp_init_speculative_stats() // Insert the lock into the circular list static void -__kmp_remember_lock( kmp_adaptive_lock * lck ) +__kmp_remember_lock( kmp_adaptive_lock_info_t * lck ) { __kmp_acquire_bootstrap_lock( &chain_lock ); @@ -2073,27 +1980,27 @@ __kmp_remember_lock( kmp_adaptive_lock * lck ) } static void -__kmp_forget_lock( kmp_adaptive_lock * lck ) +__kmp_forget_lock( kmp_adaptive_lock_info_t * lck ) { KMP_ASSERT( lck->stats.next->stats.prev == lck ); KMP_ASSERT( lck->stats.prev->stats.next == lck ); - kmp_adaptive_lock * n = lck->stats.next; - kmp_adaptive_lock * p = lck->stats.prev; + kmp_adaptive_lock_info_t * n = lck->stats.next; + kmp_adaptive_lock_info_t * p = lck->stats.prev; n->stats.prev = p; p->stats.next = n; } static void -__kmp_zero_speculative_stats( kmp_adaptive_lock * lck ) +__kmp_zero_speculative_stats( kmp_adaptive_lock_info_t * lck ) { memset( ( void * )&lck->stats, 0, sizeof( lck->stats ) ); __kmp_remember_lock( lck ); } static void -__kmp_add_stats( kmp_adaptive_lock_statistics_t * t, kmp_adaptive_lock_t * lck ) +__kmp_add_stats( kmp_adaptive_lock_statistics_t * t, kmp_adaptive_lock_info_t * lck ) { kmp_adaptive_lock_statistics_t volatile *s = &lck->stats; @@ -2106,7 +2013,7 @@ __kmp_add_stats( kmp_adaptive_lock_statistics_t * t, kmp_adaptive_lock_t * lck ) } static void -__kmp_accumulate_speculative_stats( kmp_adaptive_lock * lck) +__kmp_accumulate_speculative_stats( kmp_adaptive_lock_info_t * lck) { kmp_adaptive_lock_statistics_t *t = &destroyedStats; @@ -2132,7 +2039,8 @@ FILE * __kmp_open_stats_file() size_t buffLen = strlen( __kmp_speculative_statsfile ) + 20; char buffer[buffLen]; - snprintf (&buffer[0], buffLen, __kmp_speculative_statsfile, getpid()); + snprintf (&buffer[0], buffLen, __kmp_speculative_statsfile, + (kmp_int32)getpid()); FILE * result = fopen(&buffer[0], "w"); // Maybe we should issue a warning here... @@ -2148,7 +2056,7 @@ __kmp_print_speculative_stats() FILE * statsFile = __kmp_open_stats_file(); kmp_adaptive_lock_statistics_t total = destroyedStats; - kmp_adaptive_lock *lck; + kmp_adaptive_lock_info_t *lck; for (lck = liveLocks.stats.next; lck != &liveLocks; lck = lck->stats.next) { __kmp_add_stats( &total, lck ); @@ -2210,7 +2118,7 @@ __kmp_is_unlocked_queuing_lock( kmp_queuing_lock_t *lck ) // Functions for manipulating the badness static __inline void -__kmp_update_badness_after_success( kmp_queuing_lock_t *lck ) +__kmp_update_badness_after_success( kmp_adaptive_lock_t *lck ) { // Reset the badness to zero so we eagerly try to speculate again lck->lk.adaptive.badness = 0; @@ -2219,7 +2127,7 @@ __kmp_update_badness_after_success( kmp_queuing_lock_t *lck ) // Create a bit mask with one more set bit. static __inline void -__kmp_step_badness( kmp_queuing_lock_t *lck ) +__kmp_step_badness( kmp_adaptive_lock_t *lck ) { kmp_uint32 newBadness = ( lck->lk.adaptive.badness << 1 ) | 1; if ( newBadness > lck->lk.adaptive.max_badness) { @@ -2231,7 +2139,7 @@ __kmp_step_badness( kmp_queuing_lock_t *lck ) // Check whether speculation should be attempted. 
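The speculative-lock bookkeeping above keeps every live adaptive lock's statistics on a circular doubly linked list anchored at liveLocks and protected by chain_lock; __kmp_remember_lock splices an entry in and __kmp_forget_lock splices it out. The sketch below shows just the splice logic around a sentinel node, with the locking omitted and the names invented.

/* Circular doubly linked list with a sentinel, as used for live lock stats. */
#include <stdio.h>

typedef struct node { struct node *next, *prev; int id; } node_t;

static node_t live = { &live, &live, -1 };    /* sentinel: empty list */

static void remember(node_t *n)               /* splice in after the sentinel */
{
    n->next = live.next;
    n->prev = &live;
    live.next->prev = n;
    live.next = n;
}

static void forget(node_t *n)                 /* splice out */
{
    n->next->prev = n->prev;
    n->prev->next = n->next;
}

int main(void)
{
    node_t a = { 0, 0, 1 }, b = { 0, 0, 2 };
    remember(&a);
    remember(&b);
    forget(&a);
    for (node_t *p = live.next; p != &live; p = p->next)
        printf("live lock %d\n", p->id);      /* prints: live lock 2 */
    return 0;
}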
static __inline int -__kmp_should_speculate( kmp_queuing_lock_t *lck, kmp_int32 gtid ) +__kmp_should_speculate( kmp_adaptive_lock_t *lck, kmp_int32 gtid ) { kmp_uint32 badness = lck->lk.adaptive.badness; kmp_uint32 attempts= lck->lk.adaptive.acquire_attempts; @@ -2243,7 +2151,7 @@ __kmp_should_speculate( kmp_queuing_lock_t *lck, kmp_int32 gtid ) // Does not back off to the non-speculative lock. // static int -__kmp_test_adaptive_lock_only( kmp_queuing_lock_t * lck, kmp_int32 gtid ) +__kmp_test_adaptive_lock_only( kmp_adaptive_lock_t * lck, kmp_int32 gtid ) { int retries = lck->lk.adaptive.max_soft_retries; @@ -2264,7 +2172,7 @@ __kmp_test_adaptive_lock_only( kmp_queuing_lock_t * lck, kmp_int32 gtid ) * and now. This also gets the lock cache line into our read-set, * which we need so that we'll abort if anyone later claims it for real. */ - if (! __kmp_is_unlocked_queuing_lock( lck ) ) + if (! __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) ) { // Lock is now visibly acquired, so someone beat us to it. // Abort the transaction so we'll restart from _xbegin with the @@ -2299,7 +2207,7 @@ __kmp_test_adaptive_lock_only( kmp_queuing_lock_t * lck, kmp_int32 gtid ) // if the speculative lock cannot be acquired. // We can succeed speculatively, non-speculatively, or fail. static int -__kmp_test_adaptive_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) +__kmp_test_adaptive_lock( kmp_adaptive_lock_t *lck, kmp_int32 gtid ) { // First try to acquire the lock speculatively if ( __kmp_should_speculate( lck, gtid ) && __kmp_test_adaptive_lock_only( lck, gtid ) ) @@ -2310,7 +2218,7 @@ __kmp_test_adaptive_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) lck->lk.adaptive.acquire_attempts++; // Use base, non-speculative lock. - if ( __kmp_test_queuing_lock( lck, gtid ) ) + if ( __kmp_test_queuing_lock( GET_QLK_PTR(lck), gtid ) ) { KMP_INC_STAT(lck,nonSpeculativeAcquires); return 1; // Lock is acquired (non-speculatively) @@ -2322,19 +2230,17 @@ __kmp_test_adaptive_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) } static int -__kmp_test_adaptive_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) +__kmp_test_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } + char const * const func = "omp_test_lock"; + if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) { + KMP_FATAL( LockIsUninitialized, func ); } int retval = __kmp_test_adaptive_lock( lck, gtid ); - if ( __kmp_env_consistency_check && retval ) { - lck->lk.owner_id = gtid + 1; + if ( retval ) { + lck->lk.qlk.owner_id = gtid + 1; } return retval; } @@ -2354,11 +2260,11 @@ __kmp_test_adaptive_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) // is no longer fair. However OpenMP makes no guarantee that its // locks are fair, so this isn't a real problem. 
static void -__kmp_acquire_adaptive_lock( kmp_queuing_lock_t * lck, kmp_int32 gtid ) +__kmp_acquire_adaptive_lock( kmp_adaptive_lock_t * lck, kmp_int32 gtid ) { if ( __kmp_should_speculate( lck, gtid ) ) { - if ( __kmp_is_unlocked_queuing_lock( lck ) ) + if ( __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) ) { if ( __kmp_test_adaptive_lock_only( lck , gtid ) ) return; @@ -2372,7 +2278,7 @@ __kmp_acquire_adaptive_lock( kmp_queuing_lock_t * lck, kmp_int32 gtid ) // All other threads will also see the same result for // shouldSpeculate, so will be doing the same if they // try to claim the lock from now on. - while ( ! __kmp_is_unlocked_queuing_lock( lck ) ) + while ( ! __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) ) { KMP_INC_STAT(lck,lemmingYields); __kmp_yield (TRUE); @@ -2387,35 +2293,31 @@ __kmp_acquire_adaptive_lock( kmp_queuing_lock_t * lck, kmp_int32 gtid ) // Count the non-speculative acquire attempt lck->lk.adaptive.acquire_attempts++; - __kmp_acquire_queuing_lock_timed_template<FALSE>( lck, gtid ); + __kmp_acquire_queuing_lock_timed_template<FALSE>( GET_QLK_PTR(lck), gtid ); // We have acquired the base lock, so count that. KMP_INC_STAT(lck,nonSpeculativeAcquires ); } static void -__kmp_acquire_adaptive_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) +__kmp_acquire_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) == gtid ) { - KMP_FATAL( LockIsAlreadyOwned, func ); - } + char const * const func = "omp_set_lock"; + if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) == gtid ) { + KMP_FATAL( LockIsAlreadyOwned, func ); } __kmp_acquire_adaptive_lock( lck, gtid ); - if ( __kmp_env_consistency_check ) { - lck->lk.owner_id = gtid + 1; - } + lck->lk.qlk.owner_id = gtid + 1; } static void -__kmp_release_adaptive_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) +__kmp_release_adaptive_lock( kmp_adaptive_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_is_unlocked_queuing_lock( lck ) ) + if ( __kmp_is_unlocked_queuing_lock( GET_QLK_PTR(lck) ) ) { // If the lock doesn't look claimed we must be speculating. // (Or the user's code is buggy and they're releasing without locking; // if we had XTEST we'd be able to check that case...) @@ -2425,34 +2327,32 @@ __kmp_release_adaptive_lock( kmp_queuing_lock_t *lck, kmp_int32 gtid ) else { // Since the lock *is* visibly locked we're not speculating, // so should use the underlying lock's release scheme. 
- __kmp_release_queuing_lock( lck, gtid ); + __kmp_release_queuing_lock( GET_QLK_PTR(lck), gtid ); } } static void -__kmp_release_adaptive_lock_with_checks( kmp_queuing_lock_t *lck, kmp_int32 gtid ) +__kmp_release_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } - lck->lk.owner_id = 0; + char const * const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } + lck->lk.qlk.owner_id = 0; __kmp_release_adaptive_lock( lck, gtid ); } static void -__kmp_init_adaptive_lock( kmp_queuing_lock_t *lck ) +__kmp_init_adaptive_lock( kmp_adaptive_lock_t *lck ) { - __kmp_init_queuing_lock( lck ); + __kmp_init_queuing_lock( GET_QLK_PTR(lck) ); lck->lk.adaptive.badness = 0; lck->lk.adaptive.acquire_attempts = 0; //nonSpeculativeAcquireAttempts = 0; lck->lk.adaptive.max_soft_retries = __kmp_adaptive_backoff_params.max_soft_retries; @@ -2464,32 +2364,30 @@ __kmp_init_adaptive_lock( kmp_queuing_lock_t *lck ) } static void -__kmp_init_adaptive_lock_with_checks( kmp_queuing_lock_t * lck ) +__kmp_init_adaptive_lock_with_checks( kmp_adaptive_lock_t * lck ) { __kmp_init_adaptive_lock( lck ); } static void -__kmp_destroy_adaptive_lock( kmp_queuing_lock_t *lck ) +__kmp_destroy_adaptive_lock( kmp_adaptive_lock_t *lck ) { #if KMP_DEBUG_ADAPTIVE_LOCKS __kmp_accumulate_speculative_stats( &lck->lk.adaptive ); #endif - __kmp_destroy_queuing_lock (lck); + __kmp_destroy_queuing_lock (GET_QLK_PTR(lck)); // Nothing needed for the speculative part. } static void -__kmp_destroy_adaptive_lock_with_checks( kmp_queuing_lock_t *lck ) +__kmp_destroy_adaptive_lock_with_checks( kmp_adaptive_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_get_queuing_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_lock"; + if ( lck->lk.qlk.initialized != GET_QLK_PTR(lck) ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_get_queuing_lock_owner( GET_QLK_PTR(lck) ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_adaptive_lock( lck ); } @@ -2544,9 +2442,6 @@ __kmp_acquire_drdpa_lock_timed_template( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) KMP_FSYNC_PREPARE(lck); KMP_INIT_YIELD(spins); while (TCR_8(polls[ticket & mask]).poll < ticket) { // volatile load - __kmp_static_delay(TRUE); - - // // If we are oversubscribed, // or have waited a bit (and KMP_LIBRARY=turnaround), then yield. // CPU Pause is in the macros for yield. 
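[Editor's note: illustrative aside, not part of the patch.] The adaptive-lock hunks above switch these routines from operating on kmp_queuing_lock_t to a dedicated kmp_adaptive_lock_t that embeds the queuing lock and reaches it through GET_QLK_PTR() (defined later in the kmp_lock.h hunks of this same patch). The sketch below is a minimal, compilable miniature of that layering; every mini_* name is invented for illustration only, and the speculative (TSX) path is reduced to a comment.

#include <stdio.h>

typedef struct mini_queuing_lock {
    volatile int locked;              /* stand-in for the real head/tail queue */
} mini_queuing_lock_t;

typedef struct mini_adaptive_lock {
    mini_queuing_lock_t qlk;          /* queuing lock is the embedded member...  */
    unsigned badness;                 /* ...plus the speculation bookkeeping     */
    unsigned acquire_attempts;
} mini_adaptive_lock_t;

#define MINI_QLK_PTR(l) (&(l)->qlk)   /* analogue of GET_QLK_PTR() */

static void mini_acquire_queuing(mini_queuing_lock_t *q)
{
    while (__sync_lock_test_and_set(&q->locked, 1))
        ;                             /* spin; the real lock queues and yields */
}

static void mini_release_queuing(mini_queuing_lock_t *q)
{
    __sync_lock_release(&q->locked);
}

static void mini_acquire_adaptive(mini_adaptive_lock_t *l)
{
    /* Speculative (TSX) attempt elided; fall back to the embedded lock,
       counting the attempt as __kmp_acquire_adaptive_lock() does above. */
    l->acquire_attempts++;
    mini_acquire_queuing(MINI_QLK_PTR(l));
}

static void mini_release_adaptive(mini_adaptive_lock_t *l)
{
    mini_release_queuing(MINI_QLK_PTR(l));
}

int main(void)
{
    mini_adaptive_lock_t l = { { 0 }, 0u, 0u };
    mini_acquire_adaptive(&l);
    mini_release_adaptive(&l);
    printf("acquire attempts: %u\n", l.acquire_attempts);
    return 0;
}

Because the queuing lock is the embedded member, the existing queuing-lock routines can simply be handed MINI_QLK_PTR(l) here, or GET_QLK_PTR(lck) in the patch, which is exactly what the acquire/release/test wrappers above do when speculation is not used.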
@@ -2691,24 +2586,20 @@ __kmp_acquire_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_drdpa_lock_owner( lck ) == gtid ) ) { - KMP_FATAL( LockIsAlreadyOwned, func ); - } + char const * const func = "omp_set_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( ( gtid >= 0 ) && ( __kmp_get_drdpa_lock_owner( lck ) == gtid ) ) { + KMP_FATAL( LockIsAlreadyOwned, func ); } __kmp_acquire_drdpa_lock( lck, gtid ); - if ( __kmp_env_consistency_check ) { - lck->lk.owner_id = gtid + 1; - } + lck->lk.owner_id = gtid + 1; } int @@ -2750,19 +2641,17 @@ __kmp_test_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } + char const * const func = "omp_test_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); } int retval = __kmp_test_drdpa_lock( lck, gtid ); - if ( __kmp_env_consistency_check && retval ) { + if ( retval ) { lck->lk.owner_id = gtid + 1; } return retval; @@ -2790,24 +2679,22 @@ __kmp_release_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_drdpa_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( ( gtid >= 0 ) && ( __kmp_get_drdpa_lock_owner( lck ) >= 0 ) - && ( __kmp_get_drdpa_lock_owner( lck ) != gtid ) ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } - lck->lk.owner_id = 0; + char const * const func = "omp_unset_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_drdpa_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); } + if ( ( gtid >= 0 ) && ( __kmp_get_drdpa_lock_owner( lck ) >= 0 ) + && ( __kmp_get_drdpa_lock_owner( lck ) != gtid ) ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); + } + lck->lk.owner_id = 0; __kmp_release_drdpa_lock( lck, gtid ); } @@ -2861,17 +2748,15 @@ __kmp_destroy_drdpa_lock( kmp_drdpa_lock_t *lck ) static void __kmp_destroy_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_lock"; - if ( 
lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockNestableUsedAsSimple, func ); - } - if ( __kmp_get_drdpa_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockNestableUsedAsSimple, func ); + } + if ( __kmp_get_drdpa_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_drdpa_lock( lck ); } @@ -2901,14 +2786,12 @@ __kmp_acquire_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) static void __kmp_acquire_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_set_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_set_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } __kmp_acquire_nested_drdpa_lock( lck, gtid ); } @@ -2938,14 +2821,12 @@ __kmp_test_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) static int __kmp_test_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_test_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } + char const * const func = "omp_test_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); } return __kmp_test_nested_drdpa_lock( lck, gtid ); } @@ -2966,21 +2847,19 @@ __kmp_release_nested_drdpa_lock( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) static void __kmp_release_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck, kmp_int32 gtid ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_unset_nest_lock"; - KMP_MB(); /* in case another processor initialized lock */ - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_drdpa_lock_owner( lck ) == -1 ) { - KMP_FATAL( LockUnsettingFree, func ); - } - if ( __kmp_get_drdpa_lock_owner( lck ) != gtid ) { - KMP_FATAL( LockUnsettingSetByAnother, func ); - } + char const * const func = "omp_unset_nest_lock"; + KMP_MB(); /* in case another processor initialized lock */ + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! 
__kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_drdpa_lock_owner( lck ) == -1 ) { + KMP_FATAL( LockUnsettingFree, func ); + } + if ( __kmp_get_drdpa_lock_owner( lck ) != gtid ) { + KMP_FATAL( LockUnsettingSetByAnother, func ); } __kmp_release_nested_drdpa_lock( lck, gtid ); } @@ -3008,17 +2887,15 @@ __kmp_destroy_nested_drdpa_lock( kmp_drdpa_lock_t *lck ) static void __kmp_destroy_nested_drdpa_lock_with_checks( kmp_drdpa_lock_t *lck ) { - if ( __kmp_env_consistency_check ) { - char const * const func = "omp_destroy_nest_lock"; - if ( lck->lk.initialized != lck ) { - KMP_FATAL( LockIsUninitialized, func ); - } - if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { - KMP_FATAL( LockSimpleUsedAsNestable, func ); - } - if ( __kmp_get_drdpa_lock_owner( lck ) != -1 ) { - KMP_FATAL( LockStillOwned, func ); - } + char const * const func = "omp_destroy_nest_lock"; + if ( lck->lk.initialized != lck ) { + KMP_FATAL( LockIsUninitialized, func ); + } + if ( ! __kmp_is_drdpa_lock_nestable( lck ) ) { + KMP_FATAL( LockSimpleUsedAsNestable, func ); + } + if ( __kmp_get_drdpa_lock_owner( lck ) != -1 ) { + KMP_FATAL( LockStillOwned, func ); } __kmp_destroy_nested_drdpa_lock( lck ); } @@ -3106,50 +2983,19 @@ void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ) ( kmp_int32 ( * )( kmp_user_lock_p ) ) ( &__kmp_get_tas_lock_owner ); - __kmp_acquire_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_tas_lock_with_checks ); - - __kmp_test_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_tas_lock_with_checks ); - - __kmp_release_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_tas_lock_with_checks ); - - __kmp_init_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_tas_lock_with_checks ); + if ( __kmp_env_consistency_check ) { + KMP_BIND_USER_LOCK_WITH_CHECKS(tas); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(tas); + } + else { + KMP_BIND_USER_LOCK(tas); + KMP_BIND_NESTED_USER_LOCK(tas); + } __kmp_destroy_user_lock_ = ( void ( * )( kmp_user_lock_p ) ) ( &__kmp_destroy_tas_lock ); - __kmp_destroy_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_tas_lock_with_checks ); - - __kmp_acquire_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_nested_tas_lock_with_checks ); - - __kmp_test_nested_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_nested_tas_lock_with_checks ); - - __kmp_release_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_nested_tas_lock_with_checks ); - - __kmp_init_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_nested_tas_lock_with_checks ); - - __kmp_destroy_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_nested_tas_lock_with_checks ); - __kmp_is_user_lock_initialized_ = ( int ( * )( kmp_user_lock_p ) ) NULL; @@ -3177,50 +3023,19 @@ void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ) ( kmp_int32 ( * )( kmp_user_lock_p ) ) ( &__kmp_get_futex_lock_owner ); - __kmp_acquire_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_futex_lock_with_checks ); - - __kmp_test_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_futex_lock_with_checks ); - - __kmp_release_user_lock_with_checks_ = 
- ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_futex_lock_with_checks ); - - __kmp_init_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_futex_lock_with_checks ); + if ( __kmp_env_consistency_check ) { + KMP_BIND_USER_LOCK_WITH_CHECKS(futex); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(futex); + } + else { + KMP_BIND_USER_LOCK(futex); + KMP_BIND_NESTED_USER_LOCK(futex); + } __kmp_destroy_user_lock_ = ( void ( * )( kmp_user_lock_p ) ) ( &__kmp_destroy_futex_lock ); - __kmp_destroy_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_futex_lock_with_checks ); - - __kmp_acquire_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_nested_futex_lock_with_checks ); - - __kmp_test_nested_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_nested_futex_lock_with_checks ); - - __kmp_release_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_nested_futex_lock_with_checks ); - - __kmp_init_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_nested_futex_lock_with_checks ); - - __kmp_destroy_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_nested_futex_lock_with_checks ); - __kmp_is_user_lock_initialized_ = ( int ( * )( kmp_user_lock_p ) ) NULL; @@ -3248,50 +3063,19 @@ void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ) ( kmp_int32 ( * )( kmp_user_lock_p ) ) ( &__kmp_get_ticket_lock_owner ); - __kmp_acquire_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_ticket_lock_with_checks ); - - __kmp_test_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_ticket_lock_with_checks ); - - __kmp_release_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_ticket_lock_with_checks ); - - __kmp_init_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_ticket_lock_with_checks ); + if ( __kmp_env_consistency_check ) { + KMP_BIND_USER_LOCK_WITH_CHECKS(ticket); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(ticket); + } + else { + KMP_BIND_USER_LOCK(ticket); + KMP_BIND_NESTED_USER_LOCK(ticket); + } __kmp_destroy_user_lock_ = ( void ( * )( kmp_user_lock_p ) ) ( &__kmp_destroy_ticket_lock ); - __kmp_destroy_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_ticket_lock_with_checks ); - - __kmp_acquire_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_nested_ticket_lock_with_checks ); - - __kmp_test_nested_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_nested_ticket_lock_with_checks ); - - __kmp_release_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_nested_ticket_lock_with_checks ); - - __kmp_init_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_nested_ticket_lock_with_checks ); - - __kmp_destroy_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_nested_ticket_lock_with_checks ); - __kmp_is_user_lock_initialized_ = ( int ( * )( kmp_user_lock_p ) ) ( &__kmp_is_ticket_lock_initialized ); @@ -3322,50 +3106,19 @@ void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ) ( kmp_int32 ( * )( kmp_user_lock_p ) ) ( &__kmp_get_queuing_lock_owner ); - __kmp_acquire_user_lock_with_checks_ 
= - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_queuing_lock_with_checks ); - - __kmp_test_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_queuing_lock_with_checks ); - - __kmp_release_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_queuing_lock_with_checks ); - - __kmp_init_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_queuing_lock_with_checks ); + if ( __kmp_env_consistency_check ) { + KMP_BIND_USER_LOCK_WITH_CHECKS(queuing); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(queuing); + } + else { + KMP_BIND_USER_LOCK(queuing); + KMP_BIND_NESTED_USER_LOCK(queuing); + } __kmp_destroy_user_lock_ = ( void ( * )( kmp_user_lock_p ) ) ( &__kmp_destroy_queuing_lock ); - __kmp_destroy_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_queuing_lock_with_checks ); - - __kmp_acquire_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_nested_queuing_lock_with_checks ); - - __kmp_test_nested_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_nested_queuing_lock_with_checks ); - - __kmp_release_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_nested_queuing_lock_with_checks ); - - __kmp_init_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_nested_queuing_lock_with_checks ); - - __kmp_destroy_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_nested_queuing_lock_with_checks ); - __kmp_is_user_lock_initialized_ = ( int ( * )( kmp_user_lock_p ) ) ( &__kmp_is_queuing_lock_initialized ); @@ -3390,32 +3143,19 @@ void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ) #if KMP_USE_ADAPTIVE_LOCKS case lk_adaptive: { - __kmp_base_user_lock_size = sizeof( kmp_base_queuing_lock_t ); - __kmp_user_lock_size = sizeof( kmp_queuing_lock_t ); + __kmp_base_user_lock_size = sizeof( kmp_base_adaptive_lock_t ); + __kmp_user_lock_size = sizeof( kmp_adaptive_lock_t ); __kmp_get_user_lock_owner_ = ( kmp_int32 ( * )( kmp_user_lock_p ) ) ( &__kmp_get_queuing_lock_owner ); - __kmp_acquire_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_adaptive_lock_with_checks ); - - __kmp_test_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_adaptive_lock_with_checks ); - - __kmp_release_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_adaptive_lock_with_checks ); - - __kmp_init_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_adaptive_lock_with_checks ); - - __kmp_destroy_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_adaptive_lock_with_checks ); + if ( __kmp_env_consistency_check ) { + KMP_BIND_USER_LOCK_WITH_CHECKS(adaptive); + } + else { + KMP_BIND_USER_LOCK(adaptive); + } __kmp_destroy_user_lock_ = ( void ( * )( kmp_user_lock_p ) ) @@ -3453,50 +3193,19 @@ void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ) ( kmp_int32 ( * )( kmp_user_lock_p ) ) ( &__kmp_get_drdpa_lock_owner ); - __kmp_acquire_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_drdpa_lock_with_checks ); - - __kmp_test_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_drdpa_lock_with_checks ); - - __kmp_release_user_lock_with_checks_ = - ( void ( * 
)( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_drdpa_lock_with_checks ); - - __kmp_init_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_drdpa_lock_with_checks ); + if ( __kmp_env_consistency_check ) { + KMP_BIND_USER_LOCK_WITH_CHECKS(drdpa); + KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(drdpa); + } + else { + KMP_BIND_USER_LOCK(drdpa); + KMP_BIND_NESTED_USER_LOCK(drdpa); + } __kmp_destroy_user_lock_ = ( void ( * )( kmp_user_lock_p ) ) ( &__kmp_destroy_drdpa_lock ); - __kmp_destroy_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_drdpa_lock_with_checks ); - - __kmp_acquire_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_acquire_nested_drdpa_lock_with_checks ); - - __kmp_test_nested_user_lock_with_checks_ = - ( int ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_test_nested_drdpa_lock_with_checks ); - - __kmp_release_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p, kmp_int32 ) ) - ( &__kmp_release_nested_drdpa_lock_with_checks ); - - __kmp_init_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_init_nested_drdpa_lock_with_checks ); - - __kmp_destroy_nested_user_lock_with_checks_ = - ( void ( * )( kmp_user_lock_p ) ) - ( &__kmp_destroy_nested_drdpa_lock_with_checks ); - __kmp_is_user_lock_initialized_ = ( int ( * )( kmp_user_lock_p ) ) ( &__kmp_is_drdpa_lock_initialized ); diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h index c5ce83823c0..31a93f56cde 100644 --- a/openmp/runtime/src/kmp_lock.h +++ b/openmp/runtime/src/kmp_lock.h @@ -1,7 +1,7 @@ /* * kmp_lock.h -- lock header file - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -280,16 +280,16 @@ extern void __kmp_destroy_nested_ticket_lock( kmp_ticket_lock_t *lck ); #if KMP_USE_ADAPTIVE_LOCKS -struct kmp_adaptive_lock; +struct kmp_adaptive_lock_info; -typedef struct kmp_adaptive_lock kmp_adaptive_lock_t; +typedef struct kmp_adaptive_lock_info kmp_adaptive_lock_info_t; #if KMP_DEBUG_ADAPTIVE_LOCKS struct kmp_adaptive_lock_statistics { /* So we can get stats from locks that haven't been destroyed. */ - kmp_adaptive_lock_t * next; - kmp_adaptive_lock_t * prev; + kmp_adaptive_lock_info_t * next; + kmp_adaptive_lock_info_t * prev; /* Other statistics */ kmp_uint32 successfulSpeculations; @@ -307,7 +307,7 @@ extern void __kmp_init_speculative_stats(); #endif // KMP_DEBUG_ADAPTIVE_LOCKS -struct kmp_adaptive_lock +struct kmp_adaptive_lock_info { /* Values used for adaptivity. * Although these are accessed from multiple threads we don't access them atomically, @@ -348,10 +348,6 @@ struct kmp_base_queuing_lock { kmp_int32 depth_locked; // depth locked, for nested locks only kmp_lock_flags_t flags; // lock specifics, e.g. critical section lock -#if KMP_USE_ADAPTIVE_LOCKS - KMP_ALIGN(CACHE_LINE) - kmp_adaptive_lock_t adaptive; // Information for the speculative adaptive lock -#endif }; typedef struct kmp_base_queuing_lock kmp_base_queuing_lock_t; @@ -379,6 +375,30 @@ extern void __kmp_release_nested_queuing_lock( kmp_queuing_lock_t *lck, kmp_int3 extern void __kmp_init_nested_queuing_lock( kmp_queuing_lock_t *lck ); extern void __kmp_destroy_nested_queuing_lock( kmp_queuing_lock_t *lck ); +#if KMP_USE_ADAPTIVE_LOCKS + +// ---------------------------------------------------------------------------- +// Adaptive locks. 
+// ---------------------------------------------------------------------------- +struct kmp_base_adaptive_lock { + kmp_base_queuing_lock qlk; + KMP_ALIGN(CACHE_LINE) + kmp_adaptive_lock_info_t adaptive; // Information for the speculative adaptive lock +}; + +typedef struct kmp_base_adaptive_lock kmp_base_adaptive_lock_t; + +union KMP_ALIGN_CACHE kmp_adaptive_lock { + kmp_base_adaptive_lock_t lk; + kmp_lock_pool_t pool; + double lk_align; + char lk_pad[ KMP_PAD(kmp_base_adaptive_lock_t, CACHE_LINE) ]; +}; +typedef union kmp_adaptive_lock kmp_adaptive_lock_t; + +# define GET_QLK_PTR(l) ((kmp_queuing_lock_t *) & (l)->lk.qlk) + +#endif // KMP_USE_ADAPTIVE_LOCKS // ---------------------------------------------------------------------------- // DRDPA ticket locks. @@ -913,7 +933,26 @@ __kmp_set_user_lock_flags( kmp_user_lock_p lck, kmp_lock_flags_t flags ) // extern void __kmp_set_user_lock_vptrs( kmp_lock_kind_t user_lock_kind ); +// +// Macros for binding user lock functions. +// +#define KMP_BIND_USER_LOCK_TEMPLATE(nest, kind, suffix) { \ + __kmp_acquire##nest##user_lock_with_checks_ = ( void (*)( kmp_user_lock_p, kmp_int32 ) ) \ + __kmp_acquire##nest##kind##_##suffix; \ + __kmp_release##nest##user_lock_with_checks_ = ( void (*)( kmp_user_lock_p, kmp_int32 ) ) \ + __kmp_release##nest##kind##_##suffix; \ + __kmp_test##nest##user_lock_with_checks_ = ( int (*)( kmp_user_lock_p, kmp_int32 ) ) \ + __kmp_test##nest##kind##_##suffix; \ + __kmp_init##nest##user_lock_with_checks_ = ( void (*)( kmp_user_lock_p ) ) \ + __kmp_init##nest##kind##_##suffix; \ + __kmp_destroy##nest##user_lock_with_checks_ = ( void (*)( kmp_user_lock_p ) ) \ + __kmp_destroy##nest##kind##_##suffix; \ +} +#define KMP_BIND_USER_LOCK(kind) KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock) +#define KMP_BIND_USER_LOCK_WITH_CHECKS(kind) KMP_BIND_USER_LOCK_TEMPLATE(_, kind, lock_with_checks) +#define KMP_BIND_NESTED_USER_LOCK(kind) KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock) +#define KMP_BIND_NESTED_USER_LOCK_WITH_CHECKS(kind) KMP_BIND_USER_LOCK_TEMPLATE(_nested_, kind, lock_with_checks) // ---------------------------------------------------------------------------- // User lock table & lock allocation diff --git a/openmp/runtime/src/kmp_omp.h b/openmp/runtime/src/kmp_omp.h index 667dbc9773c..e663739d462 100644 --- a/openmp/runtime/src/kmp_omp.h +++ b/openmp/runtime/src/kmp_omp.h @@ -1,8 +1,8 @@ /* * kmp_omp.h -- OpenMP definition for kmp_omp_struct_info_t. * This is for information about runtime library structures. - * $Revision: 42105 $ - * $Date: 2013-03-11 14:51:34 -0500 (Mon, 11 Mar 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 489f8f7a323..c65caf8ac4e 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -1,7 +1,7 @@ /* * kmp_os.h -- KPTS runtime header file. - * $Revision: 42820 $ - * $Date: 2013-11-13 16:53:44 -0600 (Wed, 13 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -69,7 +69,7 @@ #define KMP_OS_LINUX 0 #define KMP_OS_FREEBSD 0 #define KMP_OS_DARWIN 0 -#define KMP_OS_WINDOWS 0 +#define KMP_OS_WINDOWS 0 #define KMP_OS_CNK 0 #define KMP_OS_UNIX 0 /* disjunction of KMP_OS_LINUX, KMP_OS_DARWIN etc. 
*/ @@ -116,6 +116,12 @@ # define KMP_OS_UNIX 1 #endif +#if (KMP_OS_LINUX || KMP_OS_WINDOWS) && !KMP_OS_CNK && !KMP_ARCH_PPC64 +# define KMP_AFFINITY_SUPPORTED 1 +#else +# define KMP_AFFINITY_SUPPORTED 0 +#endif + #if KMP_OS_WINDOWS # if defined _M_AMD64 # undef KMP_ARCH_X86_64 @@ -356,6 +362,8 @@ typedef double kmp_real64; extern "C" { #endif // __cplusplus +#define INTERNODE_CACHE_LINE 4096 /* for multi-node systems */ + /* Define the default size of the cache line */ #ifndef CACHE_LINE #define CACHE_LINE 128 /* cache line size in bytes */ @@ -366,16 +374,6 @@ extern "C" { #endif #endif /* CACHE_LINE */ -/* SGI's cache padding improvements using align decl specs (Ver 19) */ -#if !defined KMP_PERF_V19 -# define KMP_PERF_V19 KMP_ON -#endif - -/* SGI's improvements for inline argv (Ver 106) */ -#if !defined KMP_PERF_V106 -# define KMP_PERF_V106 KMP_ON -#endif - #define KMP_CACHE_PREFETCH(ADDR) /* nothing */ /* Temporary note: if performance testing of this passes, we can remove @@ -383,10 +381,12 @@ extern "C" { #if KMP_OS_UNIX && defined(__GNUC__) # define KMP_DO_ALIGN(bytes) __attribute__((aligned(bytes))) # define KMP_ALIGN_CACHE __attribute__((aligned(CACHE_LINE))) +# define KMP_ALIGN_CACHE_INTERNODE __attribute__((aligned(INTERNODE_CACHE_LINE))) # define KMP_ALIGN(bytes) __attribute__((aligned(bytes))) #else # define KMP_DO_ALIGN(bytes) __declspec( align(bytes) ) # define KMP_ALIGN_CACHE __declspec( align(CACHE_LINE) ) +# define KMP_ALIGN_CACHE_INTERNODE __declspec( align(INTERNODE_CACHE_LINE) ) # define KMP_ALIGN(bytes) __declspec( align(bytes) ) #endif @@ -525,7 +525,7 @@ extern kmp_real64 __kmp_xchg_real64( volatile kmp_real64 *p, kmp_real64 v ); # define KMP_XCHG_REAL64(p, v) __kmp_xchg_real64( (p), (v) ); -#elif (KMP_ASM_INTRINS && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_DARWIN)) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64) +#elif (KMP_ASM_INTRINS && KMP_OS_UNIX) || !(KMP_ARCH_X86 || KMP_ARCH_X86_64) /* cast p to correct type so that proper intrinsic will be used */ # define KMP_TEST_THEN_INC32(p) __sync_fetch_and_add( (kmp_int32 *)(p), 1 ) @@ -654,17 +654,6 @@ extern kmp_real64 __kmp_xchg_real64( volatile kmp_real64 *p, kmp_real64 v ); #endif /* KMP_ASM_INTRINS */ -# if !KMP_MIC -// -// no routines for floating addition on MIC -// no intrinsic support for floating addition on UNIX -// -extern kmp_real32 __kmp_test_then_add_real32 ( volatile kmp_real32 *p, kmp_real32 v ); -extern kmp_real64 __kmp_test_then_add_real64 ( volatile kmp_real64 *p, kmp_real64 v ); -# define KMP_TEST_THEN_ADD_REAL32(p, v) __kmp_test_then_add_real32( (p), (v) ) -# define KMP_TEST_THEN_ADD_REAL64(p, v) __kmp_test_then_add_real64( (p), (v) ) -# endif - /* ------------- relaxed consistency memory model stuff ------------------ */ diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c index c31b5614395..e6755181958 100644 --- a/openmp/runtime/src/kmp_runtime.c +++ b/openmp/runtime/src/kmp_runtime.c @@ -1,7 +1,7 @@ /* * kmp_runtime.c -- KPTS runtime support library - * $Revision: 42839 $ - * $Date: 2013-11-24 13:01:00 -0600 (Sun, 24 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -25,26 +25,13 @@ #include "kmp_i18n.h" #include "kmp_io.h" #include "kmp_error.h" +#include "kmp_stats.h" +#include "kmp_wait_release.h" /* these are temporary issues to be dealt with */ #define KMP_USE_PRCTL 0 #define KMP_USE_POOLED_ALLOC 0 -#if KMP_MIC -#include <immintrin.h> -#define USE_NGO_STORES 1 -#endif // KMP_MIC - -#if KMP_MIC && USE_NGO_STORES -#define 
load_icvs(src) __m512d Vt_icvs = _mm512_load_pd((void *)(src)) -#define store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt_icvs) -#define sync_icvs() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory") -#else -#define load_icvs(src) ((void)0) -#define store_icvs(dst, src) copy_icvs((dst), (src)) -#define sync_icvs() ((void)0) -#endif /* KMP_MIC && USE_NGO_STORES */ - #if KMP_OS_WINDOWS #include <process.h> #endif @@ -57,34 +44,12 @@ char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler s char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: " #if OMP_40_ENABLED "4.0 (201307)"; -#elif OMP_30_ENABLED - "3.1 (201107)"; #else - "2.5 (200505)"; + "3.1 (201107)"; #endif #ifdef KMP_DEBUG - char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable"; - -char const __kmp_version_perf_v19[] = KMP_VERSION_PREFIX "perf v19: " -#if KMP_PERF_V19 == KMP_ON - "on"; -#elif KMP_PERF_V19 == KMP_OFF - "off"; -#else - #error "Must specify KMP_PERF_V19 option" -#endif - -char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: " -#if KMP_PERF_V106 == KMP_ON - "on"; -#elif KMP_PERF_V106 == KMP_OFF - "off"; -#else - #error "Must specify KMP_PERF_V106 option" -#endif - #endif /* KMP_DEBUG */ @@ -103,19 +68,12 @@ kmp_info_t __kmp_monitor; void __kmp_cleanup( void ); static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid ); -static void __kmp_initialize_team( - kmp_team_t * team, - int new_nproc, - #if OMP_30_ENABLED - kmp_internal_control_t * new_icvs, - ident_t * loc - #else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set - #endif // OMP_30_ENABLED -); +static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc ); static void __kmp_partition_places( kmp_team_t *team ); static void __kmp_do_serial_initialize( void ); +void __kmp_fork_barrier( int gtid, int tid ); +void __kmp_join_barrier( int gtid ); +void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc ); #ifdef USE_LOAD_BALANCE @@ -189,8 +147,8 @@ __kmp_get_global_thread_id( ) kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); if( !thr ) continue; - stack_size = (size_t)TCR_PTR(thr -> th.th_info.ds.ds_stacksize); - stack_base = (char *)TCR_PTR(thr -> th.th_info.ds.ds_stackbase); + stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); + stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); /* stack grows down -- search through all of the active threads */ @@ -221,7 +179,7 @@ __kmp_get_global_thread_id( ) KMP_FATAL( StackOverflow, i ); } - stack_base = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase; + stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase; if( stack_addr > stack_base ) { TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, @@ -232,10 +190,10 @@ __kmp_get_global_thread_id( ) /* Reprint stack bounds for ubermaster since they have been refined */ if ( __kmp_storage_map ) { - char *stack_end = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase; - char *stack_beg = stack_end - other_threads[i] -> th.th_info.ds.ds_stacksize; + char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase; + char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; __kmp_print_storage_map_gtid( i, stack_beg, 
stack_end, - other_threads[i] -> th.th_info.ds.ds_stacksize, + other_threads[i]->th.th_info.ds.ds_stacksize, "th_%d stack (refinement)", i ); } return i; @@ -294,8 +252,8 @@ __kmp_check_stack_overlap( kmp_info_t *th ) KA_TRACE(10,("__kmp_check_stack_overlap: called\n")); if ( __kmp_storage_map ) { - stack_end = (char *) th -> th.th_info.ds.ds_stackbase; - stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize; + stack_end = (char *) th->th.th_info.ds.ds_stackbase; + stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; gtid = __kmp_gtid_from_thread( th ); @@ -315,8 +273,8 @@ __kmp_check_stack_overlap( kmp_info_t *th ) { KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n")); if ( stack_beg == NULL ) { - stack_end = (char *) th -> th.th_info.ds.ds_stackbase; - stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize; + stack_end = (char *) th->th.th_info.ds.ds_stackbase; + stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; } for( f=0 ; f < __kmp_threads_capacity ; f++ ) { @@ -347,279 +305,6 @@ __kmp_check_stack_overlap( kmp_info_t *th ) /* ------------------------------------------------------------------------ */ -#ifndef KMP_DEBUG -# define __kmp_static_delay( arg ) /* nothing to do */ -#else - -static void -__kmp_static_delay( int arg ) -{ -/* Work around weird code-gen bug that causes assert to trip */ -# if KMP_ARCH_X86_64 && KMP_OS_LINUX - KMP_ASSERT( arg != 0 ); -# else - KMP_ASSERT( arg >= 0 ); -# endif -} -#endif /* KMP_DEBUG */ - -static void -__kmp_static_yield( int arg ) -{ - __kmp_yield( arg ); -} - -/* - * Spin wait loop that first does pause, then yield, then sleep. - * Wait until spinner is equal to checker to exit. - * - * A thread that calls __kmp_wait_sleep must make certain that another thread - * calls __kmp_release to wake it back up up to prevent deadlocks! - */ - -void -__kmp_wait_sleep( kmp_info_t *this_thr, - volatile kmp_uint *spinner, - kmp_uint checker, - int final_spin - USE_ITT_BUILD_ARG (void * itt_sync_obj) -) -{ - /* note: we may not belong to a team at this point */ - register volatile kmp_uint *spin = spinner; - register kmp_uint check = checker; - register kmp_uint32 spins; - register kmp_uint32 hibernate; - int th_gtid, th_tid; -#if OMP_30_ENABLED - int flag = FALSE; -#endif /* OMP_30_ENABLED */ - - KMP_FSYNC_SPIN_INIT( spin, NULL ); - if( TCR_4(*spin) == check ) { - KMP_FSYNC_SPIN_ACQUIRED( spin ); - return; - } - - th_gtid = this_thr->th.th_info.ds.ds_gtid; - - KA_TRACE( 20, ("__kmp_wait_sleep: T#%d waiting for spin(%p) == %d\n", - th_gtid, - spin, check ) ); - - /* setup for waiting */ - KMP_INIT_YIELD( spins ); - - if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) { - // - // The worker threads cannot rely on the team struct existing at this - // point. Use the bt values cached in the thread struct instead. - // - #ifdef KMP_ADJUST_BLOCKTIME - if ( __kmp_zero_bt && ! this_thr->th.th_team_bt_set ) { - /* force immediate suspend if not set by user and more threads than available procs */ - hibernate = 0; - } else { - hibernate = this_thr->th.th_team_bt_intervals; - } - #else - hibernate = this_thr->th.th_team_bt_intervals; - #endif /* KMP_ADJUST_BLOCKTIME */ - - // - // If the blocktime is nonzero, we want to make sure that we spin - // wait for the entirety of the specified #intervals, plus up to - // one interval more. This increment make certain that this thread - // doesn't go to sleep too soon. - // - if ( hibernate != 0 ) { - hibernate++; - } - - // - // Add in the current time value. 
- // - hibernate += TCR_4( __kmp_global.g.g_time.dt.t_value ); - - KF_TRACE( 20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n", - th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate, - hibernate - __kmp_global.g.g_time.dt.t_value )); - } - - KMP_MB(); - - /* main wait spin loop */ - while( TCR_4(*spin) != check ) { - int in_pool; - -#if OMP_30_ENABLED - // - // If the task team is NULL, it means one of things: - // 1) A newly-created thread is first being released by - // __kmp_fork_barrier(), and its task team has not been set up - // yet. - // 2) All tasks have been executed to completion, this thread has - // decremented the task team's ref ct and possibly deallocated - // it, and should no longer reference it. - // 3) Tasking is off for this region. This could be because we - // are in a serialized region (perhaps the outer one), or else - // tasking was manually disabled (KMP_TASKING=0). - // - kmp_task_team_t * task_team = NULL; - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - task_team = this_thr->th.th_task_team; - if ( task_team != NULL ) { - if ( ! TCR_SYNC_4( task_team->tt.tt_active ) ) { - KMP_DEBUG_ASSERT( ! KMP_MASTER_TID( this_thr->th.th_info.ds.ds_tid ) ); - __kmp_unref_task_team( task_team, this_thr ); - } else if ( KMP_TASKING_ENABLED( task_team, this_thr->th.th_task_state ) ) { - __kmp_execute_tasks( this_thr, th_gtid, spin, check, final_spin, &flag - USE_ITT_BUILD_ARG( itt_sync_obj ), 0); - } - }; // if - }; // if -#endif /* OMP_30_ENABLED */ - - KMP_FSYNC_SPIN_PREPARE( spin ); - if( TCR_4(__kmp_global.g.g_done) ) { - if( __kmp_global.g.g_abort ) - __kmp_abort_thread( ); - break; - } - - __kmp_static_delay( 1 ); - - /* if we are oversubscribed, - or have waited a bit (and KMP_LIBRARY=throughput), then yield */ - KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); - // TODO: Should it be number of cores instead of thread contexts? Like: - // KMP_YIELD( TCR_4(__kmp_nth) > __kmp_ncores ); - // Need performance improvement data to make the change... - KMP_YIELD_SPIN( spins ); - - // - // Check if this thread was transferred from a team - // to the thread pool (or vice-versa) while spinning. - // - in_pool = !!TCR_4(this_thr->th.th_in_pool); - if ( in_pool != !!this_thr->th.th_active_in_pool ) { - if ( in_pool ) { - // - // recently transferred from team to pool - // - KMP_TEST_THEN_INC32( - (kmp_int32 *) &__kmp_thread_pool_active_nth ); - this_thr->th.th_active_in_pool = TRUE; - - // - // Here, we cannot assert that - // - // KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) - // <= __kmp_thread_pool_nth ); - // - // __kmp_thread_pool_nth is inc/dec'd by the master thread - // while the fork/join lock is held, whereas - // __kmp_thread_pool_active_nth is inc/dec'd asynchronously - // by the workers. The two can get out of sync for brief - // periods of time. - // - } - else { - // - // recently transferred from pool to team - // - KMP_TEST_THEN_DEC32( - (kmp_int32 *) &__kmp_thread_pool_active_nth ); - KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 ); - this_thr->th.th_active_in_pool = FALSE; - } - } - -#if OMP_30_ENABLED - // Don't suspend if there is a likelihood of new tasks being spawned. 
- if ( ( task_team != NULL ) && TCR_4(task_team->tt.tt_found_tasks) ) { - continue; - } -#endif /* OMP_30_ENABLED */ - - /* Don't suspend if KMP_BLOCKTIME is set to "infinite" */ - if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) { - continue; - } - - /* if we have waited a bit more, fall asleep */ - if ( TCR_4( __kmp_global.g.g_time.dt.t_value ) < hibernate ) { - continue; - } - - KF_TRACE( 50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid ) ); - - __kmp_suspend( th_gtid, spin, check ); - - if( TCR_4( __kmp_global.g.g_done ) && __kmp_global.g.g_abort ) { - __kmp_abort_thread( ); - } - - /* TODO */ - /* if thread is done with work and timesout, disband/free */ - } - - KMP_FSYNC_SPIN_ACQUIRED( spin ); -} - - -/* - * Release the thread specified by target_thr from waiting by setting the location - * specified by spin and resume the thread if indicated by the sleep parameter. - * - * A thread that calls __kmp_wait_sleep must call this function to wake up the - * potentially sleeping thread and prevent deadlocks! - */ - -void -__kmp_release( kmp_info_t *target_thr, volatile kmp_uint *spin, - enum kmp_mem_fence_type fetchadd_fence ) -{ - kmp_uint old_spin; - #ifdef KMP_DEBUG - int target_gtid = target_thr->th.th_info.ds.ds_gtid; - int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; - #endif - - KF_TRACE( 20, ( "__kmp_release: T#%d releasing T#%d spin(%p) fence_type(%d)\n", - gtid, target_gtid, spin, fetchadd_fence )); - - KMP_DEBUG_ASSERT( spin ); - - KMP_DEBUG_ASSERT( fetchadd_fence == kmp_acquire_fence || - fetchadd_fence == kmp_release_fence ); - - KMP_FSYNC_RELEASING( spin ); - - old_spin = ( fetchadd_fence == kmp_acquire_fence ) - ? KMP_TEST_THEN_ADD4_ACQ32( (volatile kmp_int32 *) spin ) - : KMP_TEST_THEN_ADD4_32( (volatile kmp_int32 *) spin ); - - KF_TRACE( 100, ( "__kmp_release: T#%d old spin(%p)=%d, set new spin=%d\n", - gtid, spin, old_spin, *spin ) ); - - if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) { - /* Only need to check sleep stuff if infinite block time not set */ - if ( old_spin & KMP_BARRIER_SLEEP_STATE ) { - #ifndef KMP_DEBUG - int target_gtid = target_thr->th.th_info.ds.ds_gtid; - #endif - /* wake up thread if needed */ - KF_TRACE( 50, ( "__kmp_release: T#%d waking up thread T#%d since sleep spin(%p) set\n", - gtid, target_gtid, spin )); - __kmp_resume( target_gtid, spin ); - } else { - KF_TRACE( 50, ( "__kmp_release: T#%d don't wake up thread T#%d since sleep spin(%p) not set\n", - gtid, target_gtid, spin )); - } - } -} - /* ------------------------------------------------------------------------ */ void @@ -848,13 +533,11 @@ __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr], sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id ); -#if OMP_30_ENABLED //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr], // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id ); __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr], sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id ); -#endif // OMP_30_ENABLED #if OMP_40_ENABLED __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr], sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id ); @@ -953,7 +636,7 @@ DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) { // the problem of 
unreleased forkjoin lock as described below. // A worker thread can take the forkjoin lock - // in __kmp_suspend()->__kmp_rml_decrease_load_before_sleep(). + // in __kmp_suspend_template()->__kmp_rml_decrease_load_before_sleep(). // The problem comes up if that worker thread becomes dead // before it releases the forkjoin lock. // The forkjoin lock remains taken, while the thread @@ -1034,15 +717,15 @@ __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) #endif /* BUILD_PARALLEL_ORDERED */ if( __kmp_env_consistency_check ) { - if( __kmp_threads[gtid] -> th.th_root -> r.r_active ) + if( __kmp_threads[gtid]->th.th_root->r.r_active ) __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL ); } #ifdef BUILD_PARALLEL_ORDERED - if( !team -> t.t_serialized ) { + if( !team->t.t_serialized ) { kmp_uint32 spins; KMP_MB(); - KMP_WAIT_YIELD(&team -> t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL); + KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL); KMP_MB(); } #endif /* BUILD_PARALLEL_ORDERED */ @@ -1062,16 +745,16 @@ __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref ) #endif /* BUILD_PARALLEL_ORDERED */ if( __kmp_env_consistency_check ) { - if( __kmp_threads[gtid] -> th.th_root -> r.r_active ) + if( __kmp_threads[gtid]->th.th_root->r.r_active ) __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref ); } #ifdef BUILD_PARALLEL_ORDERED - if ( ! team -> t.t_serialized ) { + if ( ! team->t.t_serialized ) { KMP_MB(); /* Flush all pending memory write invalidates. */ /* use the tid of the next thread in this team */ /* TODO repleace with general release procedure */ - team -> t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc ); + team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc ); KMP_MB(); /* Flush all pending memory write invalidates. */ } @@ -1097,12 +780,12 @@ __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws ) __kmp_parallel_initialize(); th = __kmp_threads[ gtid ]; - team = th -> th.th_team; + team = th->th.th_team; status = 0; th->th.th_ident = id_ref; - if ( team -> t.t_serialized ) { + if ( team->t.t_serialized ) { status = 1; } else { kmp_int32 old_this = th->th.th_local.this_construct; @@ -1112,7 +795,7 @@ __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws ) single block */ /* TODO: Should this be acquire or release? 
*/ - status = KMP_COMPARE_AND_STORE_ACQ32(&team -> t.t_construct, old_this, + status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this, th->th.th_local.this_construct); } @@ -1127,6 +810,10 @@ __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws ) if ( status ) { __kmp_itt_single_start( gtid ); } + if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { + __kmp_itt_metadata_single(); + } + #endif /* USE_ITT_BUILD */ return status; } @@ -1142,987 +829,6 @@ __kmp_exit_single( int gtid ) } -/* ------------------------------------------------------------------------ */ -/* ------------------------------------------------------------------------ */ - -static void -__kmp_linear_barrier_gather( enum barrier_type bt, - kmp_info_t *this_thr, - int gtid, - int tid, - void (*reduce)(void *, void *) - USE_ITT_BUILD_ARG(void * itt_sync_obj) - ) -{ - register kmp_team_t *team = this_thr -> th.th_team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads = team -> t.t_threads; - - KA_TRACE( 20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - - KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] ); - - /* - * We now perform a linear reduction to signal that all - * of the threads have arrived. - * - * Collect all the worker team member threads. - */ - if ( ! KMP_MASTER_TID( tid )) { - - KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)" - "arrived(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0, - &thr_bar -> b_arrived, thr_bar -> b_arrived, - thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP - ) ); - - /* mark arrival to master thread */ - // - // After performing this write, a worker thread may not assume that - // the team is valid any more - it could be deallocated by the master - // thread at any time. - // - __kmp_release( other_threads[0], &thr_bar -> b_arrived, kmp_release_fence ); - - } else { - register kmp_balign_team_t *team_bar = & team -> t.t_bar[ bt ]; - register int nproc = this_thr -> th.th_team_nproc; - register int i; - /* Don't have to worry about sleep bit here or atomic since team setting */ - register kmp_uint new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP; - - /* Collect all the worker team member threads. 
*/ - for (i = 1; i < nproc; i++) { -#if KMP_CACHE_MANAGE - /* prefetch next thread's arrived count */ - if ( i+1 < nproc ) - KMP_CACHE_PREFETCH( &other_threads[ i+1 ] -> th.th_bar[ bt ].bb.b_arrived ); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) " - "arrived(%p) == %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( i, team ), team->t.t_id, i, - &other_threads[i] -> th.th_bar[ bt ].bb.b_arrived, - new_state ) ); - - /* wait for worker thread to arrive */ - __kmp_wait_sleep( this_thr, - & other_threads[ i ] -> th.th_bar[ bt ].bb.b_arrived, - new_state, FALSE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - - if (reduce) { - - KA_TRACE( 100, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( i, team ), team->t.t_id, i ) ); - - (*reduce)( this_thr -> th.th_local.reduce_data, - other_threads[ i ] -> th.th_local.reduce_data ); - - } - - } - - /* Don't have to worry about sleep bit here or atomic since team setting */ - team_bar -> b_arrived = new_state; - KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d " - "arrived(%p) = %u\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team_bar -> b_arrived, new_state ) ); - } - - KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} - - -static void -__kmp_tree_barrier_gather( enum barrier_type bt, - kmp_info_t *this_thr, - int gtid, - int tid, - void (*reduce) (void *, void *) - USE_ITT_BUILD_ARG( void * itt_sync_obj ) - ) -{ - register kmp_team_t *team = this_thr -> th.th_team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads = team -> t.t_threads; - register kmp_uint32 nproc = this_thr -> th.th_team_nproc; - register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits ; - register kmp_uint32 child; - register kmp_uint32 child_tid; - register kmp_uint new_state; - - KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - - KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] ); - - /* - * We now perform a tree gather to wait until all - * of the threads have arrived, and reduce any required data - * as we go. 
- */ - - child_tid = (tid << branch_bits) + 1; - - if ( child_tid < nproc ) { - - /* parent threads wait for all their children to arrive */ - new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP; - child = 1; - - do { - register kmp_info_t *child_thr = other_threads[ child_tid ]; - register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb; -#if KMP_CACHE_MANAGE - /* prefetch next thread's arrived count */ - if ( child+1 <= branch_factor && child_tid+1 < nproc ) - KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_arrived ); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " - "arrived(%p) == %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid, - &child_bar -> b_arrived, new_state ) ); - - /* wait for child to arrive */ - __kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE - USE_ITT_BUILD_ARG( itt_sync_obj) - ); - - if (reduce) { - - KA_TRACE( 100, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, - child_tid ) ); - - (*reduce)( this_thr -> th.th_local.reduce_data, - child_thr -> th.th_local.reduce_data ); - - } - - child++; - child_tid++; - } - while ( child <= branch_factor && child_tid < nproc ); - } - - if ( !KMP_MASTER_TID(tid) ) { - /* worker threads */ - register kmp_int32 parent_tid = (tid - 1) >> branch_bits; - - KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid, - &thr_bar -> b_arrived, thr_bar -> b_arrived, - thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP - ) ); - - /* mark arrival to parent thread */ - // - // After performing this write, a worker thread may not assume that - // the team is valid any more - it could be deallocated by the master - // thread at any time. 
- // - __kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence ); - - } else { - /* Need to update the team arrived pointer if we are the master thread */ - - if ( nproc > 1 ) - /* New value was already computed above */ - team -> t.t_bar[ bt ].b_arrived = new_state; - else - team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP; - - KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) ); - } - - KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} - - -static void -__kmp_hyper_barrier_gather( enum barrier_type bt, - kmp_info_t *this_thr, - int gtid, - int tid, - void (*reduce) (void *, void *) - USE_ITT_BUILD_ARG (void * itt_sync_obj) - ) -{ - register kmp_team_t *team = this_thr -> th.th_team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads = team -> t.t_threads; - register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE; - register kmp_uint32 num_threads = this_thr -> th.th_team_nproc; - register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits ; - register kmp_uint32 offset; - register kmp_uint32 level; - - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - - KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] ); - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier imbalance - save arrive time to the thread - if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) { - this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); - } -#endif - /* - * We now perform a hypercube-embedded tree gather to wait until all - * of the threads have arrived, and reduce any required data - * as we go. - */ - - for ( level=0, offset =1; - offset < num_threads; - level += branch_bits, offset <<= branch_bits ) - { - register kmp_uint32 child; - register kmp_uint32 child_tid; - - if ( ((tid >> level) & (branch_factor - 1)) != 0 ) { - register kmp_int32 parent_tid = tid & ~( (1 << (level + branch_bits)) -1 ); - - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) " - "arrived(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid, - &thr_bar -> b_arrived, thr_bar -> b_arrived, - thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP - ) ); - - /* mark arrival to parent thread */ - // - // After performing this write (in the last iteration of the - // enclosing for loop), a worker thread may not assume that the - // team is valid any more - it could be deallocated by the master - // thread at any time. 
- // - __kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence ); - break; - } - - /* parent threads wait for children to arrive */ - - if (new_state == KMP_BARRIER_UNUSED_STATE) - new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP; - - for ( child = 1, child_tid = tid + (1 << level); - child < branch_factor && child_tid < num_threads; - child++, child_tid += (1 << level) ) - { - register kmp_info_t *child_thr = other_threads[ child_tid ]; - register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb; -#if KMP_CACHE_MANAGE - register kmp_uint32 next_child_tid = child_tid + (1 << level); - /* prefetch next thread's arrived count */ - if ( child+1 < branch_factor && next_child_tid < num_threads ) - KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ] -> th.th_bar[ bt ].bb.b_arrived ); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) " - "arrived(%p) == %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid, - &child_bar -> b_arrived, new_state ) ); - - /* wait for child to arrive */ - __kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE - USE_ITT_BUILD_ARG (itt_sync_obj) - ); - -#if USE_ITT_BUILD - // Barrier imbalance - write min of the thread time and a child time to the thread. - if( __kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3 ) { - this_thr->th.th_bar_arrive_time = KMP_MIN( this_thr->th.th_bar_arrive_time, child_thr->th.th_bar_arrive_time ); - } -#endif - if (reduce) { - - KA_TRACE( 100, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, - child_tid ) ); - - (*reduce)( this_thr -> th.th_local.reduce_data, - child_thr -> th.th_local.reduce_data ); - - } - } - } - - - if ( KMP_MASTER_TID(tid) ) { - /* Need to update the team arrived pointer if we are the master thread */ - - if (new_state == KMP_BARRIER_UNUSED_STATE) - team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP; - else - team -> t.t_bar[ bt ].b_arrived = new_state; - - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n", - gtid, team->t.t_id, tid, team->t.t_id, - &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) ); - } - - KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - -} - -static void -__kmp_linear_barrier_release( enum barrier_type bt, - kmp_info_t *this_thr, - int gtid, - int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void * itt_sync_obj) - ) -{ - register kmp_bstate_t *thr_bar = &this_thr -> th.th_bar[ bt ].bb; - register kmp_team_t *team; - - if (KMP_MASTER_TID( tid )) { - register unsigned int i; - register kmp_uint32 nproc = this_thr -> th.th_team_nproc; - register kmp_info_t **other_threads; - - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - other_threads = team -> t.t_threads; - - KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - - if (nproc > 1) { -#if KMP_BARRIER_ICV_PUSH - if ( propagate_icvs ) { - load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs); - for (i = 1; i < nproc; i++) { - __kmp_init_implicit_task( team->t.t_ident, - team->t.t_threads[i], team, i, FALSE ); - store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs); - } - 
sync_icvs(); - } -#endif // KMP_BARRIER_ICV_PUSH - - /* Now, release all of the worker threads */ - for (i = 1; i < nproc; i++) { -#if KMP_CACHE_MANAGE - /* prefetch next thread's go flag */ - if( i+1 < nproc ) - KMP_CACHE_PREFETCH( &other_threads[ i+1 ]-> th.th_bar[ bt ].bb.b_go ); -#endif /* KMP_CACHE_MANAGE */ - KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) " - "go(%p): %u => %u\n", - gtid, team->t.t_id, tid, - other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i, - &other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]->th.th_bar[bt].bb.b_go, - other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP - ) ); - - __kmp_release( other_threads[ i ], - &other_threads[ i ]-> th.th_bar[ bt ].bb.b_go, kmp_acquire_fence ); - } - } - } else { - /* Wait for the MASTER thread to release us */ - - KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); - - __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) - ); - -#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled) - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 ); - // cancel wait on previous parallel region... - __kmp_itt_task_starting( itt_sync_obj ); - - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - if ( itt_sync_obj != NULL ) - __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier - - } else -#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */ - // - // early exit for reaping threads releasing forkjoin barrier - // - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - // - // The worker thread may now assume that the team is valid. - // -#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY - // libguide only code (cannot use *itt_task* routines) - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls... - } -#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */ - #ifdef KMP_DEBUG - tid = __kmp_tid_from_gtid( gtid ); - team = __kmp_threads[ gtid ]-> th.th_team; - #endif - KMP_DEBUG_ASSERT( team != NULL ); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE( 20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); - - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - } - - KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} - - -static void -__kmp_tree_barrier_release( enum barrier_type bt, - kmp_info_t *this_thr, - int gtid, - int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void * itt_sync_obj) - ) -{ - /* handle fork barrier workers who aren't part of a team yet */ - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_uint32 nproc; - register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits ; - register kmp_uint32 child; - register kmp_uint32 child_tid; - - /* - * We now perform a tree release for all - * of the threads that have been gathered - */ - - if ( ! KMP_MASTER_TID( tid )) { - /* worker threads */ - - KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); - - /* wait for parent thread to release us */ - __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) - ); - -#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled) - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 ); - // cancel wait on previous parallel region... - __kmp_itt_task_starting( itt_sync_obj ); - - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - if ( itt_sync_obj != NULL ) - __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier - - } else -#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */ - // - // early exit for reaping threads releasing forkjoin barrier - // - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - // - // The worker thread may now assume that the team is valid. - // -#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY - // libguide only code (cannot use *itt_task* routines) - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls... - } -#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - tid = __kmp_tid_from_gtid( gtid ); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); - - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - - } else { - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - - KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - } - - nproc = this_thr -> th.th_team_nproc; - child_tid = ( tid << branch_bits ) + 1; - - if ( child_tid < nproc ) { - register kmp_info_t **other_threads = team -> t.t_threads; - child = 1; - /* parent threads release all their children */ - - do { - register kmp_info_t *child_thr = other_threads[ child_tid ]; - register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb; -#if KMP_CACHE_MANAGE - /* prefetch next thread's go count */ - if ( child+1 <= branch_factor && child_tid+1 < nproc ) - KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_go ); -#endif /* KMP_CACHE_MANAGE */ - -#if KMP_BARRIER_ICV_PUSH - if ( propagate_icvs ) { - __kmp_init_implicit_task( team->t.t_ident, - team->t.t_threads[child_tid], team, child_tid, FALSE ); - load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs); - sync_icvs(); - } -#endif // KMP_BARRIER_ICV_PUSH - - KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" - "go(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, - child_tid, &child_bar -> b_go, child_bar -> b_go, - child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) ); - - /* release child from barrier */ - __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence ); - - child++; - child_tid++; - } - while ( child <= branch_factor && child_tid < nproc ); - } - - KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} - -/* The reverse versions seem to beat the forward versions overall */ -#define KMP_REVERSE_HYPER_BAR -static void -__kmp_hyper_barrier_release( enum barrier_type bt, - kmp_info_t *this_thr, - int gtid, - int tid, - int propagate_icvs - USE_ITT_BUILD_ARG(void * itt_sync_obj) - ) -{ - /* handle fork barrier workers who aren't part of a team yet */ - register kmp_team_t *team; - register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb; - register kmp_info_t **other_threads; - register kmp_uint32 num_threads; - register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ]; - register kmp_uint32 branch_factor = 1 << branch_bits; - register kmp_uint32 child; - register kmp_uint32 child_tid; - register kmp_uint32 offset; - register kmp_uint32 level; - - /* Perform a hypercube-embedded tree release for all of the threads - that have been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) - the threads are released in the reverse order of the corresponding gather, - otherwise threads are released in the same order. */ - - if ( ! 
KMP_MASTER_TID( tid )) { - /* worker threads */ - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", - gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) ); - - /* wait for parent thread to release us */ - __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - -#if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 ); - // cancel wait on previous parallel region... - __kmp_itt_task_starting( itt_sync_obj ); - - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - if ( itt_sync_obj != NULL ) - __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier - - } else -#endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */ - // - // early exit for reaping threads releasing forkjoin barrier - // - if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) ) - return; - - // - // The worker thread may now assume that the team is valid. - // -#if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY - // libguide only code (cannot use *itt_task* routines) - if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) { - // we are on a fork barrier where we could not get the object reliably - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls... - } -#endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - tid = __kmp_tid_from_gtid( gtid ); - - TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE); - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", - gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) ); - - KMP_MB(); /* Flush all pending memory write invalidates. */ - - } else { /* KMP_MASTER_TID(tid) */ - team = __kmp_threads[ gtid ]-> th.th_team; - KMP_DEBUG_ASSERT( team != NULL ); - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); - } - - num_threads = this_thr -> th.th_team_nproc; - other_threads = team -> t.t_threads; - -#ifdef KMP_REVERSE_HYPER_BAR - /* count up to correct level for parent */ - for ( level = 0, offset = 1; - offset < num_threads && (((tid >> level) & (branch_factor-1)) == 0); - level += branch_bits, offset <<= branch_bits ); - - /* now go down from there */ - for ( level -= branch_bits, offset >>= branch_bits; - offset != 0; - level -= branch_bits, offset >>= branch_bits ) -#else - /* Go down the tree, level by level */ - for ( level = 0, offset = 1; - offset < num_threads; - level += branch_bits, offset <<= branch_bits ) -#endif // KMP_REVERSE_HYPER_BAR - { -#ifdef KMP_REVERSE_HYPER_BAR - /* Now go in reverse order through the children, highest to lowest. - Initial setting of child is conservative here. */ - child = num_threads >> ((level==0)?level:level-1); - for ( child = (child < branch_factor-1) ? 
child : branch_factor-1, - child_tid = tid + (child << level); - child >= 1; - child--, child_tid -= (1 << level) ) -#else - if (((tid >> level) & (branch_factor - 1)) != 0) - /* No need to go any lower than this, since this is the level - parent would be notified */ - break; - - /* iterate through children on this level of the tree */ - for ( child = 1, child_tid = tid + (1 << level); - child < branch_factor && child_tid < num_threads; - child++, child_tid += (1 << level) ) -#endif // KMP_REVERSE_HYPER_BAR - { - if ( child_tid >= num_threads ) continue; /* child doesn't exist so keep going */ - else { - register kmp_info_t *child_thr = other_threads[ child_tid ]; - register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb; -#if KMP_CACHE_MANAGE - register kmp_uint32 next_child_tid = child_tid - (1 << level); - /* prefetch next thread's go count */ -#ifdef KMP_REVERSE_HYPER_BAR - if ( child-1 >= 1 && next_child_tid < num_threads ) -#else - if ( child+1 < branch_factor && next_child_tid < num_threads ) -#endif // KMP_REVERSE_HYPER_BAR - KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go ); -#endif /* KMP_CACHE_MANAGE */ - -#if KMP_BARRIER_ICV_PUSH - if ( propagate_icvs ) { - KMP_DEBUG_ASSERT( team != NULL ); - __kmp_init_implicit_task( team->t.t_ident, - team->t.t_threads[child_tid], team, child_tid, FALSE ); - load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs); - store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs); - sync_icvs(); - } -#endif // KMP_BARRIER_ICV_PUSH - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)" - "go(%p): %u => %u\n", - gtid, team->t.t_id, tid, - __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, - child_tid, &child_bar -> b_go, child_bar -> b_go, - child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) ); - - /* release child from barrier */ - __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence ); - } - } - } - - KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n", - gtid, team->t.t_id, tid, bt ) ); -} - -/* - * Internal function to do a barrier. - * If is_split is true, do a split barrier, otherwise, do a plain barrier - * If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier - * Returns 0 if master thread, 1 if worker thread. - */ -int -__kmp_barrier( enum barrier_type bt, int gtid, int is_split, - size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) ) -{ - register int tid = __kmp_tid_from_gtid( gtid ); - register kmp_info_t *this_thr = __kmp_threads[ gtid ]; - register kmp_team_t *team = this_thr -> th.th_team; - register int status = 0; - - ident_t * tmp_loc = __kmp_threads[ gtid ]->th.th_ident; - - KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) has arrived\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) ); - - if ( ! team->t.t_serialized ) { -#if USE_ITT_BUILD - // This value will be used in itt notify events below. 
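/* Illustrative sketch, not part of the patch: a minimal stand-alone view of the
 * child enumeration used by the reverse hypercube release above. The releasing
 * thread first counts up to the level at which it would have notified its own
 * parent, then walks back down, signalling the highest-numbered child at each
 * level first (KMP_REVERSE_HYPER_BAR). The helper name, the printf and the
 * free-standing form are hypothetical; only the index arithmetic mirrors
 * __kmp_hyper_barrier_release.
 */
#include <stdio.h>

static void list_reverse_hyper_children( unsigned tid, unsigned num_threads,
                                         unsigned branch_bits )
{
    unsigned branch_factor = 1u << branch_bits;
    unsigned level, offset;

    /* count up to the correct level for the parent */
    for ( level = 0, offset = 1;
          offset < num_threads && ((tid >> level) & (branch_factor - 1)) == 0;
          level += branch_bits, offset <<= branch_bits );

    /* now go down from there, highest child first; for a thread whose lowest
       "digit" is non-zero, offset becomes 0 at once and it has no children */
    for ( level -= branch_bits, offset >>= branch_bits; offset != 0;
          level -= branch_bits, offset >>= branch_bits ) {
        unsigned child = num_threads >> ((level == 0) ? level : level - 1);
        child = (child < branch_factor - 1) ? child : branch_factor - 1;
        for ( unsigned child_tid = tid + (child << level); child >= 1;
              child--, child_tid -= (1u << level) )
            if ( child_tid < num_threads )
                printf( "T#%u releases T#%u\n", tid, child_tid );
    }
}
/* For example, list_reverse_hyper_children(0, 8, 2) prints T#0 releasing
   T#4, T#3, T#2, T#1 in that order. */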
- void * itt_sync_obj = NULL; - #if USE_ITT_NOTIFY - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) - itt_sync_obj = __kmp_itt_barrier_object( gtid, bt, 1 ); - #endif -#endif /* USE_ITT_BUILD */ - #if OMP_30_ENABLED - if ( __kmp_tasking_mode == tskm_extra_barrier ) { - __kmp_tasking_barrier( team, this_thr, gtid ); - KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) ); - } - #endif /* OMP_30_ENABLED */ - - // - // Copy the blocktime info to the thread, where __kmp_wait_sleep() - // can access it when the team struct is not guaranteed to exist. - // - // See the note about the corresponding code in __kmp_join_barrier() - // being performance-critical. - // - if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) { - #if OMP_30_ENABLED - this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set; - #else - this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid]; - this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid]; - #endif // OMP_30_ENABLED - } - -#if USE_ITT_BUILD - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); -#endif /* USE_ITT_BUILD */ - - if ( reduce != NULL ) { - //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956 - this_thr -> th.th_local.reduce_data = reduce_data; - } - if ( __kmp_barrier_gather_pattern[ bt ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bt ] == 0 ) { - __kmp_linear_barrier_gather( bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else if ( __kmp_barrier_gather_pattern[ bt ] == bp_tree_bar ) { - __kmp_tree_barrier_gather( bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else { - __kmp_hyper_barrier_gather( bt, this_thr, gtid, tid, reduce - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - }; // if - -#if USE_ITT_BUILD - // TODO: In case of split reduction barrier, master thread may send acquired event early, - // before the final summation into the shared variable is done (final summation can be a - // long operation for array reductions). - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) - __kmp_itt_barrier_middle( gtid, itt_sync_obj ); -#endif /* USE_ITT_BUILD */ - - KMP_MB(); - - if ( KMP_MASTER_TID( tid ) ) { - status = 0; - - #if OMP_30_ENABLED - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - __kmp_task_team_wait( this_thr, team - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - __kmp_task_team_setup( this_thr, team ); - } - #endif /* OMP_30_ENABLED */ - - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier - report frame end - if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) { - kmp_uint64 tmp = __itt_get_timestamp(); - switch( __kmp_forkjoin_frames_mode ) { - case 1: - __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc ); - this_thr->th.th_frame_time = tmp; - break; - case 2: - __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc ); - break; - case 3: - __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, tmp_loc ); - __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, tmp_loc ); - this_thr->th.th_frame_time = tmp; - break; - } - } -#endif /* USE_ITT_BUILD */ - } else { - status = 1; - } - if ( status == 1 || ! 
is_split ) { - if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) { - __kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) { - __kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else { - __kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } - #if OMP_30_ENABLED - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - __kmp_task_team_sync( this_thr, team ); - } - #endif /* OMP_30_ENABLED */ - } - -#if USE_ITT_BUILD - // GEH: TODO: Move this under if-condition above and also include in __kmp_end_split_barrier(). - // This will more accurately represent the actual release time of the threads for split barriers. - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) - __kmp_itt_barrier_finished( gtid, itt_sync_obj ); -#endif /* USE_ITT_BUILD */ - - } else { // Team is serialized. - - status = 0; - - #if OMP_30_ENABLED - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - // - // The task team should be NULL for serialized code. - // (tasks will be executed immediately). - // - KMP_DEBUG_ASSERT( team->t.t_task_team == NULL ); - KMP_DEBUG_ASSERT( this_thr->th.th_task_team == NULL ); - } - #endif /* OMP_30_ENABLED */ - } - - KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n", - gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), - status ) ); - return status; -} - - -void -__kmp_end_split_barrier( enum barrier_type bt, int gtid ) -{ - int tid = __kmp_tid_from_gtid( gtid ); - kmp_info_t *this_thr = __kmp_threads[ gtid ]; - kmp_team_t *team = this_thr -> th.th_team; - - if( ! 
team -> t.t_serialized ) { - if( KMP_MASTER_GTID( gtid ) ) { - if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) { - __kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE -#if USE_ITT_BUILD - , NULL -#endif /* USE_ITT_BUILD */ - ); - } else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) { - __kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE -#if USE_ITT_BUILD - , NULL -#endif /* USE_ITT_BUILD */ - ); - } else { - __kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE -#if USE_ITT_BUILD - , NULL -#endif /* USE_ITT_BUILD */ - ); - }; // if - #if OMP_30_ENABLED - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - __kmp_task_team_sync( this_thr, team ); - }; // if - #endif /* OMP_30_ENABLED */ - } - } -} - -/* ------------------------------------------------------------------------ */ -/* ------------------------------------------------------------------------ */ - /* * determine if we can go parallel or must use a serialized parallel region and * how many threads we can use @@ -2331,45 +1037,83 @@ __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team, kmp_info_t *master_th, int master_gtid ) { int i; + int use_hot_team; KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) ); KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() ); KMP_MB(); /* first, let's setup the master thread */ - master_th -> th.th_info.ds.ds_tid = 0; - master_th -> th.th_team = team; - master_th -> th.th_team_nproc = team -> t.t_nproc; - master_th -> th.th_team_master = master_th; - master_th -> th.th_team_serialized = FALSE; - master_th -> th.th_dispatch = & team -> t.t_dispatch[ 0 ]; + master_th->th.th_info.ds.ds_tid = 0; + master_th->th.th_team = team; + master_th->th.th_team_nproc = team->t.t_nproc; + master_th->th.th_team_master = master_th; + master_th->th.th_team_serialized = FALSE; + master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ]; /* make sure we are not the optimized hot team */ - if ( team != root->r.r_hot_team ) { +#if KMP_NESTED_HOT_TEAMS + use_hot_team = 0; + kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; + if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0 + int level = team->t.t_active_level - 1; // index in array of hot teams + if( master_th->th.th_teams_microtask ) { // are we inside the teams? 
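/* Annotation, not part of the patch: as the surrounding comments explain, the
 * "++level" adjustments below compensate for the teams construct, where the
 * level was not increased for the team of masters (and, before the nested
 * parallel, for the team of workers). The corrected value is then usable as an
 * index into the th_hot_teams array, which is only allocated when
 * KMP_HOT_TEAMS_MAX_LEVEL is greater than zero.
 */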
+ if( master_th->th.th_teams_size.nteams > 1 ) { + ++level; // level was not increased in teams construct for team_of_masters + } + if( team->t.t_pkfn != (microtask_t)__kmp_teams_master && + master_th->th.th_teams_level == team->t.t_level ) { + ++level; // level was not increased in teams construct for team_of_workers before the parallel + } // team->t.t_level will be increased inside parallel + } + if( level < __kmp_hot_teams_max_level ) { + if( hot_teams[level].hot_team ) { + // hot team has already been allocated for given level + KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); + use_hot_team = 1; // the team is ready to use + } else { + use_hot_team = 0; // AC: threads are not allocated yet + hot_teams[level].hot_team = team; // remember new hot team + hot_teams[level].hot_team_nth = team->t.t_nproc; + } + } else { + use_hot_team = 0; + } + } +#else + use_hot_team = team == root->r.r_hot_team; +#endif + if ( !use_hot_team ) { /* install the master thread */ - team -> t.t_threads[ 0 ] = master_th; + team->t.t_threads[ 0 ] = master_th; __kmp_initialize_info( master_th, team, 0, master_gtid ); /* now, install the worker threads */ for ( i=1 ; i < team->t.t_nproc ; i++ ) { /* fork or reallocate a new thread and install it in team */ - team -> t.t_threads[ i ] = __kmp_allocate_thread( root, team, i ); - KMP_DEBUG_ASSERT( team->t.t_threads[i] ); - KMP_DEBUG_ASSERT( team->t.t_threads[i]->th.th_team == team ); + kmp_info_t *thr = __kmp_allocate_thread( root, team, i ); + team->t.t_threads[ i ] = thr; + KMP_DEBUG_ASSERT( thr ); + KMP_DEBUG_ASSERT( thr->th.th_team == team ); /* align team and thread arrived states */ KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n", __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0, __kmp_gtid_from_tid( i, team ), team->t.t_id, i, team->t.t_bar[ bs_forkjoin_barrier ].b_arrived, team->t.t_bar[ bs_plain_barrier ].b_arrived ) ); - +#if OMP_40_ENABLED + thr->th.th_teams_microtask = master_th->th.th_teams_microtask; + thr->th.th_teams_level = master_th->th.th_teams_level; + thr->th.th_teams_size = master_th->th.th_teams_size; +#endif { // Initialize threads' barrier data. int b; kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar; for ( b = 0; b < bs_last_barrier; ++ b ) { balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); }; // for b } } @@ -2383,19 +1127,289 @@ __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team, KMP_MB(); } +#if KMP_ARCH_X86 || KMP_ARCH_X86_64 +// +// Propagate any changes to the floating point control registers out to the team +// We try to avoid unnecessary writes to the relevant cache line in the team structure, +// so we don't make changes unless they are needed. +// +inline static void +propagateFPControl(kmp_team_t * team) +{ + if ( __kmp_inherit_fp_control ) { + kmp_int16 x87_fpu_control_word; + kmp_uint32 mxcsr; + + // Get master values of FPU control flags (both X87 and vector) + __kmp_store_x87_fpu_control_word( &x87_fpu_control_word ); + __kmp_store_mxcsr( &mxcsr ); + mxcsr &= KMP_X86_MXCSR_MASK; + + // There is no point looking at t_fp_control_saved here. + // If it is TRUE, we still have to update the values if they are different from those we now have. + // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure + // that the values in the team are the same as those we have. 
+ // So, this code achieves what we need whether or not t_fp_control_saved is true. + // By checking whether the value needs updating we avoid unnecessary writes that would put the + // cache-line into a written state, causing all threads in the team to have to read it again. + if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) { + team->t.t_x87_fpu_control_word = x87_fpu_control_word; + } + if ( team->t.t_mxcsr != mxcsr ) { + team->t.t_mxcsr = mxcsr; + } + // Although we don't use this value, other code in the runtime wants to know whether it should restore them. + // So we must ensure it is correct. + if (!team->t.t_fp_control_saved) { + team->t.t_fp_control_saved = TRUE; + } + } + else { + // Similarly here. Don't write to this cache-line in the team structure unless we have to. + if (team->t.t_fp_control_saved) + team->t.t_fp_control_saved = FALSE; + } +} + +// Do the opposite, setting the hardware registers to the updated values from the team. +inline static void +updateHWFPControl(kmp_team_t * team) +{ + if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) { + // + // Only reset the fp control regs if they have been changed in the team. + // the parallel region that we are exiting. + // + kmp_int16 x87_fpu_control_word; + kmp_uint32 mxcsr; + __kmp_store_x87_fpu_control_word( &x87_fpu_control_word ); + __kmp_store_mxcsr( &mxcsr ); + mxcsr &= KMP_X86_MXCSR_MASK; + + if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) { + __kmp_clear_x87_fpu_status_word(); + __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word ); + } + + if ( team->t.t_mxcsr != mxcsr ) { + __kmp_load_mxcsr( &team->t.t_mxcsr ); + } + } +} +#else +# define propagateFPControl(x) ((void)0) +# define updateHWFPControl(x) ((void)0) +#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + static void __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration -static void -__kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, -#if OMP_30_ENABLED - kmp_internal_control_t * new_icvs, - ident_t * loc -#else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set -#endif // OMP_30_ENABLED - ); // forward declaration +/* + * Run a parallel region that has been serialized, so runs only in a team of the single master thread. + */ +void +__kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) +{ + kmp_info_t *this_thr; + kmp_team_t *serial_team; + + KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) ); + + /* Skip all this code for autopar serialized loops since it results in + unacceptable overhead */ + if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) ) + return; + + if( ! 
TCR_4( __kmp_init_parallel ) ) + __kmp_parallel_initialize(); + + this_thr = __kmp_threads[ global_tid ]; + serial_team = this_thr->th.th_serial_team; + + /* utilize the serialized team held by this thread */ + KMP_DEBUG_ASSERT( serial_team ); + KMP_MB(); + + if ( __kmp_tasking_mode != tskm_immediate_exec ) { + KMP_DEBUG_ASSERT( this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team ); + KMP_DEBUG_ASSERT( serial_team->t.t_task_team == NULL ); + KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n", + global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) ); + this_thr->th.th_task_team = NULL; + } + +#if OMP_40_ENABLED + kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; + if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) { + proc_bind = proc_bind_false; + } + else if ( proc_bind == proc_bind_default ) { + // + // No proc_bind clause was specified, so use the current value + // of proc-bind-var for this parallel region. + // + proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; + } + // + // Reset for next parallel region + // + this_thr->th.th_set_proc_bind = proc_bind_default; +#endif /* OMP_40_ENABLED */ + + if( this_thr->th.th_team != serial_team ) { + // Nested level will be an index in the nested nthreads array + int level = this_thr->th.th_team->t.t_level; + + if( serial_team->t.t_serialized ) { + /* this serial team was already used + * TODO increase performance by making this locks more specific */ + kmp_team_t *new_team; + int tid = this_thr->th.th_info.ds.ds_tid; + + __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock ); + + new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1, +#if OMP_40_ENABLED + proc_bind, +#endif + & this_thr->th.th_current_task->td_icvs, + 0 USE_NESTED_HOT_ARG(NULL) ); + __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); + KMP_ASSERT( new_team ); + + /* setup new serialized team and install it */ + new_team->t.t_threads[0] = this_thr; + new_team->t.t_parent = this_thr->th.th_team; + serial_team = new_team; + this_thr->th.th_serial_team = serial_team; + + KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", + global_tid, serial_team ) ); + + + /* TODO the above breaks the requirement that if we run out of + * resources, then we can still guarantee that serialized teams + * are ok, since we may need to allocate a new one */ + } else { + KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", + global_tid, serial_team ) ); + } + + /* we have to initialize this serial team */ + KMP_DEBUG_ASSERT( serial_team->t.t_threads ); + KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr ); + KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team ); + serial_team->t.t_ident = loc; + serial_team->t.t_serialized = 1; + serial_team->t.t_nproc = 1; + serial_team->t.t_parent = this_thr->th.th_team; + serial_team->t.t_sched = this_thr->th.th_team->t.t_sched; + this_thr->th.th_team = serial_team; + serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; + + KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#d curtask=%p\n", + global_tid, this_thr->th.th_current_task ) ); + KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 ); + this_thr->th.th_current_task->td_flags.executing = 0; + + __kmp_push_current_task_to_thread( this_thr, serial_team, 0 ); + + /* TODO: GEH: do the ICVs work for nested serialized teams? 
Don't we need an implicit task for + each serialized task represented by team->t.t_serialized? */ + copy_icvs( + & this_thr->th.th_current_task->td_icvs, + & this_thr->th.th_current_task->td_parent->td_icvs ); + + // Thread value exists in the nested nthreads array for the next nested level + if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) { + this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ]; + } + +#if OMP_40_ENABLED + if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) { + this_thr->th.th_current_task->td_icvs.proc_bind + = __kmp_nested_proc_bind.bind_types[ level + 1 ]; + } +#endif /* OMP_40_ENABLED */ + + this_thr->th.th_info.ds.ds_tid = 0; + + /* set thread cache values */ + this_thr->th.th_team_nproc = 1; + this_thr->th.th_team_master = this_thr; + this_thr->th.th_team_serialized = 1; + + serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; + serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; + + propagateFPControl (serial_team); + + /* check if we need to allocate dispatch buffers stack */ + KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); + if ( !serial_team->t.t_dispatch->th_disp_buffer ) { + serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *) + __kmp_allocate( sizeof( dispatch_private_info_t ) ); + } + this_thr->th.th_dispatch = serial_team->t.t_dispatch; + + KMP_MB(); + + } else { + /* this serialized team is already being used, + * that's fine, just add another nested level */ + KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team ); + KMP_DEBUG_ASSERT( serial_team->t.t_threads ); + KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr ); + ++ serial_team->t.t_serialized; + this_thr->th.th_team_serialized = serial_team->t.t_serialized; + + // Nested level will be an index in the nested nthreads array + int level = this_thr->th.th_team->t.t_level; + // Thread value exists in the nested nthreads array for the next nested level + if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) { + this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ]; + } + serial_team->t.t_level++; + KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n", + global_tid, serial_team, serial_team->t.t_level ) ); + + /* allocate/push dispatch buffers stack */ + KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); + { + dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *) + __kmp_allocate( sizeof( dispatch_private_info_t ) ); + disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; + serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; + } + this_thr->th.th_dispatch = serial_team->t.t_dispatch; + + KMP_MB(); + } + + if ( __kmp_env_consistency_check ) + __kmp_push_parallel( global_tid, NULL ); + +#if USE_ITT_BUILD + // Mark the start of the "parallel" region for VTune. Only use one of frame notification scheme at the moment + if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) + { + this_thr->th.th_ident = loc; + // 0 - no barriers; 1 - serialized parallel + __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 ); + } + // Save the start of the "parallel" region for VTune. This is the join barrier begin at the same time. 
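/* Illustrative sketch, not part of the patch: the nested serialized-parallel
 * code above keeps one dispatch buffer per nesting level on a singly linked
 * stack hanging off th_disp_buffer. Below is a minimal stand-alone version of
 * that push, plus a presumed matching pop for when the nesting unwinds; the
 * type, the helper names and plain calloc/free are hypothetical stand-ins for
 * the runtime's dispatch_private_info_t and __kmp_allocate.
 */
#include <stdlib.h>

typedef struct disp_buf {
    struct disp_buf *next;          /* the enclosing nesting level's buffer */
    /* ... per-level dispatch state would live here ... */
} disp_buf_t;

static void push_disp_buf( disp_buf_t **top )
{
    disp_buf_t *b = (disp_buf_t *)calloc( 1, sizeof( *b ) );
    if ( !b )
        return;
    b->next = *top;                 /* new level points at the enclosing one */
    *top    = b;                    /* and becomes the current buffer */
}

static void pop_disp_buf( disp_buf_t **top )
{
    disp_buf_t *b = *top;           /* presumed mirror operation on region exit */
    *top = b->next;
    free( b );
}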
+ if( ( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && + __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) || KMP_ITT_DEBUG ) + { + this_thr->th.th_ident = loc; +#if USE_ITT_NOTIFY + if( this_thr->th.th_team->t.t_level == 1 ) { + serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp(); + } +#endif + } +#endif /* USE_ITT_BUILD */ +} /* most of the work for a fork */ /* return true if we really went parallel, false if serialized */ @@ -2403,9 +1417,7 @@ int __kmp_fork_call( ident_t * loc, int gtid, - int exec_master, // 0 - GNU native code, master doesn't invoke microtask - // 1 - Intel code, master invokes microtask - // 2 - MS native code, use special invoker + enum fork_context_e call_context, // Intel, GNU, ... kmp_int32 argc, microtask_t microtask, launch_t invoker, @@ -2421,7 +1433,6 @@ __kmp_fork_call( int i; int master_tid; int master_this_cons; - int master_last_cons; kmp_team_t *team; kmp_team_t *parent_team; kmp_info_t *master_th; @@ -2431,38 +1442,59 @@ __kmp_fork_call( int master_set_numthreads; int level; #if OMP_40_ENABLED + int active_level; int teams_level; #endif +#if KMP_NESTED_HOT_TEAMS + kmp_hot_team_ptr_t **p_hot_teams; +#endif + { // KMP_TIME_BLOCK + KMP_TIME_BLOCK(KMP_fork_call); KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid )); + if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) { + /* Some systems prefer the stack for the root thread(s) to start with */ + /* some gap from the parent stack to prevent false sharing. */ + void *dummy = alloca(__kmp_stkpadding); + /* These 2 lines below are so this does not get optimized out */ + if ( __kmp_stkpadding > KMP_MAX_STKPADDING ) + __kmp_stkpadding += (short)((kmp_int64)dummy); + } /* initialize if needed */ - KMP_DEBUG_ASSERT( __kmp_init_serial ); + KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown if( ! 
TCR_4(__kmp_init_parallel) ) __kmp_parallel_initialize(); /* setup current data */ - master_th = __kmp_threads[ gtid ]; - parent_team = master_th -> th.th_team; - master_tid = master_th -> th.th_info.ds.ds_tid; - master_this_cons = master_th -> th.th_local.this_construct; - master_last_cons = master_th -> th.th_local.last_construct; - root = master_th -> th.th_root; - master_active = root -> r.r_active; - master_set_numthreads = master_th -> th.th_set_nproc; -#if OMP_30_ENABLED + master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown + parent_team = master_th->th.th_team; + master_tid = master_th->th.th_info.ds.ds_tid; + master_this_cons = master_th->th.th_local.this_construct; + root = master_th->th.th_root; + master_active = root->r.r_active; + master_set_numthreads = master_th->th.th_set_nproc; // Nested level will be an index in the nested nthreads array level = parent_team->t.t_level; -#endif // OMP_30_ENABLED #if OMP_40_ENABLED + active_level = parent_team->t.t_active_level; // is used to launch non-serial teams even if nested is not allowed teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams #endif +#if KMP_NESTED_HOT_TEAMS + p_hot_teams = &master_th->th.th_hot_teams; + if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) { + *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate( + sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); + (*p_hot_teams)[0].hot_team = root->r.r_hot_team; + (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0) + } +#endif master_th->th.th_ident = loc; #if OMP_40_ENABLED - if ( master_th->th.th_team_microtask && + if ( master_th->th.th_teams_microtask && ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) { // AC: This is start of parallel that is nested inside teams construct. // The team is actual (hot), all workers are ready at the fork barrier. @@ -2484,6 +1516,7 @@ __kmp_fork_call( KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 ); parent_team->t.t_serialized--; // AC: need this in order enquiry functions // work correctly, will restore at join time + KMP_TIME_BLOCK(OMP_work); __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv ); return TRUE; } @@ -2495,7 +1528,7 @@ __kmp_fork_call( /* Change number of threads in the team if requested */ if ( master_set_numthreads ) { // The parallel has num_threads clause - if ( master_set_numthreads < master_th->th.th_set_nth_teams ) { + if ( master_set_numthreads < master_th->th.th_teams_size.nth ) { // AC: only can reduce the number of threads dynamically, cannot increase kmp_info_t **other_threads = parent_team->t.t_threads; parent_team->t.t_nproc = master_set_numthreads; @@ -2516,8 +1549,11 @@ __kmp_fork_call( KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) ); - if (! parent_team->t.t_invoke( gtid )) { - KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" ); + { + KMP_TIME_BLOCK(OMP_work); + if (! 
parent_team->t.t_invoke( gtid )) { + KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" ); + } } KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) ); @@ -2526,43 +1562,37 @@ __kmp_fork_call( KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid )); return TRUE; - } + } // Parallel closely nested in teams construct #endif /* OMP_40_ENABLED */ -#if OMP_30_ENABLED && KMP_DEBUG +#if KMP_DEBUG if ( __kmp_tasking_mode != tskm_immediate_exec ) { KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team ); } -#endif // OMP_30_ENABLED +#endif /* determine how many new threads we can use */ __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock ); -#if OMP_30_ENABLED if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) { nthreads = 1; - } - else -#endif // OMP_30_ENABLED - - { + } else { nthreads = master_set_numthreads ? - master_set_numthreads : get__nproc_2( parent_team, master_tid ); - nthreads = __kmp_reserve_threads( root, parent_team, master_tid, nthreads + master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task + nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads #if OMP_40_ENABLED - // AC: If we execute teams from parallel region (on host), then teams - // should be created but each can only have 1 thread if nesting is disabled. - // If teams called from serial region, then teams and their threads - // should be created regardless of the nesting setting. - ,( ( ap == NULL && teams_level == 0 ) || - ( ap && teams_level > 0 && teams_level == level ) ) +/* AC: If we execute teams from parallel region (on host), then teams should be created + but each can only have 1 thread if nesting is disabled. If teams called from serial region, + then teams and their threads should be created regardless of the nesting setting. */ + , ((ap==NULL && active_level==0) || + (ap && teams_level>0 && teams_level==level)) #endif /* OMP_40_ENABLED */ - ); + ); } KMP_DEBUG_ASSERT( nthreads > 0 ); /* If we temporarily changed the set number of threads then restore it now */ - master_th -> th.th_set_nproc = 0; + master_th->th.th_set_nproc = 0; /* create a serialized parallel region? */ @@ -2579,19 +1609,18 @@ __kmp_fork_call( __kmpc_serialized_parallel(loc, gtid); - if ( exec_master == 0 ) { - // we were called from GNU native code - KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid )); - return FALSE; - } else if ( exec_master == 1 ) { + if ( call_context == fork_context_intel ) { /* TODO this sucks, use the compiler itself to pass args! 
:) */ - master_th -> th.th_serial_team -> t.t_ident = loc; + master_th->th.th_serial_team->t.t_ident = loc; #if OMP_40_ENABLED if ( !ap ) { // revert change made in __kmpc_serialized_parallel() - master_th -> th.th_serial_team -> t.t_level--; + master_th->th.th_serial_team->t.t_level--; // Get args from parent team for teams construct - __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv ); + { + KMP_TIME_BLOCK(OMP_work); + __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv ); + } } else if ( microtask == (microtask_t)__kmp_teams_master ) { KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team ); team = master_th->th.th_team; @@ -2602,12 +1631,12 @@ __kmp_fork_call( argv = (void**) team->t.t_argv; if ( ap ) { for( i=argc-1; i >= 0; --i ) - /* TODO: revert workaround for Intel(R) 64 tracker #96 */ - #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX +// TODO: revert workaround for Intel(R) 64 tracker #96 +# if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); - #else +# else *argv++ = va_arg( ap, void * ); - #endif +# endif } else { for( i=0; i < argc; ++i ) // Get args from parent team for teams construct @@ -2617,127 +1646,109 @@ __kmp_fork_call( // because initial code in teams should have level=0 team->t.t_level--; // AC: call special invoker for outer "parallel" of the teams construct - invoker(gtid); + { + KMP_TIME_BLOCK(OMP_work); + invoker(gtid); + } } else { #endif /* OMP_40_ENABLED */ argv = args; for( i=argc-1; i >= 0; --i ) - /* TODO: revert workaround for Intel(R) 64 tracker #96 */ - #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX +// TODO: revert workaround for Intel(R) 64 tracker #96 +#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); - #else +#else *argv++ = va_arg( ap, void * ); - #endif +#endif KMP_MB(); - __kmp_invoke_microtask( microtask, gtid, 0, argc, args ); + { + KMP_TIME_BLOCK(OMP_work); + __kmp_invoke_microtask( microtask, gtid, 0, argc, args ); + } #if OMP_40_ENABLED } #endif /* OMP_40_ENABLED */ } + else if ( call_context == fork_context_gnu ) { + // we were called from GNU native code + KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid )); + return FALSE; + } else { - KMP_ASSERT2( exec_master <= 1, "__kmp_fork_call: unknown parameter exec_master" ); + KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" ); } - KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid )); + KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid )); KMP_MB(); return FALSE; } -#if OMP_30_ENABLED // GEH: only modify the executing flag in the case when not serialized // serialized case is handled in kmpc_serialized_parallel KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n", - parent_team->t.t_active_level, master_th, master_th->th.th_current_task, - master_th->th.th_current_task->td_icvs.max_active_levels ) ); + parent_team->t.t_active_level, master_th, master_th->th.th_current_task, + master_th->th.th_current_task->td_icvs.max_active_levels ) ); // TODO: GEH - cannot do this assertion because root thread not set up as executing // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); master_th->th.th_current_task->td_flags.executing = 0; -#endif #if OMP_40_ENABLED - if ( !master_th->th.th_team_microtask || level > teams_level ) + if ( !master_th->th.th_teams_microtask || level > teams_level ) #endif /* OMP_40_ENABLED 
*/ { /* Increment our nested depth level */ KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel ); } -#if OMP_30_ENABLED - // // See if we need to make a copy of the ICVs. - // int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; - if ( ( level + 1 < __kmp_nested_nth.used ) && - ( __kmp_nested_nth.nth[level + 1] != nthreads_icv ) ) { - nthreads_icv = __kmp_nested_nth.nth[level + 1]; + if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) { + nthreads_icv = __kmp_nested_nth.nth[level+1]; } else { nthreads_icv = 0; // don't update } #if OMP_40_ENABLED - // // Figure out the proc_bind_policy for the new team. - // kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; - kmp_proc_bind_t proc_bind_icv; // proc_bind_default means don't update - + kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) { proc_bind = proc_bind_false; - proc_bind_icv = proc_bind_default; } else { - proc_bind_icv = master_th->th.th_current_task->td_icvs.proc_bind; - if ( proc_bind == proc_bind_default ) { - // - // No proc_bind clause was specified, so use the current value - // of proc-bind-var for this parallel region. - // - proc_bind = proc_bind_icv; - } - else { - // - // The proc_bind policy was specified explicitly on the parallel - // clause. This overrides the proc-bind-var for this parallel - // region, but does not change proc-bind-var. - // + if (proc_bind == proc_bind_default) { + // No proc_bind clause specified; use current proc-bind-var for this parallel region + proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; } - - // + /* else: The proc_bind policy was specified explicitly on parallel clause. This + overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */ // Figure the value of proc-bind-var for the child threads. 
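/* Illustrative sketch, not part of the patch: the sentinel pattern used above
 * for nthreads_icv (and below for proc_bind_icv). The nested lists are
 * consulted at index level+1, and a "don't update" sentinel (0, or
 * proc_bind_default) is kept when the deeper entry would not change the value
 * the child team inherits anyway, so the ICV copy can be skipped entirely.
 * The struct and helper names here are hypothetical simplifications.
 */
typedef struct { int *nth; int used; } nested_nth_list_t;

static int pick_nested_nthreads( const nested_nth_list_t *list, int level,
                                 int inherited_nproc )
{
    if ( level + 1 < list->used && list->nth[level + 1] != inherited_nproc )
        return list->nth[level + 1];    /* a deeper level overrides nproc */
    return 0;                           /* 0 means "don't update" */
}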
- // - if ( ( level + 1 < __kmp_nested_proc_bind.used ) - && ( __kmp_nested_proc_bind.bind_types[level + 1] != proc_bind_icv ) ) { - proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; - } - else { - proc_bind_icv = proc_bind_default; + if ((level+1 < __kmp_nested_proc_bind.used) + && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) { + proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1]; } } - // // Reset for next parallel region - // master_th->th.th_set_proc_bind = proc_bind_default; #endif /* OMP_40_ENABLED */ - if ( ( nthreads_icv > 0 ) + if ((nthreads_icv > 0) #if OMP_40_ENABLED - || ( proc_bind_icv != proc_bind_default ) + || (proc_bind_icv != proc_bind_default) #endif /* OMP_40_ENABLED */ - ) - { + ) { kmp_internal_control_t new_icvs; - copy_icvs( & new_icvs, & master_th->th.th_current_task->td_icvs ); + copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); new_icvs.next = NULL; - - if ( nthreads_icv > 0 ) { + if (nthreads_icv > 0) { new_icvs.nproc = nthreads_icv; } #if OMP_40_ENABLED - if ( proc_bind_icv != proc_bind_default ) { + if (proc_bind_icv != proc_bind_default) { new_icvs.proc_bind = proc_bind_icv; } #endif /* OMP_40_ENABLED */ @@ -2746,47 +1757,31 @@ __kmp_fork_call( KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) ); team = __kmp_allocate_team(root, nthreads, nthreads, #if OMP_40_ENABLED - proc_bind, + proc_bind, #endif - &new_icvs, argc ); - } else -#endif /* OMP_30_ENABLED */ - { + &new_icvs, argc USE_NESTED_HOT_ARG(master_th) ); + } else { /* allocate a new parallel team */ KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) ); team = __kmp_allocate_team(root, nthreads, nthreads, #if OMP_40_ENABLED - proc_bind, + proc_bind, #endif -#if OMP_30_ENABLED - &master_th->th.th_current_task->td_icvs, -#else - parent_team->t.t_set_nproc[master_tid], - parent_team->t.t_set_dynamic[master_tid], - parent_team->t.t_set_nested[master_tid], - parent_team->t.t_set_blocktime[master_tid], - parent_team->t.t_set_bt_intervals[master_tid], - parent_team->t.t_set_bt_set[master_tid], -#endif // OMP_30_ENABLED - argc ); + &master_th->th.th_current_task->td_icvs, argc + USE_NESTED_HOT_ARG(master_th) ); } - - KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", - team ) ); + KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) ); /* setup the new team */ team->t.t_master_tid = master_tid; team->t.t_master_this_cons = master_this_cons; - team->t.t_master_last_cons = master_last_cons; - + team->t.t_ident = loc; team->t.t_parent = parent_team; TCW_SYNC_PTR(team->t.t_pkfn, microtask); team->t.t_invoke = invoker; /* TODO move this to root, maybe */ - team->t.t_ident = loc; -#if OMP_30_ENABLED // TODO: parent_team->t.t_level == INT_MAX ??? 
#if OMP_40_ENABLED - if ( !master_th->th.th_team_microtask || level > teams_level ) { + if ( !master_th->th.th_teams_microtask || level > teams_level ) { #endif /* OMP_40_ENABLED */ team->t.t_level = parent_team->t.t_level + 1; team->t.t_active_level = parent_team->t.t_active_level + 1; @@ -2797,33 +1792,22 @@ __kmp_fork_call( team->t.t_active_level = parent_team->t.t_active_level; } #endif /* OMP_40_ENABLED */ - team->t.t_sched = get__sched_2( parent_team, master_tid ); // set master's schedule as new run-time schedule + team->t.t_sched = get__sched_2(parent_team, master_tid); // set master's schedule as new run-time schedule -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - if ( __kmp_inherit_fp_control ) { - __kmp_store_x87_fpu_control_word( &team->t.t_x87_fpu_control_word ); - __kmp_store_mxcsr( &team->t.t_mxcsr ); - team->t.t_mxcsr &= KMP_X86_MXCSR_MASK; - team->t.t_fp_control_saved = TRUE; - } - else { - team->t.t_fp_control_saved = FALSE; - } -#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + // Update the floating point rounding in the team if required. + propagateFPControl(team); if ( __kmp_tasking_mode != tskm_immediate_exec ) { - // - // Set the master thread's task team to the team's task team. - // Unless this is the hot team, it should be NULL. - // + // Set master's task team to team's task team. Unless this is hot team, it should be NULL. KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team ); KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n", - __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, - parent_team, team->t.t_task_team, team ) ); + __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, + parent_team, team->t.t_task_team, team ) ); master_th->th.th_task_team = team->t.t_task_team; - KMP_DEBUG_ASSERT( ( master_th->th.th_task_team == NULL ) || ( team == root->r.r_hot_team ) ) ; +#if !KMP_NESTED_HOT_TEAMS + KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team)); +#endif } -#endif // OMP_30_ENABLED KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc )); @@ -2833,12 +1817,12 @@ __kmp_fork_call( KMP_MB(); /* now, setup the arguments */ - argv = (void**) team -> t.t_argv; + argv = (void**)team->t.t_argv; #if OMP_40_ENABLED if ( ap ) { #endif /* OMP_40_ENABLED */ - for( i=argc-1; i >= 0; --i ) -/* TODO: revert workaround for Intel(R) 64 tracker #96 */ + for ( i=argc-1; i >= 0; --i ) +// TODO: revert workaround for Intel(R) 64 tracker #96 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM) && KMP_OS_LINUX *argv++ = va_arg( *ap, void * ); #else @@ -2846,31 +1830,19 @@ __kmp_fork_call( #endif #if OMP_40_ENABLED } else { - for( i=0; i < argc; ++i ) + for ( i=0; i < argc; ++i ) // Get args from parent team for teams construct argv[i] = team->t.t_parent->t.t_argv[i]; } #endif /* OMP_40_ENABLED */ /* now actually fork the threads */ - team->t.t_master_active = master_active; - if (!root -> r.r_active) /* Only do the assignment if it makes a difference to prevent cache ping-pong */ - root -> r.r_active = TRUE; + if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong + root->r.r_active = TRUE; __kmp_fork_team_threads( root, team, master_th, gtid ); - __kmp_setup_icv_copy(team, nthreads -#if OMP_30_ENABLED - , &master_th->th.th_current_task->td_icvs, loc -#else - , parent_team->t.t_set_nproc[master_tid], - 
parent_team->t.t_set_dynamic[master_tid], - parent_team->t.t_set_nested[master_tid], - parent_team->t.t_set_blocktime[master_tid], - parent_team->t.t_set_bt_intervals[master_tid], - parent_team->t.t_set_bt_set[master_tid] -#endif /* OMP_30_ENABLED */ - ); + __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc ); __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); @@ -2878,20 +1850,30 @@ __kmp_fork_call( #if USE_ITT_BUILD // Mark start of "parallel" region for VTune. Only use one of frame notification scheme at the moment. - if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) + if ((__itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) || KMP_ITT_DEBUG) # if OMP_40_ENABLED - if ( !master_th->th.th_team_microtask || microtask == (microtask_t)__kmp_teams_master ) - // Either not in teams or the outer fork of the teams construct + if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master) + // Either not in teams or the outer fork of the teams construct # endif /* OMP_40_ENABLED */ - __kmp_itt_region_forking( gtid ); -#endif /* USE_ITT_BUILD */ + { + __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); + } + kmp_uint64 tmp_time = 0; +#if USE_ITT_NOTIFY + if ( __itt_get_timestamp_ptr ) + tmp_time = __itt_get_timestamp(); +#endif + if ((__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode==3)|| KMP_ITT_DEBUG) +# if OMP_40_ENABLED + if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master) + // Either not in teams or the outer fork of the teams construct +# endif /* OMP_40_ENABLED */ + team->t.t_region_time = tmp_time; -#if USE_ITT_BUILD && USE_ITT_NOTIFY && OMP_30_ENABLED // Internal fork - report frame begin - if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) - { - if( ! ( team->t.t_active_level > 1 ) ) { - master_th->th.th_frame_time = __itt_get_timestamp(); + if ((__kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3) && __itt_frame_submit_v3_ptr ) { + if (!(team->t.t_active_level > 1)) { + master_th->th.th_frame_time = tmp_time; } } #endif /* USE_ITT_BUILD */ @@ -2899,8 +1881,8 @@ __kmp_fork_call( /* now go on and do the work */ KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team ); KMP_MB(); - - KF_TRACE( 10, ( "__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid ) ); + KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", + root, team, master_th, gtid)); #if USE_ITT_BUILD if ( __itt_stack_caller_create_ptr ) { @@ -2913,10 +1895,11 @@ __kmp_fork_call( #endif /* OMP_40_ENABLED */ { __kmp_internal_fork( loc, gtid, team ); - KF_TRACE( 10, ( "__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid ) ); + KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n", + root, team, master_th, gtid)); } - if (! exec_master) { + if (call_context == fork_context_gnu) { KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid )); return TRUE; } @@ -2924,9 +1907,14 @@ __kmp_fork_call( /* Invoke microtask for MASTER thread */ KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn ) ); + } // END of timer KMP_fork_call block - if (! 
team->t.t_invoke( gtid )) { - KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" ); + { + //KMP_TIME_BLOCK(OMP_work); + KMP_TIME_BLOCK(USER_master_invoke); + if (! team->t.t_invoke( gtid )) { + KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" ); + } } KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, team->t.t_id, team->t.t_pkfn ) ); @@ -2937,7 +1925,6 @@ __kmp_fork_call( return TRUE; } - void __kmp_join_call(ident_t *loc, int gtid #if OMP_40_ENABLED @@ -2945,6 +1932,7 @@ __kmp_join_call(ident_t *loc, int gtid #endif /* OMP_40_ENABLED */ ) { + KMP_TIME_BLOCK(KMP_join_call); kmp_team_t *team; kmp_team_t *parent_team; kmp_info_t *master_th; @@ -2956,24 +1944,24 @@ __kmp_join_call(ident_t *loc, int gtid /* setup current data */ master_th = __kmp_threads[ gtid ]; - root = master_th -> th.th_root; - team = master_th -> th.th_team; + root = master_th->th.th_root; + team = master_th->th.th_team; parent_team = team->t.t_parent; master_th->th.th_ident = loc; -#if OMP_30_ENABLED && KMP_DEBUG +#if KMP_DEBUG if ( __kmp_tasking_mode != tskm_immediate_exec ) { KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n", __kmp_gtid_from_thread( master_th ), team, - team -> t.t_task_team, master_th->th.th_task_team) ); + team->t.t_task_team, master_th->th.th_task_team) ); KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team ); } -#endif // OMP_30_ENABLED +#endif if( team->t.t_serialized ) { #if OMP_40_ENABLED - if ( master_th->th.th_team_microtask ) { + if ( master_th->th.th_teams_microtask ) { // We are in teams construct int level = team->t.t_level; int tlevel = master_th->th.th_teams_level; @@ -3012,7 +2000,7 @@ __kmp_join_call(ident_t *loc, int gtid // Mark end of "parallel" region for VTune. Only use one of frame notification scheme at the moment. if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! 
__kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) # if OMP_40_ENABLED - if ( !master_th->th.th_team_microtask /* not in teams */ || + if ( !master_th->th.th_teams_microtask /* not in teams */ || ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) ) // Either not in teams or exiting teams region // (teams is a frame and no other frames inside the teams) @@ -3021,10 +2009,21 @@ __kmp_join_call(ident_t *loc, int gtid master_th->th.th_ident = loc; __kmp_itt_region_joined( gtid ); } + if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG ) +# if OMP_40_ENABLED + if ( !master_th->th.th_teams_microtask /* not in teams */ || + ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) ) + // Either not in teams or exiting teams region + // (teams is a frame and no other frames inside the teams) +# endif /* OMP_40_ENABLED */ + { + master_th->th.th_ident = loc; + __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time, 0, loc, master_th->th.th_team_nproc, 1 ); + } #endif /* USE_ITT_BUILD */ #if OMP_40_ENABLED - if ( master_th->th.th_team_microtask && + if ( master_th->th.th_teams_microtask && !exit_teams && team->t.t_pkfn != (microtask_t)__kmp_teams_master && team->t.t_level == master_th->th.th_teams_level + 1 ) { @@ -3038,9 +2037,9 @@ __kmp_join_call(ident_t *loc, int gtid KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel ); /* Restore number of threads in the team if needed */ - if ( master_th->th.th_team_nproc < master_th->th.th_set_nth_teams ) { + if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) { int old_num = master_th->th.th_team_nproc; - int new_num = master_th->th.th_set_nth_teams; + int new_num = master_th->th.th_teams_size.nth; kmp_info_t **other_threads = team->t.t_threads; team->t.t_nproc = new_num; for ( i = 0; i < old_num; ++i ) { @@ -3051,8 +2050,9 @@ __kmp_join_call(ident_t *loc, int gtid // Re-initialize thread's barrier data. 
int b; kmp_balign_t * balign = other_threads[i]->th.th_bar; - for ( b = 0; b < bp_last_bar; ++ b ) { + for ( b = 0; b < bs_last_barrier; ++ b ) { balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived; + KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); } // Synchronize thread's task state other_threads[i]->th.th_task_state = master_th->th.th_task_state; @@ -3061,13 +2061,13 @@ __kmp_join_call(ident_t *loc, int gtid return; } #endif /* OMP_40_ENABLED */ + /* do cleanup and restore the parent team */ - master_th -> th.th_info .ds.ds_tid = team -> t.t_master_tid; - master_th -> th.th_local.this_construct = team -> t.t_master_this_cons; - master_th -> th.th_local.last_construct = team -> t.t_master_last_cons; + master_th->th.th_info .ds.ds_tid = team->t.t_master_tid; + master_th->th.th_local.this_construct = team->t.t_master_this_cons; - master_th -> th.th_dispatch = - & parent_team -> t.t_dispatch[ team -> t.t_master_tid ]; + master_th->th.th_dispatch = + & parent_team->t.t_dispatch[ team->t.t_master_tid ]; /* jc: The following lock has instructions with REL and ACQ semantics, separating the parallel user code called in this parallel region @@ -3076,7 +2076,7 @@ __kmp_join_call(ident_t *loc, int gtid __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock ); #if OMP_40_ENABLED - if ( !master_th->th.th_team_microtask || team->t.t_level > master_th->th.th_teams_level ) + if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level ) #endif /* OMP_40_ENABLED */ { /* Decrement our nested depth level */ @@ -3084,32 +2084,24 @@ __kmp_join_call(ident_t *loc, int gtid } KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 ); - #if OMP_30_ENABLED KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, master_th, team ) ); __kmp_pop_current_task_from_thread( master_th ); - #endif // OMP_30_ENABLED #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED // // Restore master thread's partition. // - master_th -> th.th_first_place = team -> t.t_first_place; - master_th -> th.th_last_place = team -> t.t_last_place; + master_th->th.th_first_place = team->t.t_first_place; + master_th->th.th_last_place = team->t.t_last_place; #endif /* OMP_40_ENABLED */ -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) { - __kmp_clear_x87_fpu_status_word(); - __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word ); - __kmp_load_mxcsr( &team->t.t_mxcsr ); - } -#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + updateHWFPControl (team); - if ( root -> r.r_active != master_active ) - root -> r.r_active = master_active; + if ( root->r.r_active != master_active ) + root->r.r_active = master_active; - __kmp_free_team( root, team ); /* this will free worker threads */ + __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads /* this race was fun to find. make sure the following is in the critical * region otherwise assertions may fail occasiounally since the old team @@ -3117,39 +2109,35 @@ __kmp_join_call(ident_t *loc, int gtid * actually safe to run and won't cause any bugs, but will cause thoose * assertion failures. 
it's only one deref&assign so might as well put this * in the critical region */ - master_th -> th.th_team = parent_team; - master_th -> th.th_team_nproc = parent_team -> t.t_nproc; - master_th -> th.th_team_master = parent_team -> t.t_threads[0]; - master_th -> th.th_team_serialized = parent_team -> t.t_serialized; + master_th->th.th_team = parent_team; + master_th->th.th_team_nproc = parent_team->t.t_nproc; + master_th->th.th_team_master = parent_team->t.t_threads[0]; + master_th->th.th_team_serialized = parent_team->t.t_serialized; /* restore serialized team, if need be */ - if( parent_team -> t.t_serialized && + if( parent_team->t.t_serialized && parent_team != master_th->th.th_serial_team && parent_team != root->r.r_root_team ) { - __kmp_free_team( root, master_th -> th.th_serial_team ); - master_th -> th.th_serial_team = parent_team; + __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) ); + master_th->th.th_serial_team = parent_team; } -#if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { // // Copy the task team from the new child / old parent team // to the thread. If non-NULL, copy the state flag also. // - if ( ( master_th -> th.th_task_team = parent_team -> t.t_task_team ) != NULL ) { - master_th -> th.th_task_state = master_th -> th.th_task_team -> tt.tt_state; + if ( ( master_th->th.th_task_team = parent_team->t.t_task_team ) != NULL ) { + master_th->th.th_task_state = master_th->th.th_task_team->tt.tt_state; } KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) ); } -#endif /* OMP_30_ENABLED */ - #if OMP_30_ENABLED - // TODO: GEH - cannot do this assertion because root thread not set up as executing - // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); - master_th->th.th_current_task->td_flags.executing = 1; - #endif // OMP_30_ENABLED + // TODO: GEH - cannot do this assertion because root thread not set up as executing + // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); + master_th->th.th_current_task->td_flags.executing = 1; __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); @@ -3166,38 +2154,29 @@ void __kmp_save_internal_controls ( kmp_info_t * thread ) { - if ( thread -> th.th_team != thread -> th.th_serial_team ) { + if ( thread->th.th_team != thread->th.th_serial_team ) { return; } - if (thread -> th.th_team -> t.t_serialized > 1) { + if (thread->th.th_team->t.t_serialized > 1) { int push = 0; - if (thread -> th.th_team -> t.t_control_stack_top == NULL) { + if (thread->th.th_team->t.t_control_stack_top == NULL) { push = 1; } else { - if ( thread -> th.th_team -> t.t_control_stack_top -> serial_nesting_level != - thread -> th.th_team -> t.t_serialized ) { + if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level != + thread->th.th_team->t.t_serialized ) { push = 1; } } if (push) { /* push a record on the serial team's stack */ kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t)); -#if OMP_30_ENABLED copy_icvs( control, & thread->th.th_current_task->td_icvs ); -#else - control->nproc = thread->th.th_team->t.t_set_nproc[0]; - control->dynamic = thread->th.th_team->t.t_set_dynamic[0]; - control->nested = thread->th.th_team->t.t_set_nested[0]; - control->blocktime = thread->th.th_team->t.t_set_blocktime[0]; - control->bt_intervals = thread->th.th_team->t.t_set_bt_intervals[0]; - control->bt_set = 
thread->th.th_team->t.t_set_bt_set[0]; -#endif // OMP_30_ENABLED control->serial_nesting_level = thread->th.th_team->t.t_serialized; - control->next = thread -> th.th_team -> t.t_control_stack_top; - thread -> th.th_team -> t.t_control_stack_top = control; + control->next = thread->th.th_team->t.t_control_stack_top; + thread->th.th_team->t.t_control_stack_top = control; } } } @@ -3230,14 +2209,17 @@ __kmp_set_num_threads( int new_nth, int gtid ) // root = thread->th.th_root; if ( __kmp_init_parallel && ( ! root->r.r_active ) - && ( root->r.r_hot_team->t.t_nproc > new_nth ) ) { + && ( root->r.r_hot_team->t.t_nproc > new_nth ) +#if KMP_NESTED_HOT_TEAMS + && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode +#endif + ) { kmp_team_t *hot_team = root->r.r_hot_team; int f; __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock ); -#if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { kmp_task_team_t *task_team = hot_team->t.t_task_team; if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) { @@ -3259,7 +2241,6 @@ __kmp_set_num_threads( int new_nth, int gtid ) KMP_DEBUG_ASSERT( task_team == NULL ); } } -#endif // OMP_30_ENABLED // // Release the extra threads we don't need any more. @@ -3270,6 +2251,12 @@ __kmp_set_num_threads( int new_nth, int gtid ) hot_team->t.t_threads[f] = NULL; } hot_team->t.t_nproc = new_nth; +#if KMP_NESTED_HOT_TEAMS + if( thread->th.th_hot_teams ) { + KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team ); + thread->th.th_hot_teams[0].hot_team_nth = new_nth; + } +#endif __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock ); @@ -3283,13 +2270,12 @@ __kmp_set_num_threads( int new_nth, int gtid ) } #if KMP_MIC // Special flag in case omp_set_num_threads() call - hot_team -> t.t_size_changed = -1; + hot_team->t.t_size_changed = -1; #endif } } -#if OMP_30_ENABLED /* Changes max_active_levels */ void __kmp_set_max_active_levels( int gtid, int max_active_levels ) @@ -3338,10 +2324,10 @@ __kmp_get_max_active_levels( int gtid ) KMP_DEBUG_ASSERT( __kmp_init_serial ); thread = __kmp_threads[ gtid ]; - KMP_DEBUG_ASSERT( thread -> th.th_current_task ); + KMP_DEBUG_ASSERT( thread->th.th_current_task ); KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n", - gtid, thread -> th.th_current_task, thread -> th.th_current_task -> td_icvs.max_active_levels ) ); - return thread -> th.th_current_task -> td_icvs.max_active_levels; + gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) ); + return thread->th.th_current_task->td_icvs.max_active_levels; } /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ @@ -3380,20 +2366,20 @@ __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk ) if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) { // differ static chunked vs. 
unchunked: // chunk should be invalid to indicate unchunked schedule (which is the default) - thread -> th.th_current_task -> td_icvs.sched.r_sched_type = kmp_sch_static; + thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; } else { - thread -> th.th_current_task -> td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ]; + thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ]; } } else { // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ]; - thread -> th.th_current_task -> td_icvs.sched.r_sched_type = + thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ]; } if ( kind == kmp_sched_auto ) { // ignore parameter chunk for schedule auto - thread -> th.th_current_task -> td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; + thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; } else { - thread -> th.th_current_task -> td_icvs.sched.chunk = chunk; + thread->th.th_current_task->td_icvs.sched.chunk = chunk; } } @@ -3410,8 +2396,8 @@ __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk ) thread = __kmp_threads[ gtid ]; - //th_type = thread -> th.th_team -> t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type; - th_type = thread -> th.th_current_task -> td_icvs.sched.r_sched_type; + //th_type = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type; + th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; switch ( th_type ) { case kmp_sch_static: @@ -3446,8 +2432,8 @@ __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk ) KMP_FATAL( UnknownSchedulingType, th_type ); } - //*chunk = thread -> th.th_team -> t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk; - *chunk = thread -> th.th_current_task -> td_icvs.sched.chunk; + //*chunk = thread->th.th_team->t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk; + *chunk = thread->th.th_current_task->td_icvs.sched.chunk; } int @@ -3465,11 +2451,11 @@ __kmp_get_ancestor_thread_num( int gtid, int level ) { if( level < 0 ) return -1; thr = __kmp_threads[ gtid ]; team = thr->th.th_team; - ii = team -> t.t_level; + ii = team->t.t_level; if( level > ii ) return -1; #if OMP_40_ENABLED - if( thr->th.th_team_microtask ) { + if( thr->th.th_teams_microtask ) { // AC: we are in teams region where multiple nested teams have same level int tlevel = thr->th.th_teams_level; // the level of the teams construct if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams) @@ -3486,25 +2472,25 @@ __kmp_get_ancestor_thread_num( int gtid, int level ) { if( ii == level ) return __kmp_tid_from_gtid( gtid ); - dd = team -> t.t_serialized; + dd = team->t.t_serialized; level++; while( ii > level ) { - for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- ) + for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- ) { } - if( ( team -> t.t_serialized ) && ( !dd ) ) { + if( ( team->t.t_serialized ) && ( !dd ) ) { team = team->t.t_parent; continue; } if( ii > level ) { team = team->t.t_parent; - dd = team -> t.t_serialized; + dd = team->t.t_serialized; ii--; } } - return ( dd > 1 ) ? ( 0 ) : ( team -> t.t_master_tid ); + return ( dd > 1 ) ? 
( 0 ) : ( team->t.t_master_tid ); } int @@ -3522,11 +2508,11 @@ __kmp_get_team_size( int gtid, int level ) { if( level < 0 ) return -1; thr = __kmp_threads[ gtid ]; team = thr->th.th_team; - ii = team -> t.t_level; + ii = team->t.t_level; if( level > ii ) return -1; #if OMP_40_ENABLED - if( thr->th.th_team_microtask ) { + if( thr->th.th_teams_microtask ) { // AC: we are in teams region where multiple nested teams have same level int tlevel = thr->th.th_teams_level; // the level of the teams construct if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams) @@ -3543,10 +2529,10 @@ __kmp_get_team_size( int gtid, int level ) { while( ii > level ) { - for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- ) + for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- ) { } - if( team -> t.t_serialized && ( !dd ) ) { + if( team->t.t_serialized && ( !dd ) ) { team = team->t.t_parent; continue; } @@ -3556,11 +2542,9 @@ __kmp_get_team_size( int gtid, int level ) { } } - return team -> t.t_nproc; + return team->t.t_nproc; } -#endif // OMP_30_ENABLED - kmp_r_sched_t __kmp_get_schedule_global() { // This routine created because pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided) @@ -3601,21 +2585,20 @@ __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ) { KMP_DEBUG_ASSERT( team ); - if( !realloc || argc > team -> t.t_max_argc ) { + if( !realloc || argc > team->t.t_max_argc ) { KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n", team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 )); -#if (KMP_PERF_V106 == KMP_ON) /* if previously allocated heap space for args, free them */ - if ( realloc && team -> t.t_argv != &team -> t.t_inline_argv[0] ) - __kmp_free( (void *) team -> t.t_argv ); + if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] ) + __kmp_free( (void *) team->t.t_argv ); if ( argc <= KMP_INLINE_ARGV_ENTRIES ) { /* use unused space in the cache line for arguments */ - team -> t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; + team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n", team->t.t_id, team->t.t_max_argc )); - team -> t.t_argv = &team -> t.t_inline_argv[0]; + team->t.t_argv = &team->t.t_inline_argv[0]; if ( __kmp_storage_map ) { __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0], &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], @@ -3625,31 +2608,17 @@ __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ) } } else { /* allocate space for arguments in the heap */ - team -> t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ? + team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ? KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc; KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n", team->t.t_id, team->t.t_max_argc )); - team -> t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc ); + team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc ); if ( __kmp_storage_map ) { __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc], sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv", team->t.t_id ); } } -#else /* KMP_PERF_V106 == KMP_OFF */ - if ( realloc ) - __kmp_free( (void*) team -> t.t_argv ); - team -> t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ? 
- KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc; - KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n", - team->t.t_id, team->t.t_max_argc )); - team -> t.t_argv = __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc ); - if ( __kmp_storage_map ) { - __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc], - sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv", team->t.t_id ); - } -#endif /* KMP_PERF_V106 */ - } } @@ -3663,55 +2632,41 @@ __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) char *ptr = __kmp_allocate(max_nth * ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buf + sizeof(kmp_disp_t) + sizeof(int)*6 -# if OMP_30_ENABLED //+ sizeof(int) + sizeof(kmp_r_sched_t) - + sizeof(kmp_taskdata_t) -# endif // OMP_30_ENABLED - ) ); + + sizeof(kmp_taskdata_t) ) ); - team -> t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth; - team -> t.t_disp_buffer = (dispatch_shared_info_t*) ptr; + team->t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth; + team->t.t_disp_buffer = (dispatch_shared_info_t*) ptr; ptr += sizeof(dispatch_shared_info_t) * num_disp_buff; - team -> t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth; - team -> t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth; - team -> t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth; - team -> t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth; - team -> t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth; - team -> t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth; - team -> t.t_set_bt_set = (int*) ptr; -# if OMP_30_ENABLED + team->t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth; + team->t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth; + team->t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth; + team->t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth; + team->t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth; + team->t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth; + team->t.t_set_bt_set = (int*) ptr; ptr += sizeof(int) * max_nth; - //team -> t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth; - team -> t.t_set_sched = (kmp_r_sched_t*) ptr; + //team->t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth; + team->t.t_set_sched = (kmp_r_sched_t*) ptr; ptr += sizeof(kmp_r_sched_t) * max_nth; - team -> t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr; + team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr; ptr += sizeof(kmp_taskdata_t) * max_nth; -# endif // OMP_30_ENABLED #else - team -> t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth ); - team -> t.t_disp_buffer = (dispatch_shared_info_t*) + team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth ); + team->t.t_disp_buffer = (dispatch_shared_info_t*) __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff ); - team -> t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth ); - #if OMP_30_ENABLED - //team -> t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth ); - //team -> t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth ); - team -> t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth ); - #else - team -> t.t_set_nproc = (int*) __kmp_allocate( sizeof(int) * max_nth ); - team -> t.t_set_dynamic = (int*) __kmp_allocate( 
sizeof(int) * max_nth ); - team -> t.t_set_nested = (int*) __kmp_allocate( sizeof(int) * max_nth ); - team -> t.t_set_blocktime = (int*) __kmp_allocate( sizeof(int) * max_nth ); - team -> t.t_set_bt_intervals = (int*) __kmp_allocate( sizeof(int) * max_nth ); - team -> t.t_set_bt_set = (int*) __kmp_allocate( sizeof(int) * max_nth ); -# endif // OMP_30_ENABLED + team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth ); + //team->t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth ); + //team->t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth ); + team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth ); #endif team->t.t_max_nproc = max_nth; /* setup dispatch buffers */ for(i = 0 ; i < num_disp_buff; ++i) - team -> t.t_disp_buffer[i].buffer_index = i; + team->t.t_disp_buffer[i].buffer_index = i; } static void @@ -3728,34 +2683,16 @@ __kmp_free_team_arrays(kmp_team_t *team) { #if !KMP_USE_POOLED_ALLOC __kmp_free(team->t.t_disp_buffer); __kmp_free(team->t.t_dispatch); - #if OMP_30_ENABLED //__kmp_free(team->t.t_set_max_active_levels); //__kmp_free(team->t.t_set_sched); __kmp_free(team->t.t_implicit_task_taskdata); - #else - __kmp_free(team->t.t_set_nproc); - __kmp_free(team->t.t_set_dynamic); - __kmp_free(team->t.t_set_nested); - __kmp_free(team->t.t_set_blocktime); - __kmp_free(team->t.t_set_bt_intervals); - __kmp_free(team->t.t_set_bt_set); - # endif // OMP_30_ENABLED #endif team->t.t_threads = NULL; team->t.t_disp_buffer = NULL; team->t.t_dispatch = NULL; -#if OMP_30_ENABLED //team->t.t_set_sched = 0; //team->t.t_set_max_active_levels = 0; team->t.t_implicit_task_taskdata = 0; -#else - team->t.t_set_nproc = 0; - team->t.t_set_dynamic = 0; - team->t.t_set_nested = 0; - team->t.t_set_blocktime = 0; - team->t.t_set_bt_intervals = 0; - team->t.t_set_bt_set = 0; -#endif // OMP_30_ENABLED } static void @@ -3765,18 +2702,9 @@ __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { #if !KMP_USE_POOLED_ALLOC __kmp_free(team->t.t_disp_buffer); __kmp_free(team->t.t_dispatch); - #if OMP_30_ENABLED //__kmp_free(team->t.t_set_max_active_levels); //__kmp_free(team->t.t_set_sched); __kmp_free(team->t.t_implicit_task_taskdata); - #else - __kmp_free(team->t.t_set_nproc); - __kmp_free(team->t.t_set_dynamic); - __kmp_free(team->t.t_set_nested); - __kmp_free(team->t.t_set_blocktime); - __kmp_free(team->t.t_set_bt_intervals); - __kmp_free(team->t.t_set_bt_set); - # endif // OMP_30_ENABLED #endif __kmp_allocate_team_arrays(team, max_nth); @@ -3788,9 +2716,7 @@ __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { static kmp_internal_control_t __kmp_get_global_icvs( void ) { -#if OMP_30_ENABLED kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals -#endif /* OMP_30_ENABLED */ #if OMP_40_ENABLED KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 ); @@ -3798,18 +2724,15 @@ __kmp_get_global_icvs( void ) { kmp_internal_control_t g_icvs = { 0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field - __kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread) - __kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread) - __kmp_dflt_team_nth, - //int nproc; //internal control for # of threads for next parallel region (per thread) - // (use a max ub on value if __kmp_parallel_initialize not called yet) + (kmp_int8)__kmp_dflt_nested, //int nested; 
//internal control for nested parallelism (per thread) + (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread) + (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals - __kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set -#if OMP_30_ENABLED + __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread) + // (use a max ub on value if __kmp_parallel_initialize not called yet) __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair -#endif /* OMP_30_ENABLED */ #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0], #endif /* OMP_40_ENABLED */ @@ -3822,24 +2745,10 @@ __kmp_get_global_icvs( void ) { static kmp_internal_control_t __kmp_get_x_global_icvs( const kmp_team_t *team ) { - #if OMP_30_ENABLED kmp_internal_control_t gx_icvs; gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs ); gx_icvs.next = NULL; - #else - kmp_internal_control_t gx_icvs = - { - 0, - team->t.t_set_nested[0], - team->t.t_set_dynamic[0], - team->t.t_set_nproc[0], - team->t.t_set_blocktime[0], - team->t.t_set_bt_intervals[0], - team->t.t_set_bt_set[0], - NULL //struct kmp_internal_control *next; - }; - #endif // OMP_30_ENABLED return gx_icvs; } @@ -3852,20 +2761,18 @@ __kmp_initialize_root( kmp_root_t *root ) kmp_team_t *hot_team; size_t disp_size, dispatch_size, bar_size; int hot_team_max_nth; -#if OMP_30_ENABLED kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); -#endif // OMP_30_ENABLED KMP_DEBUG_ASSERT( root ); KMP_ASSERT( ! 
root->r.r_begin ); /* setup the root state structure */ __kmp_init_lock( &root->r.r_begin_lock ); - root -> r.r_begin = FALSE; - root -> r.r_active = FALSE; - root -> r.r_in_parallel = 0; - root -> r.r_blocktime = __kmp_dflt_blocktime; - root -> r.r_nested = __kmp_dflt_nested; + root->r.r_begin = FALSE; + root->r.r_active = FALSE; + root->r.r_in_parallel = 0; + root->r.r_blocktime = __kmp_dflt_blocktime; + root->r.r_nested = __kmp_dflt_nested; /* setup the root team for this task */ /* allocate the root team structure */ @@ -3878,33 +2785,23 @@ __kmp_initialize_root( kmp_root_t *root ) #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0], #endif -#if OMP_30_ENABLED &r_icvs, -#else - __kmp_dflt_team_nth_ub, // num_treads - __kmp_global.g.g_dynamic, // dynamic - __kmp_dflt_nested, // nested - __kmp_dflt_blocktime, // blocktime - __kmp_bt_intervals, // bt_intervals - __kmp_env_blocktime, // bt_set -#endif // OMP_30_ENABLED 0 // argc + USE_NESTED_HOT_ARG(NULL) // master thread is unknown ); KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) ); - root -> r.r_root_team = root_team; - root_team -> t.t_control_stack_top = NULL; + root->r.r_root_team = root_team; + root_team->t.t_control_stack_top = NULL; /* initialize root team */ - root_team -> t.t_threads[0] = NULL; - root_team -> t.t_nproc = 1; - root_team -> t.t_serialized = 1; -#if OMP_30_ENABLED - // TODO???: root_team -> t.t_max_active_levels = __kmp_dflt_max_active_levels; - root_team -> t.t_sched.r_sched_type = r_sched.r_sched_type; - root_team -> t.t_sched.chunk = r_sched.chunk; -#endif // OMP_30_ENABLED + root_team->t.t_threads[0] = NULL; + root_team->t.t_nproc = 1; + root_team->t.t_serialized = 1; + // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; + root_team->t.t_sched.r_sched_type = r_sched.r_sched_type; + root_team->t.t_sched.chunk = r_sched.chunk; KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE )); @@ -3919,39 +2816,29 @@ __kmp_initialize_root( kmp_root_t *root ) #if OMP_40_ENABLED __kmp_nested_proc_bind.bind_types[0], #endif -#if OMP_30_ENABLED &r_icvs, -#else - __kmp_dflt_team_nth_ub, // num_treads - __kmp_global.g.g_dynamic, // dynamic - __kmp_dflt_nested, // nested - __kmp_dflt_blocktime, // blocktime - __kmp_bt_intervals, // bt_intervals - __kmp_env_blocktime, // bt_set -#endif // OMP_30_ENABLED 0 // argc + USE_NESTED_HOT_ARG(NULL) // master thread is unknown ); KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) ); - root -> r.r_hot_team = hot_team; - root_team -> t.t_control_stack_top = NULL; + root->r.r_hot_team = hot_team; + root_team->t.t_control_stack_top = NULL; /* first-time initialization */ - hot_team -> t.t_parent = root_team; + hot_team->t.t_parent = root_team; /* initialize hot team */ hot_team_max_nth = hot_team->t.t_max_nproc; for ( f = 0; f < hot_team_max_nth; ++ f ) { - hot_team -> t.t_threads[ f ] = NULL; + hot_team->t.t_threads[ f ] = NULL; }; // for - hot_team -> t.t_nproc = 1; -#if OMP_30_ENABLED - // TODO???: hot_team -> t.t_max_active_levels = __kmp_dflt_max_active_levels; - hot_team -> t.t_sched.r_sched_type = r_sched.r_sched_type; - hot_team -> t.t_sched.chunk = r_sched.chunk; -#endif // OMP_30_ENABLED + hot_team->t.t_nproc = 1; + // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; + hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type; + hot_team->t.t_sched.chunk = r_sched.chunk; #if KMP_MIC 
- hot_team -> t.t_size_changed = 0; + hot_team->t.t_size_changed = 0; #endif } @@ -4041,7 +2928,7 @@ __kmp_print_structure_thread( }; // if } -static void +void __kmp_print_structure( void ) { @@ -4185,10 +3072,10 @@ static const unsigned __kmp_primes[] = { unsigned short __kmp_get_random( kmp_info_t * thread ) { - unsigned x = thread -> th.th_x; + unsigned x = thread->th.th_x; unsigned short r = x>>16; - thread -> th.th_x = x*thread->th.th_a+1; + thread->th.th_x = x*thread->th.th_a+1; KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", thread->th.th_info.ds.ds_tid, r) ); @@ -4203,9 +3090,9 @@ __kmp_init_random( kmp_info_t * thread ) { unsigned seed = thread->th.th_info.ds.ds_tid; - thread -> th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))]; - thread -> th.th_x = (seed+1)*thread->th.th_a+1; - KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread -> th.th_a) ); + thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))]; + thread->th.th_x = (seed+1)*thread->th.th_a+1; + KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) ); } @@ -4430,7 +3317,8 @@ __kmp_register_root( int initial_thread ) /* find an available thread slot */ /* Don't reassign the zero slot since we need that to only be used by initial thread */ - for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ ); + for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ ) + ; KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid )); KMP_ASSERT( gtid < __kmp_threads_capacity ); @@ -4475,69 +3363,61 @@ __kmp_register_root( int initial_thread ) __kmp_initialize_root( root ); /* setup new root thread structure */ - if( root -> r.r_uber_thread ) { - root_thread = root -> r.r_uber_thread; + if( root->r.r_uber_thread ) { + root_thread = root->r.r_uber_thread; } else { root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) ); if ( __kmp_storage_map ) { __kmp_print_thread_storage_map( root_thread, gtid ); } - root_thread -> th.th_info .ds.ds_gtid = gtid; - root_thread -> th.th_root = root; + root_thread->th.th_info .ds.ds_gtid = gtid; + root_thread->th.th_root = root; if( __kmp_env_consistency_check ) { - root_thread -> th.th_cons = __kmp_allocate_cons_stack( gtid ); + root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid ); } #if USE_FAST_MEMORY __kmp_initialize_fast_memory( root_thread ); #endif /* USE_FAST_MEMORY */ #if KMP_USE_BGET - KMP_DEBUG_ASSERT( root_thread -> th.th_local.bget_data == NULL ); + KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL ); __kmp_initialize_bget( root_thread ); #endif __kmp_init_random( root_thread ); // Initialize random number generator } /* setup the serial team held in reserve by the root thread */ - if( ! root_thread -> th.th_serial_team ) { - #if OMP_30_ENABLED - kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); - #endif // OMP_30_ENABLED + if( ! 
root_thread->th.th_serial_team ) { + kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) ); - root_thread -> th.th_serial_team = __kmp_allocate_team( root, 1, 1, + root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1, #if OMP_40_ENABLED proc_bind_default, #endif -#if OMP_30_ENABLED &r_icvs, -#else - __kmp_dflt_team_nth_ub, - __kmp_global.g.g_dynamic, - __kmp_dflt_nested, - __kmp_dflt_blocktime, - __kmp_bt_intervals, - __kmp_env_blocktime, -#endif // OMP_30_ENABLED - 0 ); - } - KMP_ASSERT( root_thread -> th.th_serial_team ); + 0 USE_NESTED_HOT_ARG(NULL) ); + } + KMP_ASSERT( root_thread->th.th_serial_team ); KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n", - root_thread -> th.th_serial_team ) ); + root_thread->th.th_serial_team ) ); /* drop root_thread into place */ TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); - root -> r.r_root_team -> t.t_threads[0] = root_thread; - root -> r.r_hot_team -> t.t_threads[0] = root_thread; - root_thread -> th.th_serial_team -> t.t_threads[0] = root_thread; - root_thread -> th.th_serial_team -> t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). - root -> r.r_uber_thread = root_thread; + root->r.r_root_team->t.t_threads[0] = root_thread; + root->r.r_hot_team ->t.t_threads[0] = root_thread; + root_thread->th.th_serial_team->t.t_threads[0] = root_thread; + root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). + root->r.r_uber_thread = root_thread; /* initialize the thread, get it ready to go */ __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid ); /* prepare the master thread for get_gtid() */ __kmp_gtid_set_specific( gtid ); + + __kmp_itt_thread_name( gtid ); + #ifdef KMP_TDATA_GTID __kmp_gtid = gtid; #endif @@ -4547,7 +3427,7 @@ __kmp_register_root( int initial_thread ) KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n", gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ), - root -> r.r_hot_team -> t.t_id, 0, KMP_INIT_BARRIER_STATE, + root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ) ); { // Initialize barrier data. int b; @@ -4572,6 +3452,34 @@ __kmp_register_root( int initial_thread ) return gtid; } +#if KMP_NESTED_HOT_TEAMS +static int +__kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level ) +{ + int i, n, nth; + kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; + if( !hot_teams || !hot_teams[level].hot_team ) { + return 0; + } + KMP_DEBUG_ASSERT( level < max_level ); + kmp_team_t *team = hot_teams[level].hot_team; + nth = hot_teams[level].hot_team_nth; + n = nth - 1; // master is not freed + if( level < max_level - 1 ) { + for( i = 0; i < nth; ++i ) { + kmp_info_t *th = team->t.t_threads[i]; + n += __kmp_free_hot_teams( root, th, level + 1, max_level ); + if( i > 0 && th->th.th_hot_teams ) { + __kmp_free( th->th.th_hot_teams ); + th->th.th_hot_teams = NULL; + } + } + } + __kmp_free_team( root, team, NULL ); + return n; +} +#endif + /* Resets a root thread and clear its root and hot teams. Returns the number of __kmp_threads entries directly and indirectly freed. */ @@ -4589,10 +3497,21 @@ __kmp_reset_root(int gtid, kmp_root_t *root) root->r.r_hot_team = NULL; // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call // to __kmp_free_team(). 
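
The new __kmp_free_hot_teams above releases the cached ("hot") nested teams depth-first: each level's team is freed, the workers' per-thread hot-team arrays are discarded, and the count of released worker slots is returned (masters are not counted, since they remain members of the enclosing team). A toy, self-contained version of that walk; the thread/team structs are invented stand-ins for kmp_info_t and kmp_team_t, and plain free() stands in for __kmp_free_team.

#include <stdio.h>
#include <stdlib.h>

struct team;

/* A thread that may cache one "hot" team per nesting level. */
typedef struct thread {
    struct team **hot_teams;    /* hot_teams[level], or NULL if nothing cached */
} thread_t;

typedef struct team {
    int             nth;        /* number of threads in the cached team */
    struct thread **threads;    /* threads[0] is the team's master */
} team_t;

/* Depth-first release of the hot teams cached under `thr`, starting at `level`.
   Returns how many worker slots were released; masters are not counted because
   they continue to exist in the enclosing team. */
static int free_hot_teams(thread_t *thr, int level, int max_level)
{
    if (!thr->hot_teams || !thr->hot_teams[level])
        return 0;
    team_t *team = thr->hot_teams[level];
    int n = team->nth - 1;                       /* master is not freed */
    if (level < max_level - 1) {
        for (int i = 0; i < team->nth; ++i) {
            thread_t *th = team->threads[i];
            n += free_hot_teams(th, level + 1, max_level);
            if (i > 0 && th->hot_teams) {        /* workers drop their cache arrays */
                free(th->hot_teams);
                th->hot_teams = NULL;
            }
        }
    }
    free(team->threads);
    free(team);
    thr->hot_teams[level] = NULL;
    return n;
}

int main(void)
{
    /* One master caching a single hot team of three threads (max nesting depth 2). */
    thread_t *members[3];
    for (int i = 0; i < 3; ++i)
        members[i] = calloc(1, sizeof(thread_t));

    team_t *team = malloc(sizeof(team_t));
    team->nth = 3;
    team->threads = malloc(3 * sizeof(thread_t *));
    for (int i = 0; i < 3; ++i)
        team->threads[i] = members[i];

    thread_t master = { calloc(2, sizeof(team_t *)) };
    master.hot_teams[0] = team;

    printf("released %d worker slots\n", free_hot_teams(&master, 0, 2));  /* prints 2 */

    for (int i = 0; i < 3; ++i)
        free(members[i]);
    free(master.hot_teams);
    return 0;
}
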
- __kmp_free_team( root, root_team ); - __kmp_free_team( root, hot_team ); + __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) ); +#if KMP_NESTED_HOT_TEAMS + if( __kmp_hot_teams_max_level > 1 ) { // need to free nested hot teams and their threads if any + for( i = 0; i < hot_team->t.t_nproc; ++i ) { + kmp_info_t *th = hot_team->t.t_threads[i]; + n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level ); + if( th->th.th_hot_teams ) { + __kmp_free( th->th.th_hot_teams ); + th->th.th_hot_teams = NULL; + } + } + } +#endif + __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) ); -#if OMP_30_ENABLED // // Before we can reap the thread, we need to make certain that all // other threads in the teams that had this root as ancestor have stopped trying to steal tasks. @@ -4600,7 +3519,6 @@ __kmp_reset_root(int gtid, kmp_root_t *root) if ( __kmp_tasking_mode != tskm_immediate_exec ) { __kmp_wait_to_unref_task_teams(); } -#endif /* OMP_30_ENABLED */ #if KMP_OS_WINDOWS /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ @@ -4616,7 +3534,7 @@ __kmp_reset_root(int gtid, kmp_root_t *root) // We canot put root thread to __kmp_thread_pool, so we have to reap it istead of freeing. root->r.r_uber_thread = NULL; /* mark root as no longer in use */ - root -> r.r_begin = FALSE; + root->r.r_begin = FALSE; return n; } @@ -4625,10 +3543,9 @@ void __kmp_unregister_root_current_thread( int gtid ) { KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid )); - /* This lock should be OK, since unregister_root_current_thread is never called during - * an abort, only during a normal close. Furthermore, if you have the - * forkjoin lock, you should never try to get the initz lock. - */ + /* this lock should be ok, since unregister_root_current_thread is never called during + * and abort, only during a normal close. 
furthermore, if you have the + * forkjoin lock, you should never try to get the initz lock */ __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock ); if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) { @@ -4681,24 +3598,20 @@ __kmp_unregister_root_other_thread( int gtid ) return r; } -#if OMP_30_ENABLED - #if KMP_DEBUG void __kmp_task_info() { kmp_int32 gtid = __kmp_entry_gtid(); kmp_int32 tid = __kmp_tid_from_gtid( gtid ); kmp_info_t *this_thr = __kmp_threads[ gtid ]; - kmp_team_t *steam = this_thr -> th.th_serial_team; - kmp_team_t *team = this_thr -> th.th_team; + kmp_team_t *steam = this_thr->th.th_serial_team; + kmp_team_t *team = this_thr->th.th_team; __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n", gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent ); } #endif // KMP_DEBUG -#endif // OMP_30_ENABLED - /* TODO optimize with one big memclr, take out what isn't needed, * split responsility to workers as much as possible, and delay * initialization of features as much as possible */ @@ -4707,14 +3620,14 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid { /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ - + kmp_info_t *master = team->t.t_threads[0]; KMP_DEBUG_ASSERT( this_thr != NULL ); - KMP_DEBUG_ASSERT( this_thr -> th.th_serial_team ); + KMP_DEBUG_ASSERT( this_thr->th.th_serial_team ); KMP_DEBUG_ASSERT( team ); - KMP_DEBUG_ASSERT( team -> t.t_threads ); - KMP_DEBUG_ASSERT( team -> t.t_dispatch ); - KMP_DEBUG_ASSERT( team -> t.t_threads[0] ); - KMP_DEBUG_ASSERT( team -> t.t_threads[0] -> th.th_root ); + KMP_DEBUG_ASSERT( team->t.t_threads ); + KMP_DEBUG_ASSERT( team->t.t_dispatch ); + KMP_DEBUG_ASSERT( master ); + KMP_DEBUG_ASSERT( master->th.th_root ); KMP_MB(); @@ -4728,21 +3641,15 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid this_thr->th.th_new_place = this_thr->th.th_current_place; # endif #endif - this_thr->th.th_root = team -> t.t_threads[0] -> th.th_root; + this_thr->th.th_root = master->th.th_root; /* setup the thread's cache of the team structure */ - this_thr->th.th_team_nproc = team -> t.t_nproc; - this_thr->th.th_team_master = team -> t.t_threads[0]; - this_thr->th.th_team_serialized = team -> t.t_serialized; -#if OMP_40_ENABLED - this_thr->th.th_team_microtask = team -> t.t_threads[0] -> th.th_team_microtask; - this_thr->th.th_teams_level = team -> t.t_threads[0] -> th.th_teams_level; - this_thr->th.th_set_nth_teams = team -> t.t_threads[0] -> th.th_set_nth_teams; -#endif /* OMP_40_ENABLED */ + this_thr->th.th_team_nproc = team->t.t_nproc; + this_thr->th.th_team_master = master; + this_thr->th.th_team_serialized = team->t.t_serialized; TCW_PTR(this_thr->th.th_sleep_loc, NULL); -#if OMP_30_ENABLED - KMP_DEBUG_ASSERT( team -> t.t_implicit_task_taskdata ); + KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata ); this_thr->th.th_task_state = 0; KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", @@ -4753,13 +3660,11 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", tid, gtid, this_thr, this_thr->th.th_current_task ) ); // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()? 
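
Throughout this patch the old load_icvs/store_icvs/sync_icvs sequences are collapsed into a single copy_icvs call (see __kmp_reinitialize_team and __kmp_save_internal_controls above); judging from its use sites it behaves like a plain structure copy of the internal control variables, with callers resetting the stack link afterwards where needed (as the fork path does with new_icvs.next = NULL). A reduced sketch of that idea; the field set below is an illustrative subset, not the real kmp_internal_control_t.

#include <stdio.h>

/* Illustrative subset of the per-task internal control variables (ICVs). */
typedef struct internal_control {
    int nproc;              /* team size for the next parallel region */
    int max_active_levels;
    int blocktime;
    struct internal_control *next;   /* stack link used when serialized teams push ICVs */
} internal_control_t;

/* Presumed behaviour of copy_icvs: a whole-struct copy; callers that push the
   result somewhere reset `next` themselves, as the fork path above does. */
static void copy_icvs(internal_control_t *dst, const internal_control_t *src)
{
    *dst = *src;
}

int main(void)
{
    internal_control_t parent = { 8, 4, 200, NULL };
    internal_control_t child;

    copy_icvs(&child, &parent);
    child.next  = NULL;     /* caller resets the link, mirroring new_icvs.next = NULL */
    child.nproc = 2;        /* the nested region overrides only what it needs */

    printf("parent nproc=%d, child nproc=%d\n", parent.nproc, child.nproc);  /* 8, 2 */
    return 0;
}
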
-#endif // OMP_30_ENABLED /* TODO no worksharing in speculative threads */ - this_thr -> th.th_dispatch = &team -> t.t_dispatch[ tid ]; + this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ]; this_thr->th.th_local.this_construct = 0; - this_thr->th.th_local.last_construct = 0; #ifdef BUILD_TV this_thr->th.th_local.tv_data = 0; @@ -4778,7 +3683,7 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid /* Initialize dynamic dispatch */ { - volatile kmp_disp_t *dispatch = this_thr -> th.th_dispatch; + volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; /* * Use team max_nproc since this will never change for the team. */ @@ -4786,13 +3691,13 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ); KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) ); KMP_ASSERT( dispatch ); - KMP_DEBUG_ASSERT( team -> t.t_dispatch ); + KMP_DEBUG_ASSERT( team->t.t_dispatch ); KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] ); dispatch->th_disp_index = 0; - if( ! dispatch -> th_disp_buffer ) { - dispatch -> th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size ); + if( ! dispatch->th_disp_buffer ) { + dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size ); if ( __kmp_storage_map ) { __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ], @@ -4802,14 +3707,14 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid gtid, team->t.t_id, gtid ); } } else { - memset( & dispatch -> th_disp_buffer[0], '\0', disp_size ); + memset( & dispatch->th_disp_buffer[0], '\0', disp_size ); } - dispatch -> th_dispatch_pr_current = 0; - dispatch -> th_dispatch_sh_current = 0; + dispatch->th_dispatch_pr_current = 0; + dispatch->th_dispatch_sh_current = 0; - dispatch -> th_deo_fcn = 0; /* ORDERED */ - dispatch -> th_dxo_fcn = 0; /* END ORDERED */ + dispatch->th_deo_fcn = 0; /* ORDERED */ + dispatch->th_dxo_fcn = 0; /* END ORDERED */ } this_thr->th.th_next_pool = NULL; @@ -4836,7 +3741,9 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() )); KMP_DEBUG_ASSERT( root && team ); +#if !KMP_NESTED_HOT_TEAMS KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() )); +#endif KMP_MB(); /* first, try to get one from the thread pool */ @@ -4857,7 +3764,7 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid )); - KMP_ASSERT( ! new_thr -> th.th_team ); + KMP_ASSERT( ! new_thr->th.th_team ); KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity ); KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 ); @@ -4877,6 +3784,14 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) } #endif /* KMP_ADJUST_BLOCKTIME */ +#if KMP_DEBUG + // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG. 
+ int b; + kmp_balign_t * balign = new_thr->th.th_bar; + for( b = 0; b < bs_last_barrier; ++ b ) + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); +#endif + KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid )); @@ -4933,30 +3848,19 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) /* add the reserve serialized team, initialized from the team's master thread */ { - #if OMP_30_ENABLED kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team ); - #endif // OMP_30_ENABLED KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) ); - new_thr -> th.th_serial_team = serial_team = + new_thr->th.th_serial_team = serial_team = (kmp_team_t*) __kmp_allocate_team( root, 1, 1, #if OMP_40_ENABLED proc_bind_default, #endif -#if OMP_30_ENABLED &r_icvs, -#else - team->t.t_set_nproc[0], - team->t.t_set_dynamic[0], - team->t.t_set_nested[0], - team->t.t_set_blocktime[0], - team->t.t_set_bt_intervals[0], - team->t.t_set_bt_set[0], -#endif // OMP_30_ENABLED - 0 ); + 0 USE_NESTED_HOT_ARG(NULL) ); } KMP_ASSERT ( serial_team ); - serial_team -> t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). - serial_team -> t.t_threads[0] = new_thr; + serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now). + serial_team->t.t_threads[0] = new_thr; KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", new_thr ) ); @@ -4968,7 +3872,7 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) #endif /* USE_FAST_MEMORY */ #if KMP_USE_BGET - KMP_DEBUG_ASSERT( new_thr -> th.th_local.bget_data == NULL ); + KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL ); __kmp_initialize_bget( new_thr ); #endif @@ -4978,11 +3882,14 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE )); - new_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE; - new_thr->th.th_bar[ bs_plain_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE; - #if KMP_FAST_REDUCTION_BARRIER - new_thr->th.th_bar[ bs_reduction_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE; - #endif // KMP_FAST_REDUCTION_BARRIER + int b; + kmp_balign_t * balign = new_thr->th.th_bar; + for(b=0; b<bs_last_barrier; ++b) { + balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; + balign[b].bb.team = NULL; + balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; + balign[b].bb.use_oncore_barrier = 0; + } new_thr->th.th_spin_here = FALSE; new_thr->th.th_next_waiting = 0; @@ -5051,109 +3958,23 @@ __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid ) * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ static void -__kmp_reinitialize_team( kmp_team_t *team, -#if OMP_30_ENABLED - kmp_internal_control_t *new_icvs, ident_t *loc -#else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set -#endif - ) { +__kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) { KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n", team->t.t_threads[0], team ) ); -#if OMP_30_ENABLED KMP_DEBUG_ASSERT( team && new_icvs); KMP_DEBUG_ASSERT( ( ! 
TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); team->t.t_ident = loc; -#else - KMP_DEBUG_ASSERT( team && new_set_nproc ); -#endif // OMP_30_ENABLED team->t.t_id = KMP_GEN_TEAM_ID(); // Copy ICVs to the master thread's implicit taskdata -#if OMP_30_ENABLED - load_icvs(new_icvs); __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE ); - store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); - sync_icvs(); -# else - team -> t.t_set_nproc[0] = new_set_nproc; - team -> t.t_set_dynamic[0] = new_set_dynamic; - team -> t.t_set_nested[0] = new_set_nested; - team -> t.t_set_blocktime[0] = new_set_blocktime; - team -> t.t_set_bt_intervals[0] = new_bt_intervals; - team -> t.t_set_bt_set[0] = new_bt_set; -# endif // OMP_30_ENABLED + copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n", team->t.t_threads[0], team ) ); } -static void -__kmp_setup_icv_copy(kmp_team_t * team, int new_nproc, -#if OMP_30_ENABLED - kmp_internal_control_t * new_icvs, - ident_t * loc -#else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set -#endif // OMP_30_ENABLED - ) -{ - int f; - -#if OMP_30_ENABLED - KMP_DEBUG_ASSERT( team && new_nproc && new_icvs ); - KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc ); -#else - KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc ); -#endif // OMP_30_ENABLED - - // Master thread's copy of the ICVs was set up on the implicit taskdata in __kmp_reinitialize_team. - // __kmp_fork_call() assumes the master thread's implicit task has this data before this function is called. -#if KMP_BARRIER_ICV_PULL - // Copy the ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where all of the - // worker threads can access them and make their own copies after the barrier. - load_icvs(new_icvs); - KMP_DEBUG_ASSERT(team->t.t_threads[0]); // the threads arrays should be allocated at this point - store_icvs(&team->t.t_threads[0]->th.th_fixed_icvs, new_icvs); - sync_icvs(); - KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); - -#elif KMP_BARRIER_ICV_PUSH - // The ICVs will be propagated in the fork barrier, so nothing needs to be done here. - KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0, team->t.t_threads[0], team)); - -#else - // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time. 
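The hunk above also replaces the old load_icvs/store_icvs/sync_icvs triplet with a single copy_icvs call, which presumes the internal control variables can be propagated by a plain structure copy; a sketch of the assumed helper (not the verbatim definition from the header):

    static inline void
    copy_icvs( kmp_internal_control_t *dst, kmp_internal_control_t *src )
    {
        *dst = *src;   /* one struct assignment replaces the per-field load/store/sync */
    }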
-# if OMP_30_ENABLED - load_icvs(new_icvs); -# endif // OMP_30_ENABLED - KMP_DEBUG_ASSERT(team->t.t_threads[0]); // the threads arrays should be allocated at this point - for(f=1 ; f<new_nproc ; f++) { // skip the master thread -# if OMP_30_ENABLED - // TODO: GEH - pass in better source location info since usually NULL here - KF_TRACE( 10, ( "__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", - f, team->t.t_threads[f], team ) ); - __kmp_init_implicit_task( loc, team->t.t_threads[f], team, f, FALSE ); - store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs); - KF_TRACE( 10, ( "__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n", - f, team->t.t_threads[f], team ) ); -# else - team -> t.t_set_nproc[f] = new_set_nproc; - team -> t.t_set_dynamic[f] = new_set_dynamic; - team -> t.t_set_nested[f] = new_set_nested; - team -> t.t_set_blocktime[f] = new_set_blocktime; - team -> t.t_set_bt_intervals[f] = new_bt_intervals; - team -> t.t_set_bt_set[f] = new_bt_set; -# endif // OMP_30_ENABLED - } -# if OMP_30_ENABLED - sync_icvs(); -# endif // OMP_30_ENABLED -#endif // KMP_BARRIER_ICV_PULL -} /* initialize the team data structure * this assumes the t_threads and t_max_nproc are already set @@ -5162,13 +3983,8 @@ static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, - #if OMP_30_ENABLED - kmp_internal_control_t * new_icvs, - ident_t * loc - #else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set - #endif // OMP_30_ENABLED + kmp_internal_control_t * new_icvs, + ident_t * loc ) { KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) ); @@ -5178,53 +3994,43 @@ __kmp_initialize_team( KMP_DEBUG_ASSERT( team->t.t_threads ); KMP_MB(); - team -> t.t_master_tid = 0; /* not needed */ - /* team -> t.t_master_bar; not needed */ - team -> t.t_serialized = new_nproc > 1 ? 0 : 1; - team -> t.t_nproc = new_nproc; + team->t.t_master_tid = 0; /* not needed */ + /* team->t.t_master_bar; not needed */ + team->t.t_serialized = new_nproc > 1 ? 
0 : 1; + team->t.t_nproc = new_nproc; - /* team -> t.t_parent = NULL; TODO not needed & would mess up hot team */ - team -> t.t_next_pool = NULL; - /* memset( team -> t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */ + /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ + team->t.t_next_pool = NULL; + /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */ TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ - team -> t.t_invoke = NULL; /* not needed */ + team->t.t_invoke = NULL; /* not needed */ -#if OMP_30_ENABLED - // TODO???: team -> t.t_max_active_levels = new_max_active_levels; - team -> t.t_sched = new_icvs->sched; -#endif // OMP_30_ENABLED + // TODO???: team->t.t_max_active_levels = new_max_active_levels; + team->t.t_sched = new_icvs->sched; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 - team -> t.t_fp_control_saved = FALSE; /* not needed */ - team -> t.t_x87_fpu_control_word = 0; /* not needed */ - team -> t.t_mxcsr = 0; /* not needed */ + team->t.t_fp_control_saved = FALSE; /* not needed */ + team->t.t_x87_fpu_control_word = 0; /* not needed */ + team->t.t_mxcsr = 0; /* not needed */ #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ - team -> t.t_construct = 0; - __kmp_init_lock( & team -> t.t_single_lock ); + team->t.t_construct = 0; + __kmp_init_lock( & team->t.t_single_lock ); - team -> t.t_ordered .dt.t_value = 0; - team -> t.t_master_active = FALSE; + team->t.t_ordered .dt.t_value = 0; + team->t.t_master_active = FALSE; - memset( & team -> t.t_taskq, '\0', sizeof( kmp_taskq_t )); + memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t )); #ifdef KMP_DEBUG - team -> t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ + team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ #endif - team -> t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ + team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ - team -> t.t_control_stack_top = NULL; - - __kmp_reinitialize_team( team, -#if OMP_30_ENABLED - new_icvs, loc -#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif // OMP_30_ENABLED - ); + team->t.t_control_stack_top = NULL; + __kmp_reinitialize_team( team, new_icvs, loc ); KMP_MB(); KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) ); @@ -5330,7 +4136,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5370,7 +4176,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5384,7 +4190,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5433,7 +4239,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5445,7 +4251,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == 
__kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5460,7 +4266,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5499,7 +4305,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5513,7 +4319,7 @@ __kmp_partition_places( kmp_team_t *team ) if ( place == last_place ) { place = first_place; } - else if ( place == __kmp_affinity_num_masks - 1) { + else if ( place == (int)(__kmp_affinity_num_masks - 1) ) { place = 0; } else { @@ -5548,49 +4354,106 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, #if OMP_40_ENABLED kmp_proc_bind_t new_proc_bind, #endif -#if OMP_30_ENABLED kmp_internal_control_t *new_icvs, -#else - int new_set_nproc, int new_set_dynamic, int new_set_nested, - int new_set_blocktime, int new_bt_intervals, int new_bt_set, -#endif - int argc ) + int argc USE_NESTED_HOT_ARG(kmp_info_t *master) ) { + KMP_TIME_BLOCK(KMP_allocate_team); int f; kmp_team_t *team; char *ptr; size_t size; + int use_hot_team = ! root->r.r_active; KA_TRACE( 20, ("__kmp_allocate_team: called\n")); KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 ); KMP_DEBUG_ASSERT( max_nproc >= new_nproc ); KMP_MB(); - // - // optimization to use a "hot" team for the top level, - // as it is usually the same - // - if ( ! root->r.r_active && new_nproc > 1 ) { - +#if KMP_NESTED_HOT_TEAMS + int level; + kmp_hot_team_ptr_t *hot_teams; + if( master ) { + team = master->th.th_team; + level = team->t.t_active_level; + if( master->th.th_teams_microtask ) { // in teams construct? + if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1 + team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams + master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams + ++level; // not increment if #teams==1, or for outer fork of the teams; increment otherwise + } + } + hot_teams = master->th.th_hot_teams; + if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team ) + { // hot team has already been allocated for given level + use_hot_team = 1; + } else { + use_hot_team = 0; + } + } +#endif + // Optimization to use a "hot" team + if( use_hot_team && new_nproc > 1 ) { KMP_DEBUG_ASSERT( new_nproc == max_nproc ); - - team = root -> r.r_hot_team; - -#if OMP_30_ENABLED && KMP_DEBUG +#if KMP_NESTED_HOT_TEAMS + team = hot_teams[level].hot_team; +#else + team = root->r.r_hot_team; +#endif +#if KMP_DEBUG if ( __kmp_tasking_mode != tskm_immediate_exec ) { KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p before reinit\n", - team -> t.t_task_team )); + team->t.t_task_team )); } #endif - /* has the number of threads changed? */ - if( team -> t.t_nproc > new_nproc ) { + // Has the number of threads changed? + /* Let's assume the most common case is that the number of threads is unchanged, and + put that case first. 
*/ + if (team->t.t_nproc == new_nproc) { // Check changes in number of threads + KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" )); +#if KMP_MIC + // This case can mean that omp_set_num_threads() was called and the hot team size + // was already reduced, so we check the special flag + if ( team->t.t_size_changed == -1 ) { + team->t.t_size_changed = 1; + } else { + team->t.t_size_changed = 0; + } +#endif + + // TODO???: team->t.t_max_active_levels = new_max_active_levels; + team->t.t_sched = new_icvs->sched; + + __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident ); + + KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", + 0, team->t.t_threads[0], team ) ); + __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 ); + +#if OMP_40_ENABLED +# if KMP_AFFINITY_SUPPORTED + if ( team->t.t_proc_bind == new_proc_bind ) { + KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n", + team->t.t_id, new_proc_bind, team->t.t_first_place, + team->t.t_last_place ) ); + } + else { + team->t.t_proc_bind = new_proc_bind; + __kmp_partition_places( team ); + } +# else + if ( team->t.t_proc_bind != new_proc_bind ) { + team->t.t_proc_bind = new_proc_bind; + } +# endif /* KMP_AFFINITY_SUPPORTED */ +#endif /* OMP_40_ENABLED */ + } + else if( team->t.t_nproc > new_nproc ) { KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc )); #if KMP_MIC - team -> t.t_size_changed = 1; + team->t.t_size_changed = 1; #endif -#if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { kmp_task_team_t *task_team = team->t.t_task_team; if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) { @@ -5612,31 +4475,27 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, KMP_DEBUG_ASSERT( task_team == NULL ); } } -#endif // OMP_30_ENABLED - - /* release the extra threads we don't need any more */ - for( f = new_nproc ; f < team->t.t_nproc ; f++ ) { - KMP_DEBUG_ASSERT( team->t.t_threads[ f ] ); - __kmp_free_thread( team->t.t_threads[ f ] ); - team -> t.t_threads[ f ] = NULL; - } - - team -> t.t_nproc = new_nproc; -#if OMP_30_ENABLED - // TODO???: team -> t.t_max_active_levels = new_max_active_levels; - team -> t.t_sched = new_icvs->sched; -#endif - __kmp_reinitialize_team( team, -#if OMP_30_ENABLED - new_icvs, root->r.r_uber_thread->th.th_ident -#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif // OMP_30_ENABLED - ); - +#if KMP_NESTED_HOT_TEAMS + if( __kmp_hot_teams_mode == 0 ) { + // AC: saved number of threads should correspond to team's value in this mode, + // can be bigger in mode 1, when hot team has some threads in reserve + KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); + hot_teams[level].hot_team_nth = new_nproc; +#endif // KMP_NESTED_HOT_TEAMS + /* release the extra threads we don't need any more */ + for( f = new_nproc ; f < team->t.t_nproc ; f++ ) { + KMP_DEBUG_ASSERT( team->t.t_threads[ f ] ); + __kmp_free_thread( team->t.t_threads[ f ] ); + team->t.t_threads[ f ] = NULL; + } +#if KMP_NESTED_HOT_TEAMS + } // (__kmp_hot_teams_mode == 0) +#endif // KMP_NESTED_HOT_TEAMS + team->t.t_nproc = new_nproc; + // TODO???: team->t.t_max_active_levels = new_max_active_levels; + team->t.t_sched = new_icvs->sched; + __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident ); -#if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { 
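Taken together, the hunks around this point restructure the hot-team reuse logic in __kmp_allocate_team() into three cases keyed on how new_nproc compares with the current team size; schematically (task-team, affinity and nested-hot-team details omitted):

    if ( team->t.t_nproc == new_nproc ) {
        /* common case: reuse the hot team as-is, refreshing sched/ICVs and proc_bind */
    }
    else if ( team->t.t_nproc > new_nproc ) {
        /* shrink: release the extra threads, or keep them in reserve with nested hot teams */
    }
    else {  /* team->t.t_nproc < new_nproc */
        /* grow: pull reserved threads back in if available, otherwise allocate new ones */
    }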
kmp_task_team_t *task_team = team->t.t_task_team; if ( task_team != NULL ) { @@ -5646,20 +4505,17 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, task_team->tt.tt_ref_ct = new_nproc - 1; } } -#endif /* update the remaining threads */ - for( f = 0 ; f < new_nproc ; f++ ) { - team -> t.t_threads[ f ] -> th.th_team_nproc = team->t.t_nproc; + for(f = 0; f < new_nproc; ++f) { + team->t.t_threads[f]->th.th_team_nproc = new_nproc; } -#if OMP_30_ENABLED // restore the current task state of the master thread: should be the implicit task KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, team->t.t_threads[0], team ) ); - __kmp_push_current_task_to_thread( team -> t.t_threads[ 0 ], team, 0 ); -#endif + __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 ); #ifdef KMP_DEBUG for ( f = 0; f < team->t.t_nproc; f++ ) { @@ -5674,9 +4530,8 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, __kmp_partition_places( team ); # endif #endif - } - else if ( team -> t.t_nproc < new_nproc ) { + else { // team->t.t_nproc < new_nproc #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED kmp_affin_mask_t *old_mask; if ( KMP_AFFINITY_CAPABLE() ) { @@ -5687,21 +4542,39 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc )); #if KMP_MIC - team -> t.t_size_changed = 1; + team->t.t_size_changed = 1; #endif - if(team -> t.t_max_nproc < new_nproc) { +#if KMP_NESTED_HOT_TEAMS + int avail_threads = hot_teams[level].hot_team_nth; + if( new_nproc < avail_threads ) + avail_threads = new_nproc; + kmp_info_t **other_threads = team->t.t_threads; + for ( f = team->t.t_nproc; f < avail_threads; ++f ) { + // Adjust barrier data of reserved threads (if any) of the team + // Other data will be set in __kmp_initialize_info() below. + int b; + kmp_balign_t * balign = other_threads[f]->th.th_bar; + for ( b = 0; b < bs_last_barrier; ++ b ) { + balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); + } + } + if( hot_teams[level].hot_team_nth >= new_nproc ) { + // we have all needed threads in reserve, no need to allocate any + // this only possible in mode 1, cannot have reserved threads in mode 0 + KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); + team->t.t_nproc = new_nproc; // just get reserved threads involved + } else { + // we may have some threads in reserve, but not enough + team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any + hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size +#endif // KMP_NESTED_HOT_TEAMS + if(team->t.t_max_nproc < new_nproc) { /* reallocate larger arrays */ __kmp_reallocate_team_arrays(team, new_nproc); - __kmp_reinitialize_team( team, -#if OMP_30_ENABLED - new_icvs, NULL -#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif // OMP_30_ENABLED - ); + __kmp_reinitialize_team( team, new_icvs, NULL ); } #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED @@ -5729,8 +4602,9 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, { // Initialize barrier data for new threads. 
int b; kmp_balign_t * balign = new_worker->th.th_bar; - for ( b = 0; b < bp_last_bar; ++ b ) { + for( b = 0; b < bs_last_barrier; ++ b ) { balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); } } } @@ -5742,19 +4616,12 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, KMP_CPU_FREE(old_mask); } #endif - +#if KMP_NESTED_HOT_TEAMS + } // end of check of t_nproc vs. new_nproc vs. hot_team_nth +#endif // KMP_NESTED_HOT_TEAMS /* make sure everyone is syncronized */ - __kmp_initialize_team( team, new_nproc, -#if OMP_30_ENABLED - new_icvs, - root->r.r_uber_thread->th.th_ident -#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident ); -#if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { kmp_task_team_t *task_team = team->t.t_task_team; if ( task_team != NULL ) { @@ -5764,7 +4631,6 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, task_team->tt.tt_ref_ct = new_nproc - 1; } } -#endif /* reinitialize the old threads */ for( f = 0 ; f < team->t.t_nproc ; f++ ) @@ -5783,62 +4649,39 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, __kmp_partition_places( team ); # endif #endif - - } - else { - KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" )); -#if KMP_MIC - // This case can mean that omp_set_num_threads() was called and the hot team size - // was already reduced, so we check the special flag - if ( team -> t.t_size_changed == -1 ) { - team -> t.t_size_changed = 1; - } else { - team -> t.t_size_changed = 0; - } -#endif - -#if OMP_30_ENABLED - // TODO???: team -> t.t_max_active_levels = new_max_active_levels; - team -> t.t_sched = new_icvs->sched; -#endif - - __kmp_reinitialize_team( team, -#if OMP_30_ENABLED - new_icvs, root->r.r_uber_thread->th.th_ident -#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif // OMP_30_ENABLED - ); - -#if OMP_30_ENABLED - KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", - 0, team->t.t_threads[0], team ) ); - __kmp_push_current_task_to_thread( team -> t.t_threads[ 0 ], team, 0 ); -#endif + } // Check changes in number of threads #if OMP_40_ENABLED -# if KMP_AFFINITY_SUPPORTED - if ( team->t.t_proc_bind == new_proc_bind ) { - KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n", - team->t.t_id, new_proc_bind, team->t.t_first_place, - team->t.t_last_place ) ); - } - else { - team->t.t_proc_bind = new_proc_bind; - __kmp_partition_places( team ); + kmp_info_t *master = team->t.t_threads[0]; + if( master->th.th_teams_microtask ) { + for( f = 1; f < new_nproc; ++f ) { + // propagate teams construct specific info to workers + kmp_info_t *thr = team->t.t_threads[f]; + thr->th.th_teams_microtask = master->th.th_teams_microtask; + thr->th.th_teams_level = master->th.th_teams_level; + thr->th.th_teams_size = master->th.th_teams_size; } -# else - if ( team->t.t_proc_bind != new_proc_bind ) { - team->t.t_proc_bind = new_proc_bind; - } -# endif /* KMP_AFFINITY_SUPPORTED */ + } #endif /* OMP_40_ENABLED */ +#if KMP_NESTED_HOT_TEAMS + if( level ) { + // Sync task (TODO: and barrier?) state for nested hot teams, not needed for outermost hot team. 
+ for( f = 1; f < new_nproc; ++f ) { + kmp_info_t *thr = team->t.t_threads[f]; + thr->th.th_task_state = master->th.th_task_state; + int b; + kmp_balign_t * balign = thr->th.th_bar; + for( b = 0; b < bs_last_barrier; ++ b ) { + balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived; + KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); + } + } } +#endif // KMP_NESTED_HOT_TEAMS /* reallocate space for arguments if necessary */ __kmp_alloc_argv_entries( argc, team, TRUE ); - team -> t.t_argc = argc; + team->t.t_argc = argc; // // The hot team re-uses the previous task team, // if untouched during the previous release->gather phase. @@ -5846,10 +4689,10 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, KF_TRACE( 10, ( " hot_team = %p\n", team ) ); -#if OMP_30_ENABLED && KMP_DEBUG +#if KMP_DEBUG if ( __kmp_tasking_mode != tskm_immediate_exec ) { KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p after reinit\n", - team -> t.t_task_team )); + team->t.t_task_team )); } #endif @@ -5868,25 +4711,15 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, __kmp_team_pool = team->t.t_next_pool; /* setup the team for fresh use */ - __kmp_initialize_team( team, new_nproc, -#if OMP_30_ENABLED - new_icvs, - NULL // TODO: !!! -#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + __kmp_initialize_team( team, new_nproc, new_icvs, NULL ); -#if OMP_30_ENABLED KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n", &team->t.t_task_team ) ); - team -> t.t_task_team = NULL; -#endif + team->t.t_task_team = NULL; /* reallocate space for arguments if necessary */ __kmp_alloc_argv_entries( argc, team, TRUE ); - team -> t.t_argc = argc; + team->t.t_argc = argc; KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE )); @@ -5919,28 +4752,18 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) ); /* and set it up */ - team -> t.t_max_nproc = max_nproc; + team->t.t_max_nproc = max_nproc; /* NOTE well, for some reason allocating one big buffer and dividing it * up seems to really hurt performance a lot on the P4, so, let's not use * this... */ __kmp_allocate_team_arrays( team, max_nproc ); KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) ); - __kmp_initialize_team( team, new_nproc, -#if OMP_30_ENABLED - new_icvs, - NULL // TODO: !!! 
-#else - new_set_nproc, new_set_dynamic, new_set_nested, - new_set_blocktime, new_bt_intervals, new_bt_set -#endif - ); + __kmp_initialize_team( team, new_nproc, new_icvs, NULL ); -#if OMP_30_ENABLED KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n", &team->t.t_task_team ) ); - team -> t.t_task_team = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate -#endif + team->t.t_task_team = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate if ( __kmp_storage_map ) { __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc ); @@ -5948,7 +4771,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, /* allocate space for arguments */ __kmp_alloc_argv_entries( argc, team, FALSE ); - team -> t.t_argc = argc; + team->t.t_argc = argc; KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE )); @@ -5976,7 +4799,7 @@ __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc, /* free the team. return it to the team pool. release all the threads * associated with it */ void -__kmp_free_team( kmp_root_t *root, kmp_team_t *team ) +__kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) ) { int f; KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id )); @@ -5987,15 +4810,37 @@ __kmp_free_team( kmp_root_t *root, kmp_team_t *team ) KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc ); KMP_DEBUG_ASSERT( team->t.t_threads ); + int use_hot_team = team == root->r.r_hot_team; +#if KMP_NESTED_HOT_TEAMS + int level; + kmp_hot_team_ptr_t *hot_teams; + if( master ) { + level = team->t.t_active_level - 1; + if( master->th.th_teams_microtask ) { // in teams construct? + if( master->th.th_teams_size.nteams > 1 ) { + ++level; // level was not increased in teams construct for team_of_masters + } + if( team->t.t_pkfn != (microtask_t)__kmp_teams_master && + master->th.th_teams_level == team->t.t_level ) { + ++level; // level was not increased in teams construct for team_of_workers before the parallel + } // team->t.t_level will be increased inside parallel + } + hot_teams = master->th.th_hot_teams; + if( level < __kmp_hot_teams_max_level ) { + KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team ); + use_hot_team = 1; + } + } +#endif // KMP_NESTED_HOT_TEAMS + /* team is done working */ TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library. - team -> t.t_copyin_counter = 0; // init counter for possible reuse + team->t.t_copyin_counter = 0; // init counter for possible reuse // Do not reset pointer to parent team to NULL for hot teams. - /* if we are a nested team, release our threads */ - if( team != root->r.r_hot_team ) { + /* if we are non-hot team, release our threads */ + if( ! use_hot_team ) { -#if OMP_30_ENABLED if ( __kmp_tasking_mode != tskm_immediate_exec ) { kmp_task_team_t *task_team = team->t.t_task_team; if ( task_team != NULL ) { @@ -6013,10 +4858,9 @@ __kmp_free_team( kmp_root_t *root, kmp_team_t *team ) team->t.t_task_team = NULL; } } -#endif /* OMP_30_ENABLED */ // Reset pointer to parent team only for non-hot teams. 
- team -> t.t_parent = NULL; + team->t.t_parent = NULL; /* free the worker threads */ @@ -6029,7 +4873,7 @@ __kmp_free_team( kmp_root_t *root, kmp_team_t *team ) /* put the team back in the team pool */ /* TODO limit size of team pool, call reap_team if pool too large */ - team -> t.t_next_pool = (kmp_team_t*) __kmp_team_pool; + team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool; __kmp_team_pool = (volatile kmp_team_t*) team; } @@ -6041,29 +4885,21 @@ __kmp_free_team( kmp_root_t *root, kmp_team_t *team ) kmp_team_t * __kmp_reap_team( kmp_team_t *team ) { - kmp_team_t *next_pool = team -> t.t_next_pool; + kmp_team_t *next_pool = team->t.t_next_pool; KMP_DEBUG_ASSERT( team ); - KMP_DEBUG_ASSERT( team -> t.t_dispatch ); - KMP_DEBUG_ASSERT( team -> t.t_disp_buffer ); - KMP_DEBUG_ASSERT( team -> t.t_threads ); - #if OMP_30_ENABLED - #else - KMP_DEBUG_ASSERT( team -> t.t_set_nproc ); - #endif - KMP_DEBUG_ASSERT( team -> t.t_argv ); + KMP_DEBUG_ASSERT( team->t.t_dispatch ); + KMP_DEBUG_ASSERT( team->t.t_disp_buffer ); + KMP_DEBUG_ASSERT( team->t.t_threads ); + KMP_DEBUG_ASSERT( team->t.t_argv ); /* TODO clean the threads that are a part of this? */ /* free stuff */ __kmp_free_team_arrays( team ); -#if (KMP_PERF_V106 == KMP_ON) - if ( team -> t.t_argv != &team -> t.t_inline_argv[0] ) - __kmp_free( (void*) team -> t.t_argv ); -#else - __kmp_free( (void*) team -> t.t_argv ); -#endif + if ( team->t.t_argv != &team->t.t_inline_argv[0] ) + __kmp_free( (void*) team->t.t_argv ); __kmp_free( team ); KMP_MB(); @@ -6108,6 +4944,15 @@ __kmp_free_thread( kmp_info_t *this_th ) KMP_DEBUG_ASSERT( this_th ); + // When moving thread to pool, switch thread to wait on own b_go flag, and uninitialized (NULL team). + int b; + kmp_balign_t *balign = this_th->th.th_bar; + for (b=0; b<bs_last_barrier; ++b) { + if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) + balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; + balign[b].bb.team = NULL; + } + /* put thread back on the free pool */ TCW_PTR(this_th->th.th_team, NULL); @@ -6170,361 +5015,7 @@ __kmp_free_thread( kmp_info_t *this_th ) KMP_MB(); } -void -__kmp_join_barrier( int gtid ) -{ - register kmp_info_t *this_thr = __kmp_threads[ gtid ]; - register kmp_team_t *team; - register kmp_uint nproc; - kmp_info_t *master_thread; - int tid; - #ifdef KMP_DEBUG - int team_id; - #endif /* KMP_DEBUG */ -#if USE_ITT_BUILD - void * itt_sync_obj = NULL; - #if USE_ITT_NOTIFY - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) // don't call routine without need - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); // get object created at fork_barrier - #endif -#endif /* USE_ITT_BUILD */ - KMP_MB(); - - /* get current info */ - team = this_thr -> th.th_team; - /* nproc = team -> t.t_nproc;*/ - nproc = this_thr -> th.th_team_nproc; - KMP_DEBUG_ASSERT( nproc == team->t.t_nproc ); - tid = __kmp_tid_from_gtid(gtid); - #ifdef KMP_DEBUG - team_id = team -> t.t_id; - #endif /* KMP_DEBUG */ - /* master_thread = team -> t.t_threads[0];*/ - master_thread = this_thr -> th.th_team_master; - #ifdef KMP_DEBUG - if ( master_thread != team->t.t_threads[0] ) { - __kmp_print_structure(); - } - #endif /* KMP_DEBUG */ - KMP_DEBUG_ASSERT( master_thread == team->t.t_threads[0] ); - KMP_MB(); - - /* verify state */ - KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] ); - KMP_DEBUG_ASSERT( TCR_PTR(this_thr->th.th_team) ); - KMP_DEBUG_ASSERT( TCR_PTR(this_thr->th.th_root) ); - KMP_DEBUG_ASSERT( this_thr == team -> t.t_threads[tid] ); - - KA_TRACE( 10, ("__kmp_join_barrier: 
T#%d(%d:%d) arrived at join barrier\n", - gtid, team_id, tid )); - - #if OMP_30_ENABLED - if ( __kmp_tasking_mode == tskm_extra_barrier ) { - __kmp_tasking_barrier( team, this_thr, gtid ); - - KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", - gtid, team_id, tid )); - } - #ifdef KMP_DEBUG - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - KA_TRACE( 20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n", - __kmp_gtid_from_thread( this_thr ), team_id, team -> t.t_task_team, - this_thr->th.th_task_team ) ); - KMP_DEBUG_ASSERT( this_thr->th.th_task_team == team->t.t_task_team ); - } - #endif /* KMP_DEBUG */ - #endif /* OMP_30_ENABLED */ - - // - // Copy the blocktime info to the thread, where __kmp_wait_sleep() - // can access it when the team struct is not guaranteed to exist. - // - // Doing these loads causes a cache miss slows down EPCC parallel by 2x. - // As a workaround, we do not perform the copy if blocktime=infinite, - // since the values are not used by __kmp_wait_sleep() in that case. - // - if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) { - #if OMP_30_ENABLED - this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set; - #else - this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid]; - this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid]; - #endif // OMP_30_ENABLED - } - -#if USE_ITT_BUILD - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) - __kmp_itt_barrier_starting( gtid, itt_sync_obj ); -#endif /* USE_ITT_BUILD */ - - if ( __kmp_barrier_gather_pattern[ bs_forkjoin_barrier ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bs_forkjoin_barrier ] == 0 ) { - __kmp_linear_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else if ( __kmp_barrier_gather_pattern[ bs_forkjoin_barrier ] == bp_tree_bar ) { - __kmp_tree_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else { - __kmp_hyper_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - }; // if - -#if USE_ITT_BUILD - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) - __kmp_itt_barrier_middle( gtid, itt_sync_obj ); -#endif /* USE_ITT_BUILD */ - - // - // From this point on, the team data structure may be deallocated - // at any time by the master thread - it is unsafe to reference it - // in any of the worker threads. - // - // Any per-team data items that need to be referenced before the end - // of the barrier should be moved to the kmp_task_team_t structs. - // - - #if OMP_30_ENABLED - if ( KMP_MASTER_TID( tid ) ) { - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - // Master shouldn't call decrease_load(). // TODO: enable master threads. - // Master should have th_may_decrease_load == 0. // TODO: enable master threads. 
- __kmp_task_team_wait( this_thr, team - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } -#if USE_ITT_BUILD && USE_ITT_NOTIFY - // Join barrier - report frame end - if( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode ) { - kmp_uint64 tmp = __itt_get_timestamp(); - ident_t * loc = team->t.t_ident; - switch( __kmp_forkjoin_frames_mode ) { - case 1: - __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, loc ); - break; - case 2: - __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, loc ); - break; - case 3: - __kmp_itt_frame_submit( gtid, this_thr->th.th_frame_time, tmp, 0, loc ); - __kmp_itt_frame_submit( gtid, this_thr->th.th_bar_arrive_time, tmp, 1, loc ); - break; - } - } -#endif /* USE_ITT_BUILD */ - } - #endif /* OMP_30_ENABLED */ - - #if KMP_DEBUG - if( KMP_MASTER_TID( tid )) { - KA_TRACE( 15, ( "__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n", - gtid, team_id, tid, nproc )); - } - #endif /* KMP_DEBUG */ - - /* TODO now, mark worker threads as done so they may be disbanded */ - - KMP_MB(); /* Flush all pending memory write invalidates. */ - KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", - gtid, team_id, tid )); -} - - -/* TODO release worker threads' fork barriers as we are ready instead of all at once */ - -void -__kmp_fork_barrier( int gtid, int tid ) -{ - kmp_info_t *this_thr = __kmp_threads[ gtid ]; - kmp_team_t *team = ( tid == 0 ) ? this_thr -> th.th_team : NULL; -#if USE_ITT_BUILD - void * itt_sync_obj = NULL; -#endif /* USE_ITT_BUILD */ - - KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", - gtid, ( team != NULL ) ? team->t.t_id : -1, tid )); - - /* th_team pointer only valid for master thread here */ - if ( KMP_MASTER_TID( tid ) ) { - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) { - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 1 ); // create itt barrier object - //__kmp_itt_barrier_starting( gtid, itt_sync_obj ); // AC: no need to call prepare right before acquired - __kmp_itt_barrier_middle( gtid, itt_sync_obj ); // call acquired / releasing - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - -#ifdef KMP_DEBUG - - register kmp_info_t **other_threads = team -> t.t_threads; - register int i; - - /* verify state */ - KMP_MB(); - - for( i = 1; i < team -> t.t_nproc ; i++ ) { - KA_TRACE( 500, ( "__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork " - "go == %u.\n", - gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid, - team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid, - other_threads[i]->th.th_bar[ bs_forkjoin_barrier ].bb.b_go ) ); - - KMP_DEBUG_ASSERT( ( TCR_4( other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go ) - & ~(KMP_BARRIER_SLEEP_STATE) ) - == KMP_INIT_BARRIER_STATE ); - KMP_DEBUG_ASSERT( other_threads[i]->th.th_team == team ); - - } -#endif - -#if OMP_30_ENABLED - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - __kmp_task_team_setup( this_thr, team ); - } -#endif /* OMP_30_ENABLED */ - - // - // The master thread may have changed its blocktime between the - // join barrier and the fork barrier. - // - // Copy the blocktime info to the thread, where __kmp_wait_sleep() - // can access it when the team struct is not guaranteed to exist. - // - // See the note about the corresponding code in __kmp_join_barrier() - // being performance-critical. 
- // - if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) { -#if OMP_30_ENABLED - this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals; - this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set; -#else - this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid]; - this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid]; -#endif // OMP_30_ENABLED - } - } // master - - if ( __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] == 0 ) { - __kmp_linear_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else if ( __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] == bp_tree_bar ) { - __kmp_tree_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - } else { - __kmp_hyper_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE - USE_ITT_BUILD_ARG( itt_sync_obj ) - ); - }; // if - - // - // early exit for reaping threads releasing forkjoin barrier - // - if ( TCR_4(__kmp_global.g.g_done) ) { - -#if OMP_30_ENABLED - if ( this_thr->th.th_task_team != NULL ) { - if ( KMP_MASTER_TID( tid ) ) { - TCW_PTR(this_thr->th.th_task_team, NULL); - } - else { - __kmp_unref_task_team( this_thr->th.th_task_team, this_thr ); - } - } -#endif /* OMP_30_ENABLED */ - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) { - if ( !KMP_MASTER_TID( tid ) ) { - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); - if ( itt_sync_obj ) - __kmp_itt_barrier_finished( gtid, itt_sync_obj ); - } - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d is leaving early\n", gtid )); - return; - } - - // - // We can now assume that a valid team structure has been allocated - // by the master and propagated to all worker threads. - // - // The current thread, however, may not be part of the team, so we can't - // blindly assume that the team pointer is non-null. - // - team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team); - KMP_DEBUG_ASSERT( team != NULL ); - tid = __kmp_tid_from_gtid( gtid ); - -#if OMP_30_ENABLED - -# if KMP_BARRIER_ICV_PULL - // Master thread's copy of the ICVs was set up on the implicit taskdata in __kmp_reinitialize_team. - // __kmp_fork_call() assumes the master thread's implicit task has this data before this function is called. - // We cannot modify __kmp_fork_call() to look at the fixed ICVs in the master's thread struct, because it is - // not always the case that the threads arrays have been allocated when __kmp_fork_call() is executed. - if (! KMP_MASTER_TID( tid ) ) { // master thread already has ICVs - // Copy the initial ICVs from the master's thread struct to the implicit task for this tid. 
- KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid )); - load_icvs(&team->t.t_threads[0]->th.th_fixed_icvs); - __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE ); - store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &team->t.t_threads[0]->th.th_fixed_icvs); - sync_icvs(); - } -# endif // KMP_BARRIER_ICV_PULL - - if ( __kmp_tasking_mode != tskm_immediate_exec ) { - __kmp_task_team_sync( this_thr, team ); - } - -#endif /* OMP_30_ENABLED */ - -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - kmp_proc_bind_t proc_bind = team->t.t_proc_bind; - if ( proc_bind == proc_bind_intel ) { -#endif -#if KMP_MIC - // - // Call dynamic affinity settings - // - if( __kmp_affinity_type == affinity_balanced && team->t.t_size_changed ) { - __kmp_balanced_affinity( tid, team->t.t_nproc ); - } -#endif -#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED - } - else if ( ( proc_bind != proc_bind_false ) - && ( proc_bind != proc_bind_disabled )) { - if ( this_thr->th.th_new_place == this_thr->th.th_current_place ) { - KA_TRACE( 100, ( "__kmp_fork_barrier: T#%d already in correct place %d\n", - __kmp_gtid_from_thread( this_thr ), this_thr->th.th_current_place ) ); - } - else { - __kmp_affinity_set_place( gtid ); - } - } -#endif - -#if USE_ITT_BUILD && USE_ITT_NOTIFY - if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) { - if ( !KMP_MASTER_TID( tid ) ) { - itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); // get correct barrier object - __kmp_itt_barrier_finished( gtid, itt_sync_obj ); // workers call acquired - } // (prepare called inside barrier_release) - } -#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ - KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", - gtid, team->t.t_id, tid )); -} - - -/* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ void * @@ -6538,7 +5029,7 @@ __kmp_launch_thread( kmp_info_t *this_thr ) KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) ); if( __kmp_env_consistency_check ) { - this_thr -> th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak? + this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak? 
} /* This is the place where threads wait for work */ @@ -6559,37 +5050,32 @@ __kmp_launch_thread( kmp_info_t *this_thr ) /* we were just woken up, so run our new task */ if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) { int rc; - KA_TRACE( 20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", - gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn )); + KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", + gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn)); -#if KMP_ARCH_X86 || KMP_ARCH_X86_64 - if ( __kmp_inherit_fp_control && (*pteam)->t.t_fp_control_saved ) { - __kmp_clear_x87_fpu_status_word(); - __kmp_load_x87_fpu_control_word( &(*pteam)->t.t_x87_fpu_control_word ); - __kmp_load_mxcsr( &(*pteam)->t.t_mxcsr ); - } -#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ + updateHWFPControl (*pteam); - rc = (*pteam) -> t.t_invoke( gtid ); + KMP_STOP_EXPLICIT_TIMER(USER_launch_thread_loop); + { + KMP_TIME_BLOCK(USER_worker_invoke); + rc = (*pteam)->t.t_invoke( gtid ); + } + KMP_START_EXPLICIT_TIMER(USER_launch_thread_loop); KMP_ASSERT( rc ); KMP_MB(); - KA_TRACE( 20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", - gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn )); + KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", + gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn)); } - /* join barrier after parallel region */ __kmp_join_barrier( gtid ); } } - TCR_SYNC_PTR(__kmp_global.g.g_done); + TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); -#if OMP_30_ENABLED if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) { __kmp_unref_task_team( this_thr->th.th_task_team, this_thr ); } -#endif /* OMP_30_ENABLED */ - /* run the destructors for the threadprivate data for this thread */ __kmp_common_destroy_gtid( gtid ); @@ -6601,8 +5087,6 @@ __kmp_launch_thread( kmp_info_t *this_thr ) /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ - - void __kmp_internal_end_dest( void *specific_gtid ) { @@ -6711,11 +5195,8 @@ __kmp_reap_thread( /* Assume the threads are at the fork barrier here */ KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) ); /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */ - __kmp_release( - thread, - &thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, - kmp_release_fence - ); + kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread); + __kmp_release_64(&flag); }; // if @@ -6824,7 +5305,7 @@ __kmp_internal_end(void) for( i=0 ; i<__kmp_threads_capacity ; i++ ) if( __kmp_root[i] ) - if( __kmp_root[i] -> r.r_active ) + if( __kmp_root[i]->r.r_active ) break; KMP_MB(); /* Flush all pending memory write invalidates. */ TCW_SYNC_4(__kmp_global.g.g_done, TRUE); @@ -6857,10 +5338,10 @@ __kmp_internal_end(void) } else { /* TODO move this to cleanup code */ #ifdef KMP_DEBUG - /* Make sure that everything has properly ended */ + /* make sure that everything has properly ended */ for ( i = 0; i < __kmp_threads_capacity; i++ ) { if( __kmp_root[i] ) { -// KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here +// KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active? 
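The __kmp_launch_thread() hunk above reads more easily as the worker's top-level lifecycle; a condensed sketch (tracing, timers, consistency checks and the early-exit paths omitted, loop structure assumed from the surrounding code):

    while ( ! TCR_4(__kmp_global.g.g_done) ) {
        /* 1. wait for work: sleep in the fork barrier until the master releases the team */
        /* 2. if a microtask was attached to our team, run it */
        if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
            updateHWFPControl( *pteam );       /* restore the team's FP control state */
            (*pteam)->t.t_invoke( gtid );      /* execute the parallel region body    */
        }
        /* 3. rejoin the team at the join barrier, then loop back to sleep */
        __kmp_join_barrier( gtid );
    }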
} } @@ -6891,9 +5372,7 @@ __kmp_internal_end(void) __kmp_reap_team( team ); }; // while - #if OMP_30_ENABLED - __kmp_reap_task_teams( ); - #endif /* OMP_30_ENABLED */ + __kmp_reap_task_teams( ); for ( i = 0; i < __kmp_threads_capacity; ++ i ) { // TBD: Add some checking... @@ -6968,7 +5447,7 @@ __kmp_internal_end_library( int gtid_req ) /* we don't know who we are, but we may still shutdown the library */ } else if( KMP_UBER_GTID( gtid )) { /* unregister ourselves as an uber thread. gtid is no longer valid */ - if( __kmp_root[gtid] -> r.r_active ) { + if( __kmp_root[gtid]->r.r_active ) { __kmp_global.g.g_abort = -1; TCW_SYNC_4(__kmp_global.g.g_done, TRUE); KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid )); @@ -7072,7 +5551,7 @@ __kmp_internal_end_thread( int gtid_req ) /* we don't know who we are */ } else if( KMP_UBER_GTID( gtid )) { /* unregister ourselves as an uber thread. gtid is no longer valid */ - if( __kmp_root[gtid] -> r.r_active ) { + if( __kmp_root[gtid]->r.r_active ) { __kmp_global.g.g_abort = -1; TCW_SYNC_4(__kmp_global.g.g_done, TRUE); KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid )); @@ -7085,23 +5564,21 @@ __kmp_internal_end_thread( int gtid_req ) /* just a worker thread, let's leave */ KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid )); - #if OMP_30_ENABLED - if ( gtid >= 0 ) { - kmp_info_t *this_thr = __kmp_threads[ gtid ]; - if (TCR_PTR(this_thr->th.th_task_team) != NULL) { - __kmp_unref_task_team(this_thr->th.th_task_team, this_thr); - } + if ( gtid >= 0 ) { + kmp_info_t *this_thr = __kmp_threads[ gtid ]; + if (TCR_PTR(this_thr->th.th_task_team) != NULL) { + __kmp_unref_task_team(this_thr->th.th_task_team, this_thr); } - #endif /* OMP_30_ENABLED */ + } KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid )); return; } } #if defined GUIDEDLL_EXPORTS - // AC: let's not shutdown the Linux* OS dynamic library at the exit of uber thread, - // because it is better shutdown later, in the library destructor. - // The reason for this change is a performance problem when a non-openmp thread + // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber thread, + // because we will better shutdown later in the library destructor. + // The reason of this change is performance problem when non-openmp thread // in a loop forks and joins many openmp threads. We can save a lot of time // keeping worker threads alive until the program shutdown. // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and @@ -7319,7 +5796,7 @@ __kmp_do_serial_initialize( void ) int i, gtid; int size; - KA_TRACE( 10, ("__kmp_serial_initialize: enter\n" ) ); + KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) ); KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 ); KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 ); @@ -7404,9 +5881,7 @@ __kmp_do_serial_initialize( void ) __kmp_static = kmp_sch_static_balanced; // AC: do not use analytical here, because it is non-monotonous //__kmp_guided = kmp_sch_guided_iterative_chunked; - #if OMP_30_ENABLED //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeate assignment - #endif // OMP_30_ENABLED // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch bit control and barrier method // control parts #if KMP_FAST_REDUCTION_BARRIER @@ -7436,9 +5911,15 @@ __kmp_do_serial_initialize( void ) #undef kmp_reduction_barrier_gather_bb #endif // KMP_FAST_REDUCTION_BARRIER #if KMP_MIC - // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC - __kmp_barrier_gather_branch_bits [ 0 ] = 3; // plane gather - __kmp_barrier_release_branch_bits[ 1 ] = 1; // forkjoin release + // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC + __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plane gather + __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release + __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar; + __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar; +#if KMP_FAST_REDUCTION_BARRIER + __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar; + __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar; +#endif #endif // From KMP_CHECKS initialization @@ -7455,6 +5936,7 @@ __kmp_do_serial_initialize( void ) __kmp_global.g.g_dynamic_mode = dynamic_default; __kmp_env_initialize( NULL ); + // Print all messages in message catalog for testing purposes. #ifdef KMP_DEBUG char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" ); @@ -7660,13 +6142,9 @@ __kmp_do_middle_initialize( void ) for ( i = 0; i < __kmp_threads_capacity; i++ ) { kmp_info_t *thread = __kmp_threads[ i ]; if ( thread == NULL ) continue; -#if OMP_30_ENABLED if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue; -#else - if ( thread->th.th_team->t.t_set_nproc[ thread->th.th_info.ds.ds_tid ] != 0 ) continue; -#endif /* OMP_30_ENABLED */ - set__nproc_p( __kmp_threads[ i ], __kmp_dflt_team_nth ); + set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth ); } } KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", @@ -7786,16 +6264,15 @@ __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr, /* none of the threads have encountered any constructs, yet. 
*/ this_thr->th.th_local.this_construct = 0; - this_thr->th.th_local.last_construct = 0; #if KMP_CACHE_MANAGE - KMP_CACHE_PREFETCH( &this_thr -> th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived ); + KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived ); #endif /* KMP_CACHE_MANAGE */ dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); KMP_DEBUG_ASSERT( dispatch ); - KMP_DEBUG_ASSERT( team -> t.t_dispatch ); - //KMP_DEBUG_ASSERT( this_thr -> th.th_dispatch == &team -> t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] ); + KMP_DEBUG_ASSERT( team->t.t_dispatch ); + //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] ); - dispatch -> th_disp_index = 0; /* reset the dispatch buffer counter */ + dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ if( __kmp_env_consistency_check ) __kmp_push_parallel( gtid, team->t.t_ident ); @@ -7817,7 +6294,7 @@ __kmp_invoke_task_func( int gtid ) int rc; int tid = __kmp_tid_from_gtid( gtid ); kmp_info_t *this_thr = __kmp_threads[ gtid ]; - kmp_team_t *team = this_thr -> th.th_team; + kmp_team_t *team = this_thr->th.th_team; __kmp_run_before_invoked_task( gtid, tid, this_thr, team ); #if USE_ITT_BUILD @@ -7825,6 +6302,9 @@ __kmp_invoke_task_func( int gtid ) __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code } #endif /* USE_ITT_BUILD */ +#if INCLUDE_SSC_MARKS + SSC_MARK_INVOKING(); +#endif rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn), gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv ); @@ -7840,27 +6320,30 @@ __kmp_invoke_task_func( int gtid ) #if OMP_40_ENABLED void -__kmp_teams_master( microtask_t microtask, int gtid ) +__kmp_teams_master( int gtid ) { // This routine is called by all master threads in teams construct - kmp_info_t *this_thr = __kmp_threads[ gtid ]; - kmp_team_t *team = this_thr -> th.th_team; + kmp_info_t *thr = __kmp_threads[ gtid ]; + kmp_team_t *team = thr->th.th_team; ident_t *loc = team->t.t_ident; - -#if KMP_DEBUG - int tid = __kmp_tid_from_gtid( gtid ); + thr->th.th_set_nproc = thr->th.th_teams_size.nth; + KMP_DEBUG_ASSERT( thr->th.th_teams_microtask ); + KMP_DEBUG_ASSERT( thr->th.th_set_nproc ); KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", - gtid, tid, microtask) ); -#endif - + gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) ); // Launch league of teams now, but not let workers execute // (they hang on fork barrier until next parallel) - this_thr->th.th_set_nproc = this_thr->th.th_set_nth_teams; - __kmp_fork_call( loc, gtid, TRUE, +#if INCLUDE_SSC_MARKS + SSC_MARK_FORKING(); +#endif + __kmp_fork_call( loc, gtid, fork_context_intel, team->t.t_argc, - microtask, + (microtask_t)thr->th.th_teams_microtask, VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL ); +#if INCLUDE_SSC_MARKS + SSC_MARK_JOINING(); +#endif __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates join barrier which won't work because // worker threads are in a fork barrier waiting for more parallel regions } @@ -7868,13 +6351,15 @@ __kmp_teams_master( microtask_t microtask, int gtid ) int __kmp_invoke_teams_master( int gtid ) { + kmp_info_t *this_thr = __kmp_threads[ gtid ]; + kmp_team_t *team = this_thr->th.th_team; #if KMP_DEBUG if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized ) KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master ); #endif - - __kmp_teams_master( 
(microtask_t)__kmp_threads[gtid]->th.th_team_microtask, gtid ); - + __kmp_run_before_invoked_task( gtid, 0, this_thr, team ); + __kmp_teams_master( gtid ); + __kmp_run_after_invoked_task( gtid, 0, this_thr, team ); return 1; } #endif /* OMP_40_ENABLED */ @@ -7890,7 +6375,7 @@ __kmp_push_num_threads( ident_t *id, int gtid, int num_threads ) kmp_info_t *thr = __kmp_threads[gtid]; if( num_threads > 0 ) - thr -> th.th_set_nproc = num_threads; + thr->th.th_set_nproc = num_threads; } #if OMP_40_ENABLED @@ -7901,20 +6386,21 @@ void __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads ) { kmp_info_t *thr = __kmp_threads[gtid]; - // The number of teams is the number of threads in the outer "parallel" - if( num_teams > 0 ) { - thr -> th.th_set_nproc = num_teams; - } else { - thr -> th.th_set_nproc = 1; // AC: default number of teams is 1; - // TODO: should it be __kmp_ncores ? + KMP_DEBUG_ASSERT(num_teams >= 0); + KMP_DEBUG_ASSERT(num_threads >= 0); + if( num_teams == 0 ) { + num_teams = 1; // default number of teams is 1. } - // The number of threads is for inner parallel regions + // Set number of teams (number of threads in the outer "parallel" of the teams) + thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; + + // Remember the number of threads for inner parallel regions if( num_threads > 0 ) { - thr -> th.th_set_nth_teams = num_threads; + thr->th.th_teams_size.nth = num_threads; } else { if( !TCR_4(__kmp_init_middle) ) - __kmp_middle_initialize(); - thr -> th.th_set_nth_teams = __kmp_avail_proc / thr -> th.th_set_nproc; + __kmp_middle_initialize(); // get __kmp_avail_proc calculated + thr->th.th_teams_size.nth = __kmp_avail_proc / num_teams; } } @@ -7926,7 +6412,7 @@ void __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind ) { kmp_info_t *thr = __kmp_threads[gtid]; - thr -> th.th_set_proc_bind = proc_bind; + thr->th.th_set_proc_bind = proc_bind; } #endif /* OMP_40_ENABLED */ @@ -7943,25 +6429,25 @@ __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team ) #endif /* KMP_DEBUG */ KMP_DEBUG_ASSERT( team ); - KMP_DEBUG_ASSERT( this_thr -> th.th_team == team ); + KMP_DEBUG_ASSERT( this_thr->th.th_team == team ); KMP_ASSERT( KMP_MASTER_GTID(gtid) ); KMP_MB(); /* Flush all pending memory write invalidates. */ - team -> t.t_construct = 0; /* no single directives seen yet */ - team -> t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */ + team->t.t_construct = 0; /* no single directives seen yet */ + team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */ /* Reset the identifiers on the dispatch buffer */ - KMP_DEBUG_ASSERT( team -> t.t_disp_buffer ); + KMP_DEBUG_ASSERT( team->t.t_disp_buffer ); if ( team->t.t_max_nproc > 1 ) { int i; for (i = 0; i < KMP_MAX_DISP_BUF; ++i) - team -> t.t_disp_buffer[ i ].buffer_index = i; + team->t.t_disp_buffer[ i ].buffer_index = i; } else { - team -> t.t_disp_buffer[ 0 ].buffer_index = 0; + team->t.t_disp_buffer[ 0 ].buffer_index = 0; } KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - KMP_ASSERT( this_thr -> th.th_team == team ); + KMP_ASSERT( this_thr->th.th_team == team ); #ifdef KMP_DEBUG for( f=0 ; f<team->t.t_nproc ; f++ ) { @@ -7981,7 +6467,7 @@ __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team ) kmp_info_t *this_thr = __kmp_threads[gtid]; KMP_DEBUG_ASSERT( team ); - KMP_DEBUG_ASSERT( this_thr -> th.th_team == team ); + KMP_DEBUG_ASSERT( this_thr->th.th_team == team ); KMP_ASSERT( KMP_MASTER_GTID(gtid) ); KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -8001,7 +6487,7 @@ __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team ) __kmp_join_barrier( gtid ); /* wait for everyone */ KMP_MB(); /* Flush all pending memory write invalidates. */ - KMP_ASSERT( this_thr -> th.th_team == team ); + KMP_ASSERT( this_thr->th.th_team == team ); } @@ -8057,11 +6543,7 @@ __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc ) KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, set_nproc ) ); KMP_DEBUG_ASSERT( root ); - #if OMP_30_ENABLED KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE ); - #else - KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_set_dynamic[0] == TRUE ); - #endif KMP_DEBUG_ASSERT( set_nproc > 1 ); if ( set_nproc == 1) { @@ -8203,6 +6685,11 @@ __kmp_cleanup( void ) __kmp_i18n_catclose(); +#if KMP_STATS_ENABLED + __kmp_accumulate_stats_at_exit(); + __kmp_stats_list.deallocate(); +#endif + KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) ); } @@ -8244,7 +6731,7 @@ __kmp_internal_begin( void ) /* this is a very important step as it will register new sibling threads * and assign these new uber threads a new gtid */ gtid = __kmp_entry_gtid(); - root = __kmp_threads[ gtid ] -> th.th_root; + root = __kmp_threads[ gtid ]->th.th_root; KMP_ASSERT( KMP_UBER_GTID( gtid )); if( root->r.r_begin ) return; @@ -8254,7 +6741,7 @@ __kmp_internal_begin( void ) return; } - root -> r.r_begin = TRUE; + root->r.r_begin = TRUE; __kmp_release_lock( & root->r.r_begin_lock, gtid ); } @@ -8275,7 +6762,7 @@ __kmp_user_set_library (enum library_type arg) gtid = __kmp_entry_gtid(); thread = __kmp_threads[ gtid ]; - root = thread -> th.th_root; + root = thread->th.th_root; KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial )); if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */ @@ -8285,16 +6772,16 @@ __kmp_user_set_library (enum library_type arg) switch ( arg ) { case library_serial : - thread -> th.th_set_nproc = 0; - set__nproc_p( thread, 1 ); + thread->th.th_set_nproc = 0; + set__nproc( thread, 1 ); break; case library_turnaround : - thread -> th.th_set_nproc = 0; - set__nproc_p( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub ); + thread->th.th_set_nproc = 0; + set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub ); break; case library_throughput : - thread -> th.th_set_nproc = 0; - set__nproc_p( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub ); + thread->th.th_set_nproc = 0; + set__nproc( thread, __kmp_dflt_team_nth ? 
__kmp_dflt_team_nth : __kmp_dflt_team_nth_ub ); break; default: KMP_FATAL( UnknownLibraryType, arg ); @@ -8378,20 +6865,20 @@ __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid) else if (blocktime > KMP_MAX_BLOCKTIME) blocktime = KMP_MAX_BLOCKTIME; - set__blocktime_team( thread -> th.th_team, tid, blocktime ); - set__blocktime_team( thread -> th.th_serial_team, 0, blocktime ); + set__blocktime_team( thread->th.th_team, tid, blocktime ); + set__blocktime_team( thread->th.th_serial_team, 0, blocktime ); /* Calculate and set blocktime intervals for the teams */ bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); - set__bt_intervals_team( thread -> th.th_team, tid, bt_intervals ); - set__bt_intervals_team( thread -> th.th_serial_team, 0, bt_intervals ); + set__bt_intervals_team( thread->th.th_team, tid, bt_intervals ); + set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals ); /* Set whether blocktime has been set to "TRUE" */ bt_set = TRUE; - set__bt_set_team( thread -> th.th_team, tid, bt_set ); - set__bt_set_team( thread -> th.th_serial_team, 0, bt_set ); + set__bt_set_team( thread->th.th_team, tid, bt_set ); + set__bt_set_team( thread->th.th_serial_team, 0, bt_set ); KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n", __kmp_gtid_from_tid(tid, thread->th.th_team), thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) ); @@ -8481,7 +6968,7 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, #elif KMP_ARCH_X86 || KMP_ARCH_ARM - #if KMP_AFFINITY_SUPPORTED + #if KMP_OS_LINUX || KMP_OS_WINDOWS // basic tuning @@ -8559,7 +7046,7 @@ __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid, // this function is for testing set/get/determine reduce method kmp_int32 __kmp_get_reduce_method( void ) { - return ( ( __kmp_entry_thread() -> th.th_local.packed_reduction_method ) >> 8 ); + return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 ); } /* ------------------------------------------------------------------------ */ diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp index 18e5a38dc40..14fa62ee85d 100644 --- a/openmp/runtime/src/kmp_sched.cpp +++ b/openmp/runtime/src/kmp_sched.cpp @@ -1,7 +1,7 @@ /* * kmp_sched.c -- static scheduling -- iteration initialization - * $Revision: 42358 $ - * $Date: 2013-05-07 13:43:26 -0500 (Tue, 07 May 2013) $ + * $Revision: 43457 $ + * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $ */ @@ -28,6 +28,8 @@ #include "kmp_i18n.h" #include "kmp_str.h" #include "kmp_error.h" +#include "kmp_stats.h" +#include "kmp_itt.h" // template for type limits template< typename T > @@ -79,6 +81,7 @@ __kmp_for_static_init( typename traits_t< T >::signed_t incr, typename traits_t< T >::signed_t chunk ) { + KMP_COUNT_BLOCK(OMP_FOR_static); typedef typename traits_t< T >::unsigned_t UT; typedef typename traits_t< T >::signed_t ST; /* this all has to be changed back to TID and such.. */ @@ -88,6 +91,7 @@ __kmp_for_static_init( register UT trip_count; register kmp_team_t *team; + KMP_DEBUG_ASSERT( plastiter && plower && pupper && pstride ); KE_TRACE( 10, ("__kmpc_for_static_init called (%d)\n", global_tid)); #ifdef KMP_DEBUG { @@ -108,12 +112,12 @@ __kmp_for_static_init( __kmp_push_workshare( global_tid, ct_pdo, loc ); if ( incr == 0 ) { __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); - } } /* special handling for zero-trip loops */ if ( incr > 0 ? 
(*pupper < *plower) : (*plower < *pupper) ) { - *plastiter = FALSE; + if( plastiter != NULL ) + *plastiter = FALSE; /* leave pupper and plower set to entire iteration space */ *pstride = incr; /* value should never be used */ // *plower = *pupper - incr; // let compiler bypass the illegal loop (like for(i=1;i<10;i--)) THIS LINE CAUSED shape2F/h_tests_1.f TO HAVE A FAILURE ON A ZERO-TRIP LOOP (lower=1,\ @@ -149,7 +153,8 @@ __kmp_for_static_init( /* determine if "for" loop is an active worksharing construct */ if ( team -> t.t_serialized ) { /* serialized parallel, each thread executes whole iteration space */ - *plastiter = TRUE; + if( plastiter != NULL ) + *plastiter = TRUE; /* leave pupper and plower set to entire iteration space */ *pstride = (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1)); @@ -169,8 +174,9 @@ __kmp_for_static_init( } nth = team->t.t_nproc; if ( nth == 1 ) { - *plastiter = TRUE; - + if( plastiter != NULL ) + *plastiter = TRUE; + *pstride = (incr > 0) ? (*pupper - *plower + 1) : (-(*plower - *pupper + 1)); #ifdef KMP_DEBUG { const char * buff; @@ -192,12 +198,13 @@ __kmp_for_static_init( } else if (incr == -1) { trip_count = *plower - *pupper + 1; } else { - if ( incr > 1 ) { + if ( incr > 1 ) { // the check is needed for unsigned division when incr < 0 trip_count = (*pupper - *plower) / incr + 1; } else { trip_count = (*plower - *pupper) / ( -incr ) + 1; } } + if ( __kmp_env_consistency_check ) { /* tripcount overflow? */ if ( trip_count == 0 && *pupper != *plower ) { @@ -219,14 +226,16 @@ __kmp_for_static_init( } else { *plower = *pupper + incr; } - *plastiter = ( tid == trip_count - 1 ); + if( plastiter != NULL ) + *plastiter = ( tid == trip_count - 1 ); } else { if ( __kmp_static == kmp_sch_static_balanced ) { register UT small_chunk = trip_count / nth; register UT extras = trip_count % nth; *plower += incr * ( tid * small_chunk + ( tid < extras ? tid : extras ) ); *pupper = *plower + small_chunk * incr - ( tid < extras ? 0 : incr ); - *plastiter = ( tid == nth - 1 ); + if( plastiter != NULL ) + *plastiter = ( tid == nth - 1 ); } else { register T big_chunk_inc_count = ( trip_count/nth + ( ( trip_count % nth ) ? 1 : 0) ) * incr; @@ -238,16 +247,16 @@ __kmp_for_static_init( *plower += tid * big_chunk_inc_count; *pupper = *plower + big_chunk_inc_count - incr; if ( incr > 0 ) { - if ( *pupper < *plower ) { + if( *pupper < *plower ) *pupper = i_maxmin< T >::mx; - } - *plastiter = *plower <= old_upper && *pupper > old_upper - incr; + if( plastiter != NULL ) + *plastiter = *plower <= old_upper && *pupper > old_upper - incr; if ( *pupper > old_upper ) *pupper = old_upper; // tracker C73258 } else { - if ( *pupper > *plower ) { + if( *pupper > *plower ) *pupper = i_maxmin< T >::mn; - } - *plastiter = *plower >= old_upper && *pupper < old_upper - incr; + if( plastiter != NULL ) + *plastiter = *plower >= old_upper && *pupper < old_upper - incr; if ( *pupper < old_upper ) *pupper = old_upper; // tracker C73258 } } @@ -256,7 +265,7 @@ __kmp_for_static_init( } case kmp_sch_static_chunked: { - register T span; + register ST span; if ( chunk < 1 ) { chunk = 1; } @@ -264,11 +273,8 @@ __kmp_for_static_init( *pstride = span * nth; *plower = *plower + (span * tid); *pupper = *plower + span - incr; - /* TODO: is the following line a bug? Shouldn't it be plastiter instead of *plastiter ? 
*/ - if (*plastiter) { /* only calculate this if it was requested */ - kmp_int32 lasttid = ((trip_count - 1) / ( UT )chunk) % nth; - *plastiter = (tid == lasttid); - } + if( plastiter != NULL ) + *plastiter = (tid == ((trip_count - 1)/( UT )chunk) % nth); break; } default: @@ -276,6 +282,18 @@ __kmp_for_static_init( break; } +#if USE_ITT_BUILD + // Report loop metadata + if ( KMP_MASTER_TID(tid) && __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { + kmp_uint64 cur_chunk = chunk; + // Calculate chunk in case it was not specified; it is specified for kmp_sch_static_chunked + if ( schedtype == kmp_sch_static ) { + cur_chunk = trip_count / nth + ( ( trip_count % nth ) ? 1 : 0); + } + // 0 - "static" schedule + __kmp_itt_metadata_loop(loc, 0, trip_count, cur_chunk); + } +#endif #ifdef KMP_DEBUG { const char * buff; @@ -291,6 +309,355 @@ __kmp_for_static_init( return; } +template< typename T > +static void +__kmp_dist_for_static_init( + ident_t *loc, + kmp_int32 gtid, + kmp_int32 schedule, + kmp_int32 *plastiter, + T *plower, + T *pupper, + T *pupperDist, + typename traits_t< T >::signed_t *pstride, + typename traits_t< T >::signed_t incr, + typename traits_t< T >::signed_t chunk +) { + KMP_COUNT_BLOCK(OMP_DISTR_FOR_static); + typedef typename traits_t< T >::unsigned_t UT; + typedef typename traits_t< T >::signed_t ST; + register kmp_uint32 tid; + register kmp_uint32 nth; + register kmp_uint32 team_id; + register kmp_uint32 nteams; + register UT trip_count; + register kmp_team_t *team; + kmp_info_t * th; + + KMP_DEBUG_ASSERT( plastiter && plower && pupper && pupperDist && pstride ); + KE_TRACE( 10, ("__kmpc_dist_for_static_init called (%d)\n", gtid)); + #ifdef KMP_DEBUG + { + const char * buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmpc_dist_for_static_init: T#%%d schedLoop=%%d liter=%%d "\ + "iter=(%%%s, %%%s, %%%s) chunk=%%%s signed?<%s>\n", + traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, + traits_t< ST >::spec, traits_t< T >::spec ); + KD_TRACE(100, ( buff, gtid, schedule, *plastiter, + *plower, *pupper, incr, chunk ) ); + __kmp_str_free( &buff ); + } + #endif + + if( __kmp_env_consistency_check ) { + __kmp_push_workshare( gtid, ct_pdo, loc ); + if( incr == 0 ) { + __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); + } + if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) { + // The loop is illegal. 
+ // Some zero-trip loops maintained by compiler, e.g.: + // for(i=10;i<0;++i) // lower >= upper - run-time check + // for(i=0;i>10;--i) // lower <= upper - run-time check + // for(i=0;i>10;++i) // incr > 0 - compile-time check + // for(i=10;i<0;--i) // incr < 0 - compile-time check + // Compiler does not check the following illegal loops: + // for(i=0;i<10;i+=incr) // where incr<0 + // for(i=10;i>0;i-=incr) // where incr<0 + __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); + } + } + tid = __kmp_tid_from_gtid( gtid ); + th = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct + nth = th->th.th_team_nproc; + team = th->th.th_team; + #if OMP_40_ENABLED + nteams = th->th.th_teams_size.nteams; + #endif + team_id = team->t.t_master_tid; + KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); + + // compute global trip count + if( incr == 1 ) { + trip_count = *pupper - *plower + 1; + } else if(incr == -1) { + trip_count = *plower - *pupper + 1; + } else { + trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case + } + *pstride = *pupper - *plower; // just in case (can be unused) + if( trip_count <= nteams ) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || \ + __kmp_static == kmp_sch_static_balanced + ); // Unknown static scheduling type. + // only masters of some teams get single iteration, other threads get nothing + if( team_id < trip_count && tid == 0 ) { + *pupper = *pupperDist = *plower = *plower + team_id * incr; + } else { + *pupperDist = *pupper; + *plower = *pupper + incr; // compiler should skip loop body + } + if( plastiter != NULL ) + *plastiter = ( tid == 0 && team_id == trip_count - 1 ); + } else { + // Get the team's chunk first (each team gets at most one chunk) + if( __kmp_static == kmp_sch_static_balanced ) { + register UT chunkD = trip_count / nteams; + register UT extras = trip_count % nteams; + *plower += incr * ( team_id * chunkD + ( team_id < extras ? team_id : extras ) ); + *pupperDist = *plower + chunkD * incr - ( team_id < extras ? 0 : incr ); + if( plastiter != NULL ) + *plastiter = ( team_id == nteams - 1 ); + } else { + register T chunk_inc_count = + ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr; + register T upper = *pupper; + KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); + // Unknown static scheduling type. 
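The kmp_sch_static_balanced branch just above splits the distribute trip count so that each of the nteams teams gets a contiguous block of trip_count / nteams iterations, with the first trip_count % nteams teams taking one extra. A standalone sketch of that arithmetic (the helper name and plain long types are illustrative only; the runtime templates this over signed/unsigned 32- and 64-bit types):

    /* Illustration of the balanced team-level split used by
       __kmp_dist_for_static_init: base chunk plus one extra iteration
       for the first (trip_count % nteams) teams. */
    static void team_bounds(long lower, long trip_count, long incr,
                            long team_id, long nteams,
                            long *t_lower, long *t_upper)
    {
        long chunk  = trip_count / nteams;        /* base iterations per team */
        long extras = trip_count % nteams;        /* teams that get one extra */
        *t_lower = lower + incr * (team_id * chunk +
                                   (team_id < extras ? team_id : extras));
        *t_upper = *t_lower + chunk * incr -
                   (team_id < extras ? 0 : incr); /* inclusive upper bound    */
    }

For 10 iterations over 3 teams with incr = 1 this yields ranges of 4, 3 and 3 iterations (0-3, 4-6, 7-9); the thread-level kmp_sch_static split is then applied inside each team's range.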
+ *plower += team_id * chunk_inc_count; + *pupperDist = *plower + chunk_inc_count - incr; + // Check/correct bounds if needed + if( incr > 0 ) { + if( *pupperDist < *plower ) + *pupperDist = i_maxmin< T >::mx; + if( plastiter != NULL ) + *plastiter = *plower <= upper && *pupperDist > upper - incr; + if( *pupperDist > upper ) + *pupperDist = upper; // tracker C73258 + if( *plower > *pupperDist ) { + *pupper = *pupperDist; // no iterations available for the team + goto end; + } + } else { + if( *pupperDist > *plower ) + *pupperDist = i_maxmin< T >::mn; + if( plastiter != NULL ) + *plastiter = *plower >= upper && *pupperDist < upper - incr; + if( *pupperDist < upper ) + *pupperDist = upper; // tracker C73258 + if( *plower < *pupperDist ) { + *pupper = *pupperDist; // no iterations available for the team + goto end; + } + } + } + // Get the parallel loop chunk now (for thread) + // compute trip count for team's chunk + if( incr == 1 ) { + trip_count = *pupperDist - *plower + 1; + } else if(incr == -1) { + trip_count = *plower - *pupperDist + 1; + } else { + trip_count = (ST)(*pupperDist - *plower) / incr + 1; + } + KMP_DEBUG_ASSERT( trip_count ); + switch( schedule ) { + case kmp_sch_static: + { + if( trip_count <= nth ) { + KMP_DEBUG_ASSERT( + __kmp_static == kmp_sch_static_greedy || \ + __kmp_static == kmp_sch_static_balanced + ); // Unknown static scheduling type. + if( tid < trip_count ) + *pupper = *plower = *plower + tid * incr; + else + *plower = *pupper + incr; // no iterations available + if( plastiter != NULL ) + if( *plastiter != 0 && !( tid == trip_count - 1 ) ) + *plastiter = 0; + } else { + if( __kmp_static == kmp_sch_static_balanced ) { + register UT chunkL = trip_count / nth; + register UT extras = trip_count % nth; + *plower += incr * (tid * chunkL + (tid < extras ? tid : extras)); + *pupper = *plower + chunkL * incr - (tid < extras ? 0 : incr); + if( plastiter != NULL ) + if( *plastiter != 0 && !( tid == nth - 1 ) ) + *plastiter = 0; + } else { + register T chunk_inc_count = + ( trip_count / nth + ( ( trip_count % nth ) ? 1 : 0) ) * incr; + register T upper = *pupperDist; + KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy ); + // Unknown static scheduling type. 
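The kmp_sch_static_greedy branches here (team level above, thread level below) instead offer every team or thread a ceiling-sized contiguous block and then clamp the result, so trailing workers may receive a short or empty range. A simplified sketch (helper name and long types are mine; the i_maxmin<> overflow guard from the real code is omitted):

    static void greedy_bounds(long lower, long upper, long trip_count,
                              long incr, long tid, long nth,
                              long *t_lower, long *t_upper)
    {
        /* every worker is offered ceil(trip_count / nth) consecutive iterations */
        long span = (trip_count / nth + (trip_count % nth ? 1 : 0)) * incr;
        *t_lower = lower + tid * span;
        *t_upper = *t_lower + span - incr;
        /* clamp to the real loop end (cf. tracker C73258); a worker whose
           lower bound lands past 'upper' simply gets no iterations */
        if (incr > 0 && *t_upper > upper) *t_upper = upper;
        if (incr < 0 && *t_upper < upper) *t_upper = upper;
    }

With 10 iterations, incr = 1 and 4 workers, span is 3 and the ranges come out as 0-2, 3-5, 6-8 and 9-9; with 6 workers the last worker's range is empty.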
+ *plower += tid * chunk_inc_count; + *pupper = *plower + chunk_inc_count - incr; + if( incr > 0 ) { + if( *pupper < *plower ) + *pupper = i_maxmin< T >::mx; + if( plastiter != NULL ) + if( *plastiter != 0 && !(*plower <= upper && *pupper > upper - incr) ) + *plastiter = 0; + if( *pupper > upper ) + *pupper = upper;//tracker C73258 + } else { + if( *pupper > *plower ) + *pupper = i_maxmin< T >::mn; + if( plastiter != NULL ) + if( *plastiter != 0 && !(*plower >= upper && *pupper < upper - incr) ) + *plastiter = 0; + if( *pupper < upper ) + *pupper = upper;//tracker C73258 + } + } + } + break; + } + case kmp_sch_static_chunked: + { + register ST span; + if( chunk < 1 ) + chunk = 1; + span = chunk * incr; + *pstride = span * nth; + *plower = *plower + (span * tid); + *pupper = *plower + span - incr; + if( plastiter != NULL ) + if( *plastiter != 0 && !(tid == ((trip_count - 1) / ( UT )chunk) % nth) ) + *plastiter = 0; + break; + } + default: + KMP_ASSERT2( 0, "__kmpc_dist_for_static_init: unknown loop scheduling type" ); + break; + } + } + end:; + #ifdef KMP_DEBUG + { + const char * buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmpc_dist_for_static_init: last=%%d lo=%%%s up=%%%s upDist=%%%s "\ + "stride=%%%s signed?<%s>\n", + traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec, + traits_t< ST >::spec, traits_t< T >::spec ); + KD_TRACE(100, ( buff, *plastiter, *plower, *pupper, *pupperDist, *pstride ) ); + __kmp_str_free( &buff ); + } + #endif + KE_TRACE( 10, ("__kmpc_dist_for_static_init: T#%d return\n", gtid ) ); + return; +} + +template< typename T > +static void +__kmp_team_static_init( + ident_t *loc, + kmp_int32 gtid, + kmp_int32 *p_last, + T *p_lb, + T *p_ub, + typename traits_t< T >::signed_t *p_st, + typename traits_t< T >::signed_t incr, + typename traits_t< T >::signed_t chunk +) { + // The routine returns the first chunk distributed to the team and + // stride for next chunks calculation. + // Last iteration flag set for the team that will execute + // the last iteration of the loop. + // The routine is called for dist_schedue(static,chunk) only. + typedef typename traits_t< T >::unsigned_t UT; + typedef typename traits_t< T >::signed_t ST; + kmp_uint32 team_id; + kmp_uint32 nteams; + UT trip_count; + T lower; + T upper; + ST span; + kmp_team_t *team; + kmp_info_t *th; + + KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); + KE_TRACE( 10, ("__kmp_team_static_init called (%d)\n", gtid)); + #ifdef KMP_DEBUG + { + const char * buff; + // create format specifiers before the debug output + buff = __kmp_str_format( "__kmp_team_static_init enter: T#%%d liter=%%d "\ + "iter=(%%%s, %%%s, %%%s) chunk %%%s; signed?<%s>\n", + traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, + traits_t< ST >::spec, traits_t< T >::spec ); + KD_TRACE(100, ( buff, gtid, *p_last, *p_lb, *p_ub, *p_st, chunk ) ); + __kmp_str_free( &buff ); + } + #endif + + lower = *p_lb; + upper = *p_ub; + if( __kmp_env_consistency_check ) { + if( incr == 0 ) { + __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc ); + } + if( incr > 0 ? (upper < lower) : (lower < upper) ) { + // The loop is illegal. 
+ // Some zero-trip loops maintained by compiler, e.g.: + // for(i=10;i<0;++i) // lower >= upper - run-time check + // for(i=0;i>10;--i) // lower <= upper - run-time check + // for(i=0;i>10;++i) // incr > 0 - compile-time check + // for(i=10;i<0;--i) // incr < 0 - compile-time check + // Compiler does not check the following illegal loops: + // for(i=0;i<10;i+=incr) // where incr<0 + // for(i=10;i>0;i-=incr) // where incr<0 + __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc ); + } + } + th = __kmp_threads[gtid]; + KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct + team = th->th.th_team; + #if OMP_40_ENABLED + nteams = th->th.th_teams_size.nteams; + #endif + team_id = team->t.t_master_tid; + KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc); + + // compute trip count + if( incr == 1 ) { + trip_count = upper - lower + 1; + } else if(incr == -1) { + trip_count = lower - upper + 1; + } else { + trip_count = (ST)(upper - lower) / incr + 1; // cast to signed to cover incr<0 case + } + if( chunk < 1 ) + chunk = 1; + span = chunk * incr; + *p_st = span * nteams; + *p_lb = lower + (span * team_id); + *p_ub = *p_lb + span - incr; + if ( p_last != NULL ) + *p_last = (team_id == ((trip_count - 1)/(UT)chunk) % nteams); + // Correct upper bound if needed + if( incr > 0 ) { + if( *p_ub < *p_lb ) // overflow? + *p_ub = i_maxmin< T >::mx; + if( *p_ub > upper ) + *p_ub = upper; // tracker C73258 + } else { // incr < 0 + if( *p_ub > *p_lb ) + *p_ub = i_maxmin< T >::mn; + if( *p_ub < upper ) + *p_ub = upper; // tracker C73258 + } + #ifdef KMP_DEBUG + { + const char * buff; + // create format specifiers before the debug output + buff = __kmp_str_format( "__kmp_team_static_init exit: T#%%d team%%u liter=%%d "\ + "iter=(%%%s, %%%s, %%%s) chunk %%%s\n", + traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec, + traits_t< ST >::spec ); + KD_TRACE(100, ( buff, gtid, team_id, *p_last, *p_lb, *p_ub, *p_st, chunk ) ); + __kmp_str_free( &buff ); + } + #endif +} + //-------------------------------------------------------------------------------------- extern "C" { @@ -310,7 +677,7 @@ Each of the four functions here are identical apart from the argument types. The functions compute the upper and lower bounds and stride to be used for the set of iterations to be executed by the current thread from the statically scheduled loop that is described by the -initial values of the bround, stride, increment and chunk size. +initial values of the bounds, stride, increment and chunk size. @{ */ @@ -362,5 +729,155 @@ __kmpc_for_static_init_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, km @} */ +/*! +@ingroup WORK_SHARING +@param loc Source code location +@param gtid Global thread id of this thread +@param scheduleD Scheduling type for the distribute +@param scheduleL Scheduling type for the parallel loop +@param plastiter Pointer to the "last iteration" flag +@param plower Pointer to the lower bound +@param pupper Pointer to the upper bound of loop chunk +@param pupperD Pointer to the upper bound of dist_chunk +@param pstrideD Pointer to the stride for distribute +@param pstrideL Pointer to the stride for parallel loop +@param incr Loop increment +@param chunkD The chunk size for the distribute +@param chunkL The chunk size for the parallel loop + +Each of the four functions here are identical apart from the argument types. 
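__kmp_team_static_init, added above, handles dist_schedule(static, chunk): chunks are dealt round-robin to the teams, so a team's first chunk starts at its team_id-th chunk and its later chunks are a stride of chunk * nteams iterations apart; the team owning the final chunk, ((trip_count - 1) / chunk) % nteams, receives the last-iteration flag. A minimal sketch of the first-chunk/stride computation (names and plain long types are illustrative only):

    static void team_chunk(long lower, long incr, long chunk,
                           long team_id, long nteams,
                           long *lb, long *ub, long *st)
    {
        long span = chunk * incr;
        *st = span * nteams;           /* distance between two chunks of one team */
        *lb = lower + span * team_id;  /* first chunk handed to this team         */
        *ub = *lb + span - incr;       /* inclusive upper bound of that chunk     */
    }

With chunk = 2, incr = 1 and 3 teams, team 1 starts on [2,3] and then advances to [8,9], [14,15], and so on; the caller still clamps *ub to the loop's real upper bound as the hunk above does.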
+ +The functions compute the upper and lower bounds and strides to be used for the set of iterations +to be executed by the current thread from the statically scheduled loop that is described by the +initial values of the bounds, strides, increment and chunks for parallel loop and distribute +constructs. + +@{ +*/ +void +__kmpc_dist_for_static_init_4( + ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter, + kmp_int32 *plower, kmp_int32 *pupper, kmp_int32 *pupperD, + kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk ) +{ + __kmp_dist_for_static_init< kmp_int32 >( + loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk ); +} + +/*! + See @ref __kmpc_dist_for_static_init_4 + */ +void +__kmpc_dist_for_static_init_4u( + ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, kmp_uint32 *pupperD, + kmp_int32 *pstride, kmp_int32 incr, kmp_int32 chunk ) +{ + __kmp_dist_for_static_init< kmp_uint32 >( + loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk ); +} + +/*! + See @ref __kmpc_dist_for_static_init_4 + */ +void +__kmpc_dist_for_static_init_8( + ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter, + kmp_int64 *plower, kmp_int64 *pupper, kmp_int64 *pupperD, + kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk ) +{ + __kmp_dist_for_static_init< kmp_int64 >( + loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk ); +} + +/*! + See @ref __kmpc_dist_for_static_init_4 + */ +void +__kmpc_dist_for_static_init_8u( + ident_t *loc, kmp_int32 gtid, kmp_int32 schedule, kmp_int32 *plastiter, + kmp_uint64 *plower, kmp_uint64 *pupper, kmp_uint64 *pupperD, + kmp_int64 *pstride, kmp_int64 incr, kmp_int64 chunk ) +{ + __kmp_dist_for_static_init< kmp_uint64 >( + loc, gtid, schedule, plastiter, plower, pupper, pupperD, pstride, incr, chunk ); +} +/*! +@} +*/ + +//----------------------------------------------------------------------------------------- +// Auxiliary routines for Distribute Parallel Loop construct implementation +// Transfer call to template< type T > +// __kmp_team_static_init( ident_t *loc, int gtid, +// int *p_last, T *lb, T *ub, ST *st, ST incr, ST chunk ) + +/*! +@ingroup WORK_SHARING +@{ +@param loc Source location +@param gtid Global thread id +@param p_last pointer to last iteration flag +@param p_lb pointer to Lower bound +@param p_ub pointer to Upper bound +@param p_st Step (or increment if you prefer) +@param incr Loop increment +@param chunk The chunk size to block with + +The functions compute the upper and lower bounds and stride to be used for the set of iterations +to be executed by the current team from the statically scheduled loop that is described by the +initial values of the bounds, stride, increment and chunk for the distribute construct as part of +composite distribute parallel loop construct. +These functions are all identical apart from the types of the arguments. +*/ + +void +__kmpc_team_static_init_4( + ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st, kmp_int32 incr, kmp_int32 chunk ) +{ + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_team_static_init< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk ); +} + +/*! 
+ See @ref __kmpc_team_static_init_4 + */ +void +__kmpc_team_static_init_4u( + ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st, kmp_int32 incr, kmp_int32 chunk ) +{ + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_team_static_init< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk ); +} + +/*! + See @ref __kmpc_team_static_init_4 + */ +void +__kmpc_team_static_init_8( + ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st, kmp_int64 incr, kmp_int64 chunk ) +{ + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_team_static_init< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk ); +} + +/*! + See @ref __kmpc_team_static_init_4 + */ +void +__kmpc_team_static_init_8u( + ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, + kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st, kmp_int64 incr, kmp_int64 chunk ) +{ + KMP_DEBUG_ASSERT( __kmp_init_serial ); + __kmp_team_static_init< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st, incr, chunk ); +} +/*! +@} +*/ + } // extern "C" diff --git a/openmp/runtime/src/kmp_settings.c b/openmp/runtime/src/kmp_settings.c index b85678e28fc..016e5a35859 100644 --- a/openmp/runtime/src/kmp_settings.c +++ b/openmp/runtime/src/kmp_settings.c @@ -1,7 +1,7 @@ /* * kmp_settings.c -- Initialize environment variables - * $Revision: 42816 $ - * $Date: 2013-11-11 15:33:37 -0600 (Mon, 11 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -534,9 +534,9 @@ __kmp_stg_parse_file( * out = __kmp_str_format( "%s", buffer ); } // __kmp_stg_parse_file +#ifdef KMP_DEBUG static char * par_range_to_print = NULL; -#ifdef KMP_DEBUG static void __kmp_stg_parse_par_range( char const * name, @@ -945,6 +945,26 @@ __kmp_stg_print_settings( kmp_str_buf_t * buffer, char const * name, void * data } // __kmp_stg_print_settings // ------------------------------------------------------------------------------------------------- +// KMP_STACKPAD +// ------------------------------------------------------------------------------------------------- + +static void +__kmp_stg_parse_stackpad( char const * name, char const * value, void * data ) { + __kmp_stg_parse_int( + name, // Env var name + value, // Env var value + KMP_MIN_STKPADDING, // Min value + KMP_MAX_STKPADDING, // Max value + & __kmp_stkpadding // Var to initialize + ); +} // __kmp_stg_parse_stackpad + +static void +__kmp_stg_print_stackpad( kmp_str_buf_t * buffer, char const * name, void * data ) { + __kmp_stg_print_int( buffer, name, __kmp_stkpadding ); +} // __kmp_stg_print_stackpad + +// ------------------------------------------------------------------------------------------------- // KMP_STACKOFFSET // ------------------------------------------------------------------------------------------------- @@ -1229,7 +1249,6 @@ __kmp_stg_print_num_threads( kmp_str_buf_t * buffer, char const * name, void * d // OpenMP 3.0: KMP_TASKING, OMP_MAX_ACTIVE_LEVELS, // ------------------------------------------------------------------------------------------------- -#if OMP_30_ENABLED static void __kmp_stg_parse_tasking( char const * name, char const * value, void * data ) { __kmp_stg_parse_int( name, value, 0, (int)tskm_max, (int *)&__kmp_tasking_mode ); @@ -1259,7 +1278,41 @@ static void __kmp_stg_print_max_active_levels( kmp_str_buf_t * buffer, char const * name, void * data ) { __kmp_stg_print_int( buffer, name, __kmp_dflt_max_active_levels ); } // 
__kmp_stg_print_max_active_levels -#endif // OMP_30_ENABLED + +#if KMP_NESTED_HOT_TEAMS +// ------------------------------------------------------------------------------------------------- +// KMP_HOT_TEAMS_MAX_LEVEL, KMP_HOT_TEAMS_MODE +// ------------------------------------------------------------------------------------------------- + +static void +__kmp_stg_parse_hot_teams_level( char const * name, char const * value, void * data ) { + if ( TCR_4(__kmp_init_parallel) ) { + KMP_WARNING( EnvParallelWarn, name ); + return; + } // read value before first parallel only + __kmp_stg_parse_int( name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, & __kmp_hot_teams_max_level ); +} // __kmp_stg_parse_hot_teams_level + +static void +__kmp_stg_print_hot_teams_level( kmp_str_buf_t * buffer, char const * name, void * data ) { + __kmp_stg_print_int( buffer, name, __kmp_hot_teams_max_level ); +} // __kmp_stg_print_hot_teams_level + +static void +__kmp_stg_parse_hot_teams_mode( char const * name, char const * value, void * data ) { + if ( TCR_4(__kmp_init_parallel) ) { + KMP_WARNING( EnvParallelWarn, name ); + return; + } // read value before first parallel only + __kmp_stg_parse_int( name, value, 0, KMP_MAX_ACTIVE_LEVELS_LIMIT, & __kmp_hot_teams_mode ); +} // __kmp_stg_parse_hot_teams_mode + +static void +__kmp_stg_print_hot_teams_mode( kmp_str_buf_t * buffer, char const * name, void * data ) { + __kmp_stg_print_int( buffer, name, __kmp_hot_teams_mode ); +} // __kmp_stg_print_hot_teams_mode + +#endif // KMP_NESTED_HOT_TEAMS // ------------------------------------------------------------------------------------------------- // KMP_HANDLE_SIGNALS @@ -1438,12 +1491,10 @@ __kmp_stg_parse_barrier_branch_bit( char const * name, char const * value, void const char *var; /* ---------- Barrier branch bit control ------------ */ - for ( int i=bs_plain_barrier; i<bs_last_barrier; i++ ) { var = __kmp_barrier_branch_bit_env_name[ i ]; - if ( ( strcmp( var, name) == 0 ) && ( value != 0 ) ) { - char *comma; + char *comma; comma = (char *) strchr( value, ',' ); __kmp_barrier_gather_branch_bits[ i ] = ( kmp_uint32 ) __kmp_str_to_int( value, ',' ); @@ -1455,7 +1506,6 @@ __kmp_stg_parse_barrier_branch_bit( char const * name, char const * value, void if ( __kmp_barrier_release_branch_bits[ i ] > KMP_MAX_BRANCH_BITS ) { __kmp_msg( kmp_ms_warning, KMP_MSG( BarrReleaseValueInvalid, name, comma + 1 ), __kmp_msg_null ); - __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt; } } @@ -2037,11 +2087,6 @@ __kmp_parse_affinity_env( char const * name, char const * value, # if OMP_40_ENABLED KMP_DEBUG_ASSERT( ( __kmp_nested_proc_bind.bind_types != NULL ) && ( __kmp_nested_proc_bind.used > 0 ) ); - if ( ( __kmp_affinity_notype != NULL ) - && ( ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) - || ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel ) ) ) { - type = TRUE; - } # endif while ( *buf != '\0' ) { @@ -2049,29 +2094,53 @@ __kmp_parse_affinity_env( char const * name, char const * value, if (__kmp_match_str("none", buf, (const char **)&next)) { set_type( affinity_none ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; +# endif buf = next; } else if (__kmp_match_str("scatter", buf, (const char **)&next)) { set_type( affinity_scatter ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif buf = next; } else if (__kmp_match_str("compact", buf, (const char **)&next)) { set_type( affinity_compact ); +# if OMP_40_ENABLED + 
__kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif buf = next; } else if (__kmp_match_str("logical", buf, (const char **)&next)) { set_type( affinity_logical ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif buf = next; } else if (__kmp_match_str("physical", buf, (const char **)&next)) { set_type( affinity_physical ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif buf = next; } else if (__kmp_match_str("explicit", buf, (const char **)&next)) { set_type( affinity_explicit ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif buf = next; # if KMP_MIC } else if (__kmp_match_str("balanced", buf, (const char **)&next)) { set_type( affinity_balanced ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif buf = next; # endif } else if (__kmp_match_str("disabled", buf, (const char **)&next)) { set_type( affinity_disabled ); +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; +# endif buf = next; } else if (__kmp_match_str("verbose", buf, (const char **)&next)) { set_verbose( TRUE ); @@ -2451,6 +2520,9 @@ __kmp_stg_parse_gomp_cpu_affinity( char const * name, char const * value, void * __kmp_affinity_proclist = temp_proclist; __kmp_affinity_type = affinity_explicit; __kmp_affinity_gran = affinity_gran_fine; +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif } else { KMP_WARNING( AffSyntaxError, name ); @@ -2772,6 +2844,21 @@ __kmp_stg_parse_places( char const * name, char const * value, void * data ) const char *scan = value; const char *next = scan; const char *kind = "\"threads\""; + kmp_setting_t **rivals = (kmp_setting_t **) data; + int rc; + + rc = __kmp_stg_check_rivals( name, value, rivals ); + if ( rc ) { + return; + } + + // + // If OMP_PROC_BIND is not specified but OMP_PLACES is, + // then let OMP_PROC_BIND default to true. + // + if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; + } //__kmp_affinity_num_places = 0; @@ -2805,10 +2892,17 @@ __kmp_stg_parse_places( char const * name, char const * value, void * data ) __kmp_affinity_type = affinity_explicit; __kmp_affinity_gran = affinity_gran_fine; __kmp_affinity_dups = FALSE; + if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; + } } return; } + if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) { + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; + } + SKIP_WS(scan); if ( *scan == '\0' ) { return; @@ -2855,8 +2949,7 @@ __kmp_stg_print_places( kmp_str_buf_t * buffer, char const * name, } if ( ( __kmp_nested_proc_bind.used == 0 ) || ( __kmp_nested_proc_bind.bind_types == NULL ) - || ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_false ) - || ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel ) ) { + || ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_false ) ) { __kmp_str_buf_print( buffer, ": %s\n", KMP_I18N_STR( NotDefined ) ); } else if ( __kmp_affinity_type == affinity_explicit ) { @@ -2913,7 +3006,7 @@ __kmp_stg_print_places( kmp_str_buf_t * buffer, char const * name, # endif /* OMP_40_ENABLED */ -# if OMP_30_ENABLED && (! OMP_40_ENABLED) +# if (! 
OMP_40_ENABLED) static void __kmp_stg_parse_proc_bind( char const * name, char const * value, void * data ) @@ -2943,7 +3036,7 @@ __kmp_stg_parse_proc_bind( char const * name, char const * value, void * data ) } } // __kmp_parse_proc_bind -# endif /* if OMP_30_ENABLED && (! OMP_40_ENABLED) */ +# endif /* if (! OMP_40_ENABLED) */ static void @@ -3132,11 +3225,7 @@ __kmp_stg_parse_proc_bind( char const * name, char const * value, void * data ) buf = next; SKIP_WS( buf ); __kmp_nested_proc_bind.used = 1; - - // - // "true" currently maps to "spread" - // - __kmp_nested_proc_bind.bind_types[0] = proc_bind_spread; + __kmp_nested_proc_bind.bind_types[0] = proc_bind_true; } else { // @@ -3454,7 +3543,7 @@ __kmp_stg_parse_schedule( char const * name, char const * value, void * data ) { KMP_WARNING( InvalidClause, name, value ); } else KMP_WARNING( EmptyClause, name ); - } while ( value = semicolon ? semicolon + 1 : NULL ); + } while ( (value = semicolon ? semicolon + 1 : NULL) ); } }; // if @@ -3499,7 +3588,6 @@ __kmp_stg_parse_omp_schedule( char const * name, char const * value, void * data else if (!__kmp_strcasecmp_with_sentinel("guided", value, ',')) /* GUIDED */ __kmp_sched = kmp_sch_guided_chunked; // AC: TODO: add AUTO schedule, and pprobably remove TRAPEZOIDAL (OMP 3.0 does not allow it) - #if OMP_30_ENABLED else if (!__kmp_strcasecmp_with_sentinel("auto", value, ',')) { /* AUTO */ __kmp_sched = kmp_sch_auto; if( comma ) { @@ -3507,7 +3595,6 @@ __kmp_stg_parse_omp_schedule( char const * name, char const * value, void * data comma = NULL; } } - #endif // OMP_30_ENABLED else if (!__kmp_strcasecmp_with_sentinel("trapezoidal", value, ',')) /* TRAPEZOIDAL */ __kmp_sched = kmp_sch_trapezoidal; else if (!__kmp_strcasecmp_with_sentinel("static", value, ',')) /* STATIC */ @@ -4016,7 +4103,7 @@ __kmp_stg_parse_adaptive_lock_props( const char *name, const char *value, void * break; } // Next character is not an integer or not a comma OR number of values > 2 => end of list - if ( ( ( *next < '0' ) || ( *next > '9' ) ) && ( *next !=',') || ( total > 2 ) ) { + if ( ( ( *next < '0' || *next > '9' ) && *next !=',' ) || total > 2 ) { KMP_WARNING( EnvSyntaxError, name, value ); return; } @@ -4314,6 +4401,10 @@ __kmp_stg_print_omp_display_env( kmp_str_buf_t * buffer, char const * name, void static void __kmp_stg_parse_omp_cancellation( char const * name, char const * value, void * data ) { + if ( TCR_4(__kmp_init_parallel) ) { + KMP_WARNING( EnvParallelWarn, name ); + return; + } // read value before first parallel only __kmp_stg_parse_bool( name, value, & __kmp_omp_cancellation ); } // __kmp_stg_parse_omp_cancellation @@ -4340,6 +4431,7 @@ static kmp_setting_t __kmp_stg_table[] = { { "KMP_SETTINGS", __kmp_stg_parse_settings, __kmp_stg_print_settings, NULL, 0, 0 }, { "KMP_STACKOFFSET", __kmp_stg_parse_stackoffset, __kmp_stg_print_stackoffset, NULL, 0, 0 }, { "KMP_STACKSIZE", __kmp_stg_parse_stacksize, __kmp_stg_print_stacksize, NULL, 0, 0 }, + { "KMP_STACKPAD", __kmp_stg_parse_stackpad, __kmp_stg_print_stackpad, NULL, 0, 0 }, { "KMP_VERSION", __kmp_stg_parse_version, __kmp_stg_print_version, NULL, 0, 0 }, { "KMP_WARNINGS", __kmp_stg_parse_warnings, __kmp_stg_print_warnings, NULL, 0, 0 }, @@ -4347,13 +4439,15 @@ static kmp_setting_t __kmp_stg_table[] = { { "OMP_NUM_THREADS", __kmp_stg_parse_num_threads, __kmp_stg_print_num_threads, NULL, 0, 0 }, { "OMP_STACKSIZE", __kmp_stg_parse_stacksize, __kmp_stg_print_stacksize, NULL, 0, 0 }, -#if OMP_30_ENABLED { "KMP_TASKING", __kmp_stg_parse_tasking, 
__kmp_stg_print_tasking, NULL, 0, 0 }, { "KMP_TASK_STEALING_CONSTRAINT", __kmp_stg_parse_task_stealing, __kmp_stg_print_task_stealing, NULL, 0, 0 }, { "OMP_MAX_ACTIVE_LEVELS", __kmp_stg_parse_max_active_levels, __kmp_stg_print_max_active_levels, NULL, 0, 0 }, { "OMP_THREAD_LIMIT", __kmp_stg_parse_all_threads, __kmp_stg_print_all_threads, NULL, 0, 0 }, { "OMP_WAIT_POLICY", __kmp_stg_parse_wait_policy, __kmp_stg_print_wait_policy, NULL, 0, 0 }, -#endif // OMP_30_ENABLED +#if KMP_NESTED_HOT_TEAMS + { "KMP_HOT_TEAMS_MAX_LEVEL", __kmp_stg_parse_hot_teams_level, __kmp_stg_print_hot_teams_level, NULL, 0, 0 }, + { "KMP_HOT_TEAMS_MODE", __kmp_stg_parse_hot_teams_mode, __kmp_stg_print_hot_teams_mode, NULL, 0, 0 }, +#endif // KMP_NESTED_HOT_TEAMS #if KMP_HANDLE_SIGNALS { "KMP_HANDLE_SIGNALS", __kmp_stg_parse_handle_signals, __kmp_stg_print_handle_signals, NULL, 0, 0 }, @@ -4411,18 +4505,16 @@ static kmp_setting_t __kmp_stg_table[] = { # ifdef KMP_GOMP_COMPAT { "GOMP_CPU_AFFINITY", __kmp_stg_parse_gomp_cpu_affinity, NULL, /* no print */ NULL, 0, 0 }, # endif /* KMP_GOMP_COMPAT */ -# if OMP_30_ENABLED -# if OMP_40_ENABLED +# if OMP_40_ENABLED { "OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind, NULL, 0, 0 }, { "OMP_PLACES", __kmp_stg_parse_places, __kmp_stg_print_places, NULL, 0, 0 }, -# else +# else { "OMP_PROC_BIND", __kmp_stg_parse_proc_bind, NULL, /* no print */ NULL, 0, 0 }, -# endif /* OMP_40_ENABLED */ -# endif /* OMP_30_ENABLED */ +# endif /* OMP_40_ENABLED */ { "KMP_TOPOLOGY_METHOD", __kmp_stg_parse_topology_method, __kmp_stg_print_topology_method, NULL, 0, 0 }, -#elif !KMP_AFFINITY_SUPPORTED +#else // // KMP_AFFINITY is not supported on OS X*, nor is OMP_PLACES. @@ -4432,8 +4524,6 @@ static kmp_setting_t __kmp_stg_table[] = { { "OMP_PROC_BIND", __kmp_stg_parse_proc_bind, __kmp_stg_print_proc_bind, NULL, 0, 0 }, # endif -#else - #error "Unknown or unsupported OS" #endif // KMP_AFFINITY_SUPPORTED { "KMP_INIT_AT_FORK", __kmp_stg_parse_init_at_fork, __kmp_stg_print_init_at_fork, NULL, 0, 0 }, @@ -4571,7 +4661,6 @@ __kmp_stg_init( void } -#if OMP_30_ENABLED { // Initialize KMP_LIBRARY and OMP_WAIT_POLICY data. kmp_setting_t * kmp_library = __kmp_stg_find( "KMP_LIBRARY" ); // 1st priority. @@ -4595,21 +4684,12 @@ __kmp_stg_init( void }; // if } -#else - { - kmp_setting_t * kmp_library = __kmp_stg_find( "KMP_LIBRARY" ); - static kmp_stg_wp_data_t kmp_data = { 0, NULL }; - kmp_library->data = & kmp_data; - } -#endif /* OMP_30_ENABLED */ { // Initialize KMP_ALL_THREADS, KMP_MAX_THREADS, and OMP_THREAD_LIMIT data. kmp_setting_t * kmp_all_threads = __kmp_stg_find( "KMP_ALL_THREADS" ); // 1st priority. kmp_setting_t * kmp_max_threads = __kmp_stg_find( "KMP_MAX_THREADS" ); // 2nd priority. -#if OMP_30_ENABLED kmp_setting_t * omp_thread_limit = __kmp_stg_find( "OMP_THREAD_LIMIT" ); // 3rd priority. -#endif // !!! volatile keyword is Intel (R) C Compiler bug CQ49908 workaround. 
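The settings code here wires environment variables that target the same internal setting (KMP_ALL_THREADS, KMP_MAX_THREADS and OMP_THREAD_LIMIT in this block; KMP_AFFINITY, GOMP_CPU_AFFINITY, OMP_PROC_BIND and OMP_PLACES further on) into shared, NULL-terminated "rivals" arrays whose order encodes priority, and a parser such as __kmp_stg_parse_places calls __kmp_stg_check_rivals and returns early when it is outranked. The arbitration itself is not part of these hunks, so the following is only a hypothetical sketch of the idea (the type and function names are invented):

    /* Hypothetical: a variable is ignored when any rival listed before it
       in the priority-ordered, NULL-terminated array has already been set. */
    struct rival { const char *name; int defined; };

    static int outranked(struct rival **rivals, struct rival *self)
    {
        int i;
        for (i = 0; rivals[i] != NULL && rivals[i] != self; ++i)
            if (rivals[i]->defined)
                return 1;   /* a higher-priority rival wins; skip this one */
        return 0;
    }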
static kmp_setting_t * volatile rivals[ 4 ]; @@ -4617,20 +4697,16 @@ __kmp_stg_init( void rivals[ i ++ ] = kmp_all_threads; rivals[ i ++ ] = kmp_max_threads; -#if OMP_30_ENABLED if ( omp_thread_limit != NULL ) { rivals[ i ++ ] = omp_thread_limit; }; // if -#endif rivals[ i ++ ] = NULL; kmp_all_threads->data = (void*)& rivals; kmp_max_threads->data = (void*)& rivals; -#if OMP_30_ENABLED if ( omp_thread_limit != NULL ) { omp_thread_limit->data = (void*)& rivals; }; // if -#endif } @@ -4645,18 +4721,11 @@ __kmp_stg_init( void KMP_DEBUG_ASSERT( gomp_cpu_affinity != NULL ); # endif -# if OMP_30_ENABLED kmp_setting_t * omp_proc_bind = __kmp_stg_find( "OMP_PROC_BIND" ); // 3rd priority. KMP_DEBUG_ASSERT( omp_proc_bind != NULL ); -# endif - -# if OMP_40_ENABLED - kmp_setting_t * omp_places = __kmp_stg_find( "OMP_PLACES" ); // 3rd priority. - KMP_DEBUG_ASSERT( omp_places != NULL ); -# endif // !!! volatile keyword is Intel (R) C Compiler bug CQ49908 workaround. - static kmp_setting_t * volatile rivals[ 5 ]; + static kmp_setting_t * volatile rivals[ 4 ]; int i = 0; rivals[ i ++ ] = kmp_affinity; @@ -4666,23 +4735,30 @@ __kmp_stg_init( void gomp_cpu_affinity->data = (void*)& rivals; # endif -# if OMP_30_ENABLED rivals[ i ++ ] = omp_proc_bind; omp_proc_bind->data = (void*)& rivals; -# endif + rivals[ i ++ ] = NULL; # if OMP_40_ENABLED - rivals[ i ++ ] = omp_places; - omp_places->data = (void*)& rivals; -# endif + static kmp_setting_t * volatile places_rivals[ 4 ]; + i = 0; - rivals[ i ++ ] = NULL; - } + kmp_setting_t * omp_places = __kmp_stg_find( "OMP_PLACES" ); // 3rd priority. + KMP_DEBUG_ASSERT( omp_places != NULL ); + places_rivals[ i ++ ] = kmp_affinity; +# ifdef KMP_GOMP_COMPAT + places_rivals[ i ++ ] = gomp_cpu_affinity; +# endif + places_rivals[ i ++ ] = omp_places; + omp_places->data = (void*)& places_rivals; + places_rivals[ i ++ ] = NULL; +# endif + } #else // KMP_AFFINITY not supported, so OMP_PROC_BIND has no rivals. // OMP_PLACES not supported yet. -#endif +#endif // KMP_AFFINITY_SUPPORTED { // Initialize KMP_DETERMINISTIC_REDUCTION and KMP_FORCE_REDUCTION data. @@ -4917,8 +4993,33 @@ __kmp_env_initialize( char const * string ) { && ( FIND( aff_str, "disabled" ) == NULL ) ) { __kmp_affinity_notype = __kmp_stg_find( "KMP_AFFINITY" ); } + else { + // + // A new affinity type is specified. + // Reset the affinity flags to their default values, + // in case this is called from kmp_set_defaults(). + // + __kmp_affinity_type = affinity_default; + __kmp_affinity_gran = affinity_gran_default; + __kmp_affinity_top_method = affinity_top_method_default; + __kmp_affinity_respect_mask = affinity_respect_mask_default; + } # undef FIND + +#if OMP_40_ENABLED + // + // Also reset the affinity flags if OMP_PROC_BIND is specified. + // + aff_str = __kmp_env_blk_var( & block, "OMP_PROC_BIND" ); + if ( aff_str != NULL ) { + __kmp_affinity_type = affinity_default; + __kmp_affinity_gran = affinity_gran_default; + __kmp_affinity_top_method = affinity_top_method_default; + __kmp_affinity_respect_mask = affinity_respect_mask_default; + } +#endif /* OMP_40_ENABLED */ } + #endif /* KMP_AFFINITY_SUPPORTED */ #if OMP_40_ENABLED @@ -4956,9 +5057,15 @@ __kmp_env_initialize( char const * string ) { else { KMP_DEBUG_ASSERT( string != NULL); // kmp_set_defaults() was called KMP_DEBUG_ASSERT( __kmp_user_lock_kind != lk_default ); + __kmp_set_user_lock_vptrs( __kmp_user_lock_kind ); + // Binds lock functions again to follow the transition between different + // KMP_CONSISTENCY_CHECK values. 
Calling this again is harmless as long + // as we do not allow lock kind changes after making a call to any + // user lock functions (true). } #if KMP_AFFINITY_SUPPORTED + if ( ! TCR_4(__kmp_init_middle) ) { // // Determine if the machine/OS is actually capable of supporting @@ -4984,102 +5091,87 @@ __kmp_env_initialize( char const * string ) { } # if OMP_40_ENABLED - if ( __kmp_affinity_type == affinity_disabled ) { __kmp_nested_proc_bind.bind_types[0] = proc_bind_disabled; } - else if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) { + else if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_true ) { // - // Where supported the default is to use the KMP_AFFINITY - // mechanism. On OS X* etc. it is none. + // OMP_PROC_BIND=true maps to OMP_PROC_BIND=spread. // -# if KMP_AFFINITY_SUPPORTED - __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; -# else - __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; -# endif - } - - // - // If OMP_PROC_BIND was specified (so we are using OpenMP 4.0 affinity) - // but OMP_PLACES was not, then it defaults to the equivalent of - // KMP_AFFINITY=compact,noduplicates,granularity=fine. - // - if ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_intel ) { - if ( ( __kmp_affinity_type == affinity_none ) -# if ! KMP_MIC - || ( __kmp_affinity_type == affinity_default ) -# endif - ) { - __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; - } + __kmp_nested_proc_bind.bind_types[0] = proc_bind_spread; } - else if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_false ) - && ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_disabled ) ) { - if ( __kmp_affinity_type == affinity_default ) { - __kmp_affinity_type = affinity_compact; - __kmp_affinity_dups = FALSE; - } - if ( __kmp_affinity_gran == affinity_gran_default ) { - __kmp_affinity_gran = affinity_gran_fine; - } - } -# endif // OMP_40_ENABLED +# endif /* OMP_40_ENABLED */ if ( KMP_AFFINITY_CAPABLE() ) { # if KMP_OS_WINDOWS && KMP_ARCH_X86_64 - if ( __kmp_num_proc_groups > 1 ) { + // + // Handle the Win 64 group affinity stuff if there are multiple + // processor groups, or if the user requested it, and OMP 4.0 + // affinity is not in effect. 
+ // + if ( ( ( __kmp_num_proc_groups > 1 ) + && ( __kmp_affinity_type == affinity_default ) +# if OMP_40_ENABLED + && ( __kmp_nested_proc_bind.bind_types[0] == proc_bind_default ) ) +# endif + || ( __kmp_affinity_top_method == affinity_top_method_group ) ) { if ( __kmp_affinity_respect_mask == affinity_respect_mask_default ) { - __kmp_affinity_respect_mask = FALSE; + __kmp_affinity_respect_mask = FALSE; } - - if ( ( __kmp_affinity_type == affinity_default ) - || ( __kmp_affinity_type == affinity_none ) ) { - if ( __kmp_affinity_type == affinity_none ) { - if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings - && ( __kmp_affinity_type != affinity_none ) ) ) { - KMP_WARNING( AffTypeCantUseMultGroups, "none", "compact" ); - } - } + if ( __kmp_affinity_type == affinity_default ) { __kmp_affinity_type = affinity_compact; - if ( __kmp_affinity_top_method == affinity_top_method_default ) { - __kmp_affinity_top_method = affinity_top_method_group; - } +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif } - else if ( __kmp_affinity_top_method == affinity_top_method_default ) { - __kmp_affinity_top_method = affinity_top_method_all; + if ( __kmp_affinity_top_method == affinity_top_method_default ) { + if ( __kmp_affinity_gran == affinity_gran_default ) { + __kmp_affinity_top_method = affinity_top_method_group; + __kmp_affinity_gran = affinity_gran_group; + } + else if ( __kmp_affinity_gran == affinity_gran_group ) { + __kmp_affinity_top_method = affinity_top_method_group; + } + else { + __kmp_affinity_top_method = affinity_top_method_all; + } } - - if ( __kmp_affinity_gran_levels < 0 ) { - if ( __kmp_affinity_top_method == affinity_top_method_group ) { - if ( __kmp_affinity_gran == affinity_gran_default ) { - __kmp_affinity_gran = affinity_gran_group; - } - else if ( __kmp_affinity_gran == affinity_gran_core ) { - if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings - && ( __kmp_affinity_type != affinity_none ) ) ) { - KMP_WARNING( AffGranCantUseMultGroups, "core", "thread" ); - } - __kmp_affinity_gran = affinity_gran_thread; - } - else if ( __kmp_affinity_gran == affinity_gran_package ) { - if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings - && ( __kmp_affinity_type != affinity_none ) ) ) { - KMP_WARNING( AffGranCantUseMultGroups, "package", "group" ); - } - __kmp_affinity_gran = affinity_gran_group; - } - else if ( __kmp_affinity_gran == affinity_gran_node ) { - if ( __kmp_affinity_verbose || ( __kmp_affinity_warnings - && ( __kmp_affinity_type != affinity_none ) ) ) { - KMP_WARNING( AffGranCantUseMultGroups, "node", "group" ); - } - __kmp_affinity_gran = affinity_gran_group; + else if ( __kmp_affinity_top_method == affinity_top_method_group ) { + if ( __kmp_affinity_gran == affinity_gran_default ) { + __kmp_affinity_gran = affinity_gran_group; + } + else if ( ( __kmp_affinity_gran != affinity_gran_group ) + && ( __kmp_affinity_gran != affinity_gran_fine ) + && ( __kmp_affinity_gran != affinity_gran_thread ) ) { + char *str = NULL; + switch ( __kmp_affinity_gran ) { + case affinity_gran_core: str = "core"; break; + case affinity_gran_package: str = "package"; break; + case affinity_gran_node: str = "node"; break; + default: KMP_DEBUG_ASSERT( 0 ); } + KMP_WARNING( AffGranTopGroup, var, str ); + __kmp_affinity_gran = affinity_gran_fine; } - else if ( __kmp_affinity_gran == affinity_gran_default ) { + } + else { + if ( __kmp_affinity_gran == affinity_gran_default ) { + __kmp_affinity_gran = affinity_gran_core; + } + else if ( __kmp_affinity_gran == 
affinity_gran_group ) { + char *str = NULL; + switch ( __kmp_affinity_type ) { + case affinity_physical: str = "physical"; break; + case affinity_logical: str = "logical"; break; + case affinity_compact: str = "compact"; break; + case affinity_scatter: str = "scatter"; break; + case affinity_explicit: str = "explicit"; break; + // No MIC on windows, so no affinity_balanced case + default: KMP_DEBUG_ASSERT( 0 ); + } + KMP_WARNING( AffGranGroupType, var, str ); __kmp_affinity_gran = affinity_gran_core; } } @@ -5087,27 +5179,52 @@ __kmp_env_initialize( char const * string ) { else # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ + { if ( __kmp_affinity_respect_mask == affinity_respect_mask_default ) { - __kmp_affinity_respect_mask = TRUE; +# if KMP_OS_WINDOWS && KMP_ARCH_X86_64 + if ( __kmp_num_proc_groups > 1 ) { + __kmp_affinity_respect_mask = FALSE; + } + else +# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ + { + __kmp_affinity_respect_mask = TRUE; + } + } +# if OMP_40_ENABLED + if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) + && ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_default ) ) { + if ( __kmp_affinity_type == affinity_default ) { + __kmp_affinity_type = affinity_compact; + __kmp_affinity_dups = FALSE; + } } + else +# endif /* OMP_40_ENABLED */ if ( __kmp_affinity_type == affinity_default ) { # if KMP_MIC - __kmp_affinity_type = affinity_scatter; + __kmp_affinity_type = affinity_scatter; +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_intel; +# endif # else - __kmp_affinity_type = affinity_none; + __kmp_affinity_type = affinity_none; +# if OMP_40_ENABLED + __kmp_nested_proc_bind.bind_types[0] = proc_bind_false; +# endif # endif } if ( ( __kmp_affinity_gran == affinity_gran_default ) && ( __kmp_affinity_gran_levels < 0 ) ) { # if KMP_MIC - __kmp_affinity_gran = affinity_gran_fine; + __kmp_affinity_gran = affinity_gran_fine; # else - __kmp_affinity_gran = affinity_gran_core; + __kmp_affinity_gran = affinity_gran_core; # endif } if ( __kmp_affinity_top_method == affinity_top_method_default ) { - __kmp_affinity_top_method = affinity_top_method_all; + __kmp_affinity_top_method = affinity_top_method_all; } } } @@ -5164,9 +5281,8 @@ __kmp_env_print() { char const * name = block.vars[ i ].name; char const * value = block.vars[ i ].value; if ( - strlen( name ) > 4 - && - ( strncmp( name, "KMP_", 4 ) == 0 ) || strncmp( name, "OMP_", 4 ) == 0 + ( strlen( name ) > 4 && strncmp( name, "KMP_", 4 ) == 0 ) + || strncmp( name, "OMP_", 4 ) == 0 #ifdef KMP_GOMP_COMPAT || strncmp( name, "GOMP_", 5 ) == 0 #endif // KMP_GOMP_COMPAT diff --git a/openmp/runtime/src/kmp_settings.h b/openmp/runtime/src/kmp_settings.h index ebddc1db0ca..e1ad4280e7b 100644 --- a/openmp/runtime/src/kmp_settings.h +++ b/openmp/runtime/src/kmp_settings.h @@ -1,7 +1,7 @@ /* * kmp_settings.h -- Initialize environment variables - * $Revision: 42598 $ - * $Date: 2013-08-19 15:40:56 -0500 (Mon, 19 Aug 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_stats.cpp b/openmp/runtime/src/kmp_stats.cpp new file mode 100644 index 00000000000..9750f7b3636 --- /dev/null +++ b/openmp/runtime/src/kmp_stats.cpp @@ -0,0 +1,615 @@ +/** @file kmp_stats.cpp + * Statistics gathering and processing. 
+ */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + + +#if KMP_STATS_ENABLED + +#include "kmp.h" +#include "kmp_str.h" +#include "kmp_lock.h" +#include "kmp_stats.h" + +#include <algorithm> +#include <sstream> +#include <iomanip> +#include <stdlib.h> // for atexit + +#define STRINGIZE2(x) #x +#define STRINGIZE(x) STRINGIZE2(x) + +#define expandName(name,flags,ignore) {STRINGIZE(name),flags}, +statInfo timeStat::timerInfo[] = { + KMP_FOREACH_TIMER(expandName,0) + {0,0} +}; +const statInfo counter::counterInfo[] = { + KMP_FOREACH_COUNTER(expandName,0) + {0,0} +}; +#undef expandName + +#define expandName(ignore1,ignore2,ignore3) {0.0,0.0,0.0}, +kmp_stats_output_module::rgb_color kmp_stats_output_module::timerColorInfo[] = { + KMP_FOREACH_TIMER(expandName,0) + {0.0,0.0,0.0} +}; +#undef expandName + +const kmp_stats_output_module::rgb_color kmp_stats_output_module::globalColorArray[] = { + {1.0, 0.0, 0.0}, // red + {1.0, 0.6, 0.0}, // orange + {1.0, 1.0, 0.0}, // yellow + {0.0, 1.0, 0.0}, // green + {0.0, 0.0, 1.0}, // blue + {0.6, 0.2, 0.8}, // purple + {1.0, 0.0, 1.0}, // magenta + {0.0, 0.4, 0.2}, // dark green + {1.0, 1.0, 0.6}, // light yellow + {0.6, 0.4, 0.6}, // dirty purple + {0.0, 1.0, 1.0}, // cyan + {1.0, 0.4, 0.8}, // pink + {0.5, 0.5, 0.5}, // grey + {0.8, 0.7, 0.5}, // brown + {0.6, 0.6, 1.0}, // light blue + {1.0, 0.7, 0.5}, // peach + {0.8, 0.5, 1.0}, // lavender + {0.6, 0.0, 0.0}, // dark red + {0.7, 0.6, 0.0}, // gold + {0.0, 0.0, 0.0} // black +}; + +// Ensure that the atexit handler only runs once. +static uint32_t statsPrinted = 0; + +// output interface +static kmp_stats_output_module __kmp_stats_global_output; + +/* ****************************************************** */ +/* ************* statistic member functions ************* */ + +void statistic::addSample(double sample) +{ + double delta = sample - meanVal; + + sampleCount = sampleCount + 1; + meanVal = meanVal + delta/sampleCount; + m2 = m2 + delta*(sample - meanVal); + + minVal = std::min(minVal, sample); + maxVal = std::max(maxVal, sample); +} + +statistic & statistic::operator+= (const statistic & other) +{ + if (sampleCount == 0) + { + *this = other; + return *this; + } + + uint64_t newSampleCount = sampleCount + other.sampleCount; + double dnsc = double(newSampleCount); + double dsc = double(sampleCount); + double dscBydnsc = dsc/dnsc; + double dosc = double(other.sampleCount); + double delta = other.meanVal - meanVal; + + // Try to order these calculations to avoid overflows. + // If this were Fortran, then the compiler would not be able to re-order over brackets. + // In C++ it may be legal to do that (we certainly hope it doesn't, and CC+ Programming Language 2nd edition + // suggests it shouldn't, since it says that exploitation of associativity can only be made if the operation + // really is associative (which floating addition isn't...)). 
+ meanVal = meanVal*dscBydnsc + other.meanVal*(1-dscBydnsc); + m2 = m2 + other.m2 + dscBydnsc*dosc*delta*delta; + minVal = std::min (minVal, other.minVal); + maxVal = std::max (maxVal, other.maxVal); + sampleCount = newSampleCount; + + + return *this; +} + +void statistic::scale(double factor) +{ + minVal = minVal*factor; + maxVal = maxVal*factor; + meanVal= meanVal*factor; + m2 = m2*factor*factor; + return; +} + +std::string statistic::format(char unit, bool total) const +{ + std::string result = formatSI(sampleCount,9,' '); + + result = result + std::string(", ") + formatSI(minVal, 9, unit); + result = result + std::string(", ") + formatSI(meanVal, 9, unit); + result = result + std::string(", ") + formatSI(maxVal, 9, unit); + if (total) + result = result + std::string(", ") + formatSI(meanVal*sampleCount, 9, unit); + result = result + std::string(", ") + formatSI(getSD(), 9, unit); + + return result; +} + +/* ********************************************************** */ +/* ************* explicitTimer member functions ************* */ + +void explicitTimer::start(timer_e timerEnumValue) { + startTime = tsc_tick_count::now(); + if(timeStat::logEvent(timerEnumValue)) { + __kmp_stats_thread_ptr->incrementNestValue(); + } + return; +} + +void explicitTimer::stop(timer_e timerEnumValue) { + if (startTime.getValue() == 0) + return; + + tsc_tick_count finishTime = tsc_tick_count::now(); + + //stat->addSample ((tsc_tick_count::now() - startTime).ticks()); + stat->addSample ((finishTime - startTime).ticks()); + + if(timeStat::logEvent(timerEnumValue)) { + __kmp_stats_thread_ptr->push_event(startTime.getValue() - __kmp_stats_start_time.getValue(), finishTime.getValue() - __kmp_stats_start_time.getValue(), __kmp_stats_thread_ptr->getNestValue(), timerEnumValue); + __kmp_stats_thread_ptr->decrementNestValue(); + } + + /* We accept the risk that we drop a sample because it really did start at t==0. */ + startTime = 0; + return; +} + +/* ******************************************************************* */ +/* ************* kmp_stats_event_vector member functions ************* */ + +void kmp_stats_event_vector::deallocate() { + __kmp_free(events); + internal_size = 0; + allocated_size = 0; + events = NULL; +} + +// This function is for qsort() which requires the compare function to return +// either a negative number if event1 < event2, a positive number if event1 > event2 +// or zero if event1 == event2. +// This sorts by start time (lowest to highest). 
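statistic::addSample() and statistic::operator+= above are Welford's online update and the usual pairwise merge of two partial results (mean plus M2, the running sum of squared deviations), ordered as the comment explains to limit overflow and rounding trouble. The following standalone sketch, which is not part of the patch and uses invented names, shows the same arithmetic in isolation so it can be checked against a two-pass calculation:

// --- illustrative sketch, not part of the patch ---
// Welford-style running statistic with a merge, mirroring statistic::addSample
// and statistic::operator+= above. All names here are invented.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <initializer_list>
#include <limits>

struct RunningStat {
    double minV = std::numeric_limits<double>::max();
    double maxV = -std::numeric_limits<double>::max();
    double mean = 0.0;
    double m2   = 0.0;     // sum of squared deviations from the current mean
    uint64_t n  = 0;

    void add(double x) {                 // one-pass (Welford) update
        double delta = x - mean;
        n += 1;
        mean += delta / n;
        m2   += delta * (x - mean);      // note: uses the *updated* mean
        minV = std::min(minV, x);
        maxV = std::max(maxV, x);
    }
    void merge(const RunningStat &o) {   // combine two partial results
        if (o.n == 0) return;
        if (n == 0) { *this = o; return; }
        uint64_t total = n + o.n;
        double delta = o.mean - mean;
        double w = double(n) / double(total);
        mean = mean * w + o.mean * (1.0 - w);
        m2   = m2 + o.m2 + w * double(o.n) * delta * delta;
        minV = std::min(minV, o.minV);
        maxV = std::max(maxV, o.maxV);
        n = total;
    }
    double sd() const { return n ? std::sqrt(m2 / double(n)) : 0.0; }
};

int main() {
    RunningStat a, b;
    for (double x : {1.0, 2.0, 3.0}) a.add(x);
    for (double x : {4.0, 5.0, 6.0}) b.add(x);
    a.merge(b);     // same result as feeding all six samples through add()
    std::printf("n=%llu mean=%.3f sd=%.3f\n",
                (unsigned long long)a.n, a.mean, a.sd());
    return 0;
}
// --- end of sketch ---

Merging {1, 2, 3} with {4, 5, 6} gives mean 3.5 and M2 17.5, identical to accumulating all six samples in one pass, which is the property operator+= relies on when per-thread statistics are aggregated at exit.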
+int compare_two_events(const void* event1, const void* event2) { + kmp_stats_event* ev1 = (kmp_stats_event*)event1; + kmp_stats_event* ev2 = (kmp_stats_event*)event2; + + if(ev1->getStart() < ev2->getStart()) return -1; + else if(ev1->getStart() > ev2->getStart()) return 1; + else return 0; +} + +void kmp_stats_event_vector::sort() { + qsort(events, internal_size, sizeof(kmp_stats_event), compare_two_events); +} + +/* *********************************************************** */ +/* ************* kmp_stats_list member functions ************* */ + +// returns a pointer to newly created stats node +kmp_stats_list* kmp_stats_list::push_back(int gtid) { + kmp_stats_list* newnode = (kmp_stats_list*)__kmp_allocate(sizeof(kmp_stats_list)); + // placement new, only requires space and pointer and initializes (so __kmp_allocate instead of C++ new[] is used) + new (newnode) kmp_stats_list(); + newnode->setGtid(gtid); + newnode->prev = this->prev; + newnode->next = this; + newnode->prev->next = newnode; + newnode->next->prev = newnode; + return newnode; +} +void kmp_stats_list::deallocate() { + kmp_stats_list* ptr = this->next; + kmp_stats_list* delptr = this->next; + while(ptr != this) { + delptr = ptr; + ptr=ptr->next; + // placement new means we have to explicitly call destructor. + delptr->_event_vector.deallocate(); + delptr->~kmp_stats_list(); + __kmp_free(delptr); + } +} +kmp_stats_list::iterator kmp_stats_list::begin() { + kmp_stats_list::iterator it; + it.ptr = this->next; + return it; +} +kmp_stats_list::iterator kmp_stats_list::end() { + kmp_stats_list::iterator it; + it.ptr = this; + return it; +} +int kmp_stats_list::size() { + int retval; + kmp_stats_list::iterator it; + for(retval=0, it=begin(); it!=end(); it++, retval++) {} + return retval; +} + +/* ********************************************************************* */ +/* ************* kmp_stats_list::iterator member functions ************* */ + +kmp_stats_list::iterator::iterator() : ptr(NULL) {} +kmp_stats_list::iterator::~iterator() {} +kmp_stats_list::iterator kmp_stats_list::iterator::operator++() { + this->ptr = this->ptr->next; + return *this; +} +kmp_stats_list::iterator kmp_stats_list::iterator::operator++(int dummy) { + this->ptr = this->ptr->next; + return *this; +} +kmp_stats_list::iterator kmp_stats_list::iterator::operator--() { + this->ptr = this->ptr->prev; + return *this; +} +kmp_stats_list::iterator kmp_stats_list::iterator::operator--(int dummy) { + this->ptr = this->ptr->prev; + return *this; +} +bool kmp_stats_list::iterator::operator!=(const kmp_stats_list::iterator & rhs) { + return this->ptr!=rhs.ptr; +} +bool kmp_stats_list::iterator::operator==(const kmp_stats_list::iterator & rhs) { + return this->ptr==rhs.ptr; +} +kmp_stats_list* kmp_stats_list::iterator::operator*() const { + return this->ptr; +} + +/* *************************************************************** */ +/* ************* kmp_stats_output_module functions ************** */ + +const char* kmp_stats_output_module::outputFileName = NULL; +const char* kmp_stats_output_module::eventsFileName = NULL; +const char* kmp_stats_output_module::plotFileName = NULL; +int kmp_stats_output_module::printPerThreadFlag = 0; +int kmp_stats_output_module::printPerThreadEventsFlag = 0; + +// init() is called very near the beginning of execution time in the constructor of __kmp_stats_global_output +void kmp_stats_output_module::init() +{ + char * statsFileName = getenv("KMP_STATS_FILE"); + eventsFileName = getenv("KMP_STATS_EVENTS_FILE"); + plotFileName = 
getenv("KMP_STATS_PLOT_FILE"); + char * threadStats = getenv("KMP_STATS_THREADS"); + char * threadEvents = getenv("KMP_STATS_EVENTS"); + + // set the stats output filenames based on environment variables and defaults + outputFileName = statsFileName; + eventsFileName = eventsFileName ? eventsFileName : "events.dat"; + plotFileName = plotFileName ? plotFileName : "events.plt"; + + // set the flags based on environment variables matching: true, on, 1, .true. , .t. , yes + printPerThreadFlag = __kmp_str_match_true(threadStats); + printPerThreadEventsFlag = __kmp_str_match_true(threadEvents); + + if(printPerThreadEventsFlag) { + // assigns a color to each timer for printing + setupEventColors(); + } else { + // will clear flag so that no event will be logged + timeStat::clearEventFlags(); + } + + return; +} + +void kmp_stats_output_module::setupEventColors() { + int i; + int globalColorIndex = 0; + int numGlobalColors = sizeof(globalColorArray) / sizeof(rgb_color); + for(i=0;i<TIMER_LAST;i++) { + if(timeStat::logEvent((timer_e)i)) { + timerColorInfo[i] = globalColorArray[globalColorIndex]; + globalColorIndex = (globalColorIndex+1)%numGlobalColors; + } + } + return; +} + +void kmp_stats_output_module::printStats(FILE *statsOut, statistic const * theStats, bool areTimers) +{ + if (areTimers) + { + // Check if we have useful timers, since we don't print zero value timers we need to avoid + // printing a header and then no data. + bool haveTimers = false; + for (int s = 0; s<TIMER_LAST; s++) + { + if (theStats[s].getCount() != 0) + { + haveTimers = true; + break; + } + } + if (!haveTimers) + return; + } + + // Print + const char * title = areTimers ? "Timer, SampleCount," : "Counter, ThreadCount,"; + fprintf (statsOut, "%s Min, Mean, Max, Total, SD\n", title); + if (areTimers) { + for (int s = 0; s<TIMER_LAST; s++) { + statistic const * stat = &theStats[s]; + if (stat->getCount() != 0) { + char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T'; + fprintf (statsOut, "%-25s, %s\n", timeStat::name(timer_e(s)), stat->format(tag, true).c_str()); + } + } + } else { // Counters + for (int s = 0; s<COUNTER_LAST; s++) { + statistic const * stat = &theStats[s]; + fprintf (statsOut, "%-25s, %s\n", counter::name(counter_e(s)), stat->format(' ', true).c_str()); + } + } +} + +void kmp_stats_output_module::printCounters(FILE * statsOut, counter const * theCounters) +{ + // We print all the counters even if they are zero. + // That makes it easier to slice them into a spreadsheet if you need to. + fprintf (statsOut, "\nCounter, Count\n"); + for (int c = 0; c<COUNTER_LAST; c++) { + counter const * stat = &theCounters[c]; + fprintf (statsOut, "%-25s, %s\n", counter::name(counter_e(c)), formatSI(stat->getValue(), 9, ' ').c_str()); + } +} + +void kmp_stats_output_module::printEvents(FILE* eventsOut, kmp_stats_event_vector* theEvents, int gtid) { + // sort by start time before printing + theEvents->sort(); + for (int i = 0; i < theEvents->size(); i++) { + kmp_stats_event ev = theEvents->at(i); + rgb_color color = getEventColor(ev.getTimerName()); + fprintf(eventsOut, "%d %lu %lu %1.1f rgb(%1.1f,%1.1f,%1.1f) %s\n", + gtid, + ev.getStart(), + ev.getStop(), + 1.2 - (ev.getNestLevel() * 0.2), + color.r, color.g, color.b, + timeStat::name(ev.getTimerName()) + ); + } + return; +} + +void kmp_stats_output_module::windupExplicitTimers() +{ + // Wind up any explicit timers. We assume that it's fair at this point to just walk all the explcit timers in all threads + // and say "it's over". 
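The init() routine above amounts to reading five environment variables and falling back to events.dat, events.plt and stderr when they are unset. A trimmed-down sketch of that lookup pattern follows; env_or() and is_true() are invented helper names, not runtime functions, and the real __kmp_str_match_true() may accept more spellings than are checked here.

// --- illustrative sketch, not part of the patch ---
// Environment-driven defaults in the style of kmp_stats_output_module::init().
#include <cstdio>
#include <cstdlib>
#include <cstring>

static const char *env_or(const char *name, const char *fallback) {
    const char *v = std::getenv(name);
    return v ? v : fallback;
}

// Simplified stand-in for __kmp_str_match_true(), which may accept more forms.
static bool is_true(const char *v) {
    if (!v) return false;
    static const char *yes[] = { "true", "on", "1", "yes", ".true.", ".t." };
    for (const char *y : yes)
        if (std::strcmp(v, y) == 0)
            return true;
    return false;
}

int main() {
    const char *statsFile  = std::getenv("KMP_STATS_FILE");   // NULL means stderr
    const char *eventsFile = env_or("KMP_STATS_EVENTS_FILE", "events.dat");
    const char *plotFile   = env_or("KMP_STATS_PLOT_FILE",   "events.plt");
    bool perThread = is_true(std::getenv("KMP_STATS_THREADS"));
    bool logEvents = is_true(std::getenv("KMP_STATS_EVENTS"));

    std::printf("stats  -> %s\n", statsFile ? statsFile : "stderr");
    std::printf("events -> %s (event logging %s)\n", eventsFile, logEvents ? "on" : "off");
    std::printf("plot   -> %s (per-thread stats %s)\n", plotFile, perThread ? "on" : "off");
    return 0;
}
// --- end of sketch ---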
+ // If the timer wasn't running, this won't record anything anyway. + kmp_stats_list::iterator it; + for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) { + for (int timer=0; timer<EXPLICIT_TIMER_LAST; timer++) { + (*it)->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer); + } + } +} + +void kmp_stats_output_module::printPloticusFile() { + int i; + int size = __kmp_stats_list.size(); + FILE* plotOut = fopen(plotFileName, "w+"); + + fprintf(plotOut, "#proc page\n" + " pagesize: 15 10\n" + " scale: 1.0\n\n"); + + fprintf(plotOut, "#proc getdata\n" + " file: %s\n\n", + eventsFileName); + + fprintf(plotOut, "#proc areadef\n" + " title: OpenMP Sampling Timeline\n" + " titledetails: align=center size=16\n" + " rectangle: 1 1 13 9\n" + " xautorange: datafield=2,3\n" + " yautorange: -1 %d\n\n", + size); + + fprintf(plotOut, "#proc xaxis\n" + " stubs: inc\n" + " stubdetails: size=12\n" + " label: Time (ticks)\n" + " labeldetails: size=14\n\n"); + + fprintf(plotOut, "#proc yaxis\n" + " stubs: inc 1\n" + " stubrange: 0 %d\n" + " stubdetails: size=12\n" + " label: Thread #\n" + " labeldetails: size=14\n\n", + size-1); + + fprintf(plotOut, "#proc bars\n" + " exactcolorfield: 5\n" + " axis: x\n" + " locfield: 1\n" + " segmentfields: 2 3\n" + " barwidthfield: 4\n\n"); + + // create legend entries corresponding to the timer color + for(i=0;i<TIMER_LAST;i++) { + if(timeStat::logEvent((timer_e)i)) { + rgb_color c = getEventColor((timer_e)i); + fprintf(plotOut, "#proc legendentry\n" + " sampletype: color\n" + " label: %s\n" + " details: rgb(%1.1f,%1.1f,%1.1f)\n\n", + timeStat::name((timer_e)i), + c.r, c.g, c.b); + + } + } + + fprintf(plotOut, "#proc legend\n" + " format: down\n" + " location: max max\n\n"); + fclose(plotOut); + return; +} + +void kmp_stats_output_module::outputStats(const char* heading) +{ + statistic allStats[TIMER_LAST]; + statistic allCounters[COUNTER_LAST]; + + // stop all the explicit timers for all threads + windupExplicitTimers(); + + FILE * eventsOut; + FILE * statsOut = outputFileName ? fopen (outputFileName, "a+") : stderr; + + if (eventPrintingEnabled()) { + eventsOut = fopen(eventsFileName, "w+"); + } + + if (!statsOut) + statsOut = stderr; + + fprintf(statsOut, "%s\n",heading); + // Accumulate across threads. + kmp_stats_list::iterator it; + for (it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) { + int t = (*it)->getGtid(); + // Output per thread stats if requested. + if (perThreadPrintingEnabled()) { + fprintf (statsOut, "Thread %d\n", t); + printStats(statsOut, (*it)->getTimers(), true); + printCounters(statsOut, (*it)->getCounters()); + fprintf(statsOut,"\n"); + } + // Output per thread events if requested. + if (eventPrintingEnabled()) { + kmp_stats_event_vector events = (*it)->getEventVector(); + printEvents(eventsOut, &events, t); + } + + for (int s = 0; s<TIMER_LAST; s++) { + // See if we should ignore this timer when aggregating + if ((timeStat::masterOnly(timer_e(s)) && (t != 0)) || // Timer is only valid on the master and this thread is a worker + (timeStat::workerOnly(timer_e(s)) && (t == 0)) || // Timer is only valid on a worker and this thread is the master + timeStat::synthesized(timer_e(s)) // It's a synthesized stat, so there's no raw data for it. + ) + { + continue; + } + + statistic * threadStat = (*it)->getTimer(timer_e(s)); + allStats[s] += *threadStat; + } + + // Special handling for synthesized statistics. + // These just have to be coded specially here for now. 
+ // At present we only have one: the total parallel work done in each thread. + // The variance here makes it easy to see load imbalance over the whole program (though, of course, + // it's possible to have a code with awful load balance in every parallel region but perfect load + // balance oever the whole program.) + allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal()); + + // Time waiting for work (synthesized) + if ((t != 0) || !timeStat::workerOnly(timer_e(TIMER_OMP_await_work))) + allStats[TIMER_Total_await_work].addSample ((*it)->getTimer(TIMER_OMP_await_work)->getTotal()); + + // Time in explicit barriers. + allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal()); + + for (int c = 0; c<COUNTER_LAST; c++) { + if (counter::masterOnly(counter_e(c)) && t != 0) + continue; + allCounters[c].addSample ((*it)->getCounter(counter_e(c))->getValue()); + } + } + + if (eventPrintingEnabled()) { + printPloticusFile(); + fclose(eventsOut); + } + + fprintf (statsOut, "Aggregate for all threads\n"); + printStats (statsOut, &allStats[0], true); + fprintf (statsOut, "\n"); + printStats (statsOut, &allCounters[0], false); + + if (statsOut != stderr) + fclose(statsOut); + +} + +/* ************************************************** */ +/* ************* exported C functions ************** */ + +// no name mangling for these functions, we want the c files to be able to get at these functions +extern "C" { + +void __kmp_reset_stats() +{ + kmp_stats_list::iterator it; + for(it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) { + timeStat * timers = (*it)->getTimers(); + counter * counters = (*it)->getCounters(); + explicitTimer * eTimers = (*it)->getExplicitTimers(); + + for (int t = 0; t<TIMER_LAST; t++) + timers[t].reset(); + + for (int c = 0; c<COUNTER_LAST; c++) + counters[c].reset(); + + for (int t=0; t<EXPLICIT_TIMER_LAST; t++) + eTimers[t].reset(); + + // reset the event vector so all previous events are "erased" + (*it)->resetEventVector(); + + // May need to restart the explicit timers in thread zero? + } + KMP_START_EXPLICIT_TIMER(OMP_serial); + KMP_START_EXPLICIT_TIMER(OMP_start_end); +} + +// This function will reset all stats and stop all threads' explicit timers if they haven't been stopped already. +void __kmp_output_stats(const char * heading) +{ + __kmp_stats_global_output.outputStats(heading); + __kmp_reset_stats(); +} + +void __kmp_accumulate_stats_at_exit(void) +{ + // Only do this once. + if (KMP_XCHG_FIXED32(&statsPrinted, 1) != 0) + return; + + __kmp_output_stats("Statistics on exit"); + return; +} + +void __kmp_stats_init(void) +{ + return; +} + +} // extern "C" + +#endif // KMP_STATS_ENABLED diff --git a/openmp/runtime/src/kmp_stats.h b/openmp/runtime/src/kmp_stats.h new file mode 100644 index 00000000000..f804cb5007b --- /dev/null +++ b/openmp/runtime/src/kmp_stats.h @@ -0,0 +1,706 @@ +#ifndef KMP_STATS_H +#define KMP_STATS_H + +/** @file kmp_stats.h + * Functions for collecting statistics. + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + + +#if KMP_STATS_ENABLED +/* + * Statistics accumulator. + * Accumulates number of samples and computes min, max, mean, standard deviation on the fly. 
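A few lines up, in kmp_stats.cpp, __kmp_accumulate_stats_at_exit() guards the final report with an atomic exchange on statsPrinted so that only the first caller does any printing. The same run-once idiom is sketched below with std::atomic standing in for the runtime's KMP_XCHG_FIXED32; the names and the print call are invented for illustration.

// --- illustrative sketch, not part of the patch ---
// Run-once guard in the style of __kmp_accumulate_stats_at_exit().
#include <atomic>
#include <cstdio>

static std::atomic<int> printed{0};

static void accumulate_at_exit() {
    // The first caller sees the previous value 0 and proceeds;
    // every later caller sees 1 and returns immediately.
    if (printed.exchange(1) != 0)
        return;
    std::printf("Statistics on exit\n");   // stands in for __kmp_output_stats()
}

int main() {
    accumulate_at_exit();   // prints the report
    accumulate_at_exit();   // silently returns
    return 0;
}
// --- end of sketch ---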
+ * + * Online variance calculation algorithm from http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm + */ + +#include <limits> +#include <math.h> +#include <string> +#include <stdint.h> +#include <new> // placement new +#include "kmp_stats_timing.h" + + +/*! + * @ingroup STATS_GATHERING + * \brief flags to describe the statistic ( timers or counter ) + * +*/ +class stats_flags_e { + public: + const static int onlyInMaster = 1<<0; //!< statistic is valid only for master + const static int noUnits = 1<<1; //!< statistic doesn't need units printed next to it in output + const static int synthesized = 1<<2; //!< statistic's value is created atexit time in the __kmp_output_stats function + const static int notInMaster = 1<<3; //!< statistic is valid for non-master threads + const static int logEvent = 1<<4; //!< statistic can be logged when KMP_STATS_EVENTS is on (valid only for timers) +}; + +/*! + * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h + * + * @param macro a user defined macro that takes three arguments - macro(COUNTER_NAME, flags, arg) + * @param arg a user defined argument to send to the user defined macro + * + * \details A counter counts the occurence of some event. + * Each thread accumulates its own count, at the end of execution the counts are aggregated treating each thread + * as a separate measurement. (Unless onlyInMaster is set, in which case there's only a single measurement). + * The min,mean,max are therefore the values for the threads. + * Adding the counter here and then putting in a KMP_BLOCK_COUNTER(name) is all you need to do. + * All of the tables and printing is generated from this macro. + * Format is "macro(name, flags, arg)" + * + * @ingroup STATS_GATHERING +*/ +#define KMP_FOREACH_COUNTER(macro, arg) \ + macro (OMP_PARALLEL, stats_flags_e::onlyInMaster, arg) \ + macro (OMP_FOR_static, 0, arg) \ + macro (OMP_FOR_dynamic, 0, arg) \ + macro (OMP_DISTR_FOR_static, 0, arg) \ + macro (OMP_DISTR_FOR_dynamic, 0, arg) \ + macro (OMP_BARRIER, 0, arg) \ + macro (OMP_CRITICAL,0, arg) \ + macro (OMP_SINGLE, 0, arg) \ + macro (OMP_MASTER, 0, arg) \ + macro (OMP_set_lock, 0, arg) \ + macro (OMP_test_lock, 0, arg) \ + macro (OMP_test_lock_failure, 0, arg) \ + macro (REDUCE_wait, 0, arg) \ + macro (REDUCE_nowait, 0, arg) \ + macro (LAST,0,arg) + +/*! + * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h + * + * @param macro a user defined macro that takes three arguments - macro(TIMER_NAME, flags, arg) + * @param arg a user defined argument to send to the user defined macro + * + * \details A timer collects multiple samples of some count in each thread and then finally aggregates over all the threads. + * The count is normally a time (in ticks), hence the name "timer". (But can be any value, so we use this for "number of arguments passed to fork" + * as well, or we could collect "loop iteration count" if we wanted to). + * For timers the threads are not significant, it's the individual observations that count, so the statistics are at that level. 
+ * Format is "macro(name, flags, arg)" + * + * @ingroup STATS_GATHERING + */ +#define KMP_FOREACH_TIMER(macro, arg) \ + macro (OMP_PARALLEL_args, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \ + macro (FOR_static_iterations, stats_flags_e::onlyInMaster | stats_flags_e::noUnits, arg) \ + macro (FOR_dynamic_iterations, stats_flags_e::noUnits, arg) \ + macro (OMP_start_end, stats_flags_e::onlyInMaster, arg) \ + macro (OMP_serial, stats_flags_e::onlyInMaster, arg) \ + macro (OMP_work, 0, arg) \ + macro (Total_work, stats_flags_e::synthesized, arg) \ + macro (OMP_await_work, stats_flags_e::notInMaster, arg) \ + macro (Total_await_work, stats_flags_e::synthesized, arg) \ + macro (OMP_barrier, 0, arg) \ + macro (Total_barrier, stats_flags_e::synthesized, arg) \ + macro (OMP_test_lock, 0, arg) \ + macro (FOR_static_scheduling, 0, arg) \ + macro (FOR_dynamic_scheduling, 0, arg) \ + macro (KMP_fork_call, 0, arg) \ + macro (KMP_join_call, 0, arg) \ + macro (KMP_fork_barrier, stats_flags_e::logEvent, arg) \ + macro (KMP_join_barrier, stats_flags_e::logEvent, arg) \ + macro (KMP_barrier, 0, arg) \ + macro (KMP_end_split_barrier, 0, arg) \ + macro (KMP_wait_sleep, 0, arg) \ + macro (KMP_release, 0, arg) \ + macro (KMP_hier_gather, 0, arg) \ + macro (KMP_hier_release, 0, arg) \ + macro (KMP_hyper_gather, stats_flags_e::logEvent, arg) \ + macro (KMP_hyper_release, stats_flags_e::logEvent, arg) \ + macro (KMP_linear_gather, 0, arg) \ + macro (KMP_linear_release, 0, arg) \ + macro (KMP_tree_gather, 0, arg) \ + macro (KMP_tree_release, 0, arg) \ + macro (USER_master_invoke, stats_flags_e::logEvent, arg) \ + macro (USER_worker_invoke, stats_flags_e::logEvent, arg) \ + macro (USER_resume, stats_flags_e::logEvent, arg) \ + macro (USER_suspend, stats_flags_e::logEvent, arg) \ + macro (USER_launch_thread_loop, stats_flags_e::logEvent, arg) \ + macro (KMP_allocate_team, 0, arg) \ + macro (KMP_setup_icv_copy, 0, arg) \ + macro (USER_icv_copy, 0, arg) \ + macro (LAST,0, arg) + + + +// OMP_PARALLEL_args -- the number of arguments passed to a fork +// FOR_static_iterations -- Number of available parallel chunks of work in a static for +// FOR_dynamic_iterations -- Number of available parallel chunks of work in a dynamic for +// Both adjust for any chunking, so if there were an iteration count of 20 but a chunk size of 10, we'd record 2. +// OMP_serial -- thread zero time executing serial code +// OMP_start_end -- time from when OpenMP is initialized until the stats are printed at exit +// OMP_work -- elapsed time in code dispatched by a fork (measured in the thread) +// Total_work -- a synthesized statistic summarizing how much parallel work each thread executed. 
+// OMP_barrier -- time at "real" barriers +// Total_barrier -- a synthesized statistic summarizing how much time at real barriers in each thread +// OMP_set_lock -- time in lock setting +// OMP_test_lock -- time in testing a lock +// LOCK_WAIT -- time waiting for a lock +// FOR_static_scheduling -- time spent doing scheduling for a static "for" +// FOR_dynamic_scheduling -- time spent doing scheduling for a dynamic "for" +// KMP_wait_sleep -- time in __kmp_wait_sleep +// KMP_release -- time in __kmp_release +// KMP_fork_barrier -- time in __kmp_fork_barrier +// KMP_join_barrier -- time in __kmp_join_barrier +// KMP_barrier -- time in __kmp_barrier +// KMP_end_split_barrier -- time in __kmp_end_split_barrier +// KMP_setup_icv_copy -- time in __kmp_setup_icv_copy +// KMP_icv_copy -- start/stop timer for any ICV copying +// KMP_linear_gather -- time in __kmp_linear_barrier_gather +// KMP_linear_release -- time in __kmp_linear_barrier_release +// KMP_tree_gather -- time in __kmp_tree_barrier_gather +// KMP_tree_release -- time in __kmp_tree_barrier_release +// KMP_hyper_gather -- time in __kmp_hyper_barrier_gather +// KMP_hyper_release -- time in __kmp_hyper_barrier_release + +/*! + * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. + * + * @param macro a user defined macro that takes three arguments - macro(TIMER_NAME, flags, arg) + * @param arg a user defined argument to send to the user defined macro + * + * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE BAD THINGS WILL HAPPEN! + * + * \details Explicit timers are ones where we need to allocate a timer itself (as well as the accumulated timing statistics). + * We allocate these on a per-thread basis, and explicitly start and stop them. + * Block timers just allocate the timer itself on the stack, and use the destructor to notice block exit; they don't + * need to be defined here. + * The name here should be the same as that of a timer above. 
+ * + * @ingroup STATS_GATHERING +*/ +#define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) \ + macro(OMP_serial, 0, arg) \ + macro(OMP_start_end, 0, arg) \ + macro(USER_icv_copy, 0, arg) \ + macro(USER_launch_thread_loop, stats_flags_e::logEvent, arg) \ + macro(LAST, 0, arg) + +#define ENUMERATE(name,ignore,prefix) prefix##name, +enum timer_e { + KMP_FOREACH_TIMER(ENUMERATE, TIMER_) +}; + +enum explicit_timer_e { + KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) +}; + +enum counter_e { + KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) +}; +#undef ENUMERATE + +class statistic +{ + double minVal; + double maxVal; + double meanVal; + double m2; + uint64_t sampleCount; + + public: + statistic() { reset(); } + statistic (statistic const &o): minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), sampleCount(o.sampleCount) {} + + double getMin() const { return minVal; } + double getMean() const { return meanVal; } + double getMax() const { return maxVal; } + uint64_t getCount() const { return sampleCount; } + double getSD() const { return sqrt(m2/sampleCount); } + double getTotal() const { return sampleCount*meanVal; } + + void reset() + { + minVal = std::numeric_limits<double>::max(); + maxVal = -std::numeric_limits<double>::max(); + meanVal= 0.0; + m2 = 0.0; + sampleCount = 0; + } + void addSample(double sample); + void scale (double factor); + void scaleDown(double f) { scale (1./f); } + statistic & operator+= (statistic const & other); + + std::string format(char unit, bool total=false) const; +}; + +struct statInfo +{ + const char * name; + uint32_t flags; +}; + +class timeStat : public statistic +{ + static statInfo timerInfo[]; + + public: + timeStat() : statistic() {} + static const char * name(timer_e e) { return timerInfo[e].name; } + static bool masterOnly (timer_e e) { return timerInfo[e].flags & stats_flags_e::onlyInMaster; } + static bool workerOnly (timer_e e) { return timerInfo[e].flags & stats_flags_e::notInMaster; } + static bool noUnits (timer_e e) { return timerInfo[e].flags & stats_flags_e::noUnits; } + static bool synthesized(timer_e e) { return timerInfo[e].flags & stats_flags_e::synthesized; } + static bool logEvent (timer_e e) { return timerInfo[e].flags & stats_flags_e::logEvent; } + static void clearEventFlags() { + int i; + for(i=0;i<TIMER_LAST;i++) { + timerInfo[i].flags &= (~(stats_flags_e::logEvent)); + } + } +}; + +// Where we need explicitly to start and end the timer, this version can be used +// Since these timers normally aren't nicely scoped, so don't have a good place to live +// on the stack of the thread, they're more work to use. +class explicitTimer +{ + timeStat * stat; + tsc_tick_count startTime; + + public: + explicitTimer () : stat(0), startTime(0) { } + explicitTimer (timeStat * s) : stat(s), startTime() { } + + void setStat (timeStat *s) { stat = s; } + void start(timer_e timerEnumValue); + void stop(timer_e timerEnumValue); + void reset() { startTime = 0; } +}; + +// Where all you need is to time a block, this is enough. +// (It avoids the need to have an explicit end, leaving the scope suffices.) +class blockTimer : public explicitTimer +{ + timer_e timerEnumValue; + public: + blockTimer (timeStat * s, timer_e newTimerEnumValue) : timerEnumValue(newTimerEnumValue), explicitTimer(s) { start(timerEnumValue); } + ~blockTimer() { stop(timerEnumValue); } +}; + +// If all you want is a count, then you can use this... +// The individual per-thread counts will be aggregated into a statistic at program exit. 
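The ENUMERATE definition above, like the expandName definition at the top of kmp_stats.cpp, is an X-macro: the single KMP_FOREACH_TIMER / KMP_FOREACH_COUNTER / KMP_FOREACH_EXPLICIT_TIMER list is expanded once per artifact, so the enums, the name strings and the flag tables cannot drift out of step. A reduced sketch of the technique with invented names:

// --- illustrative sketch, not part of the patch ---
// X-macro technique as used by KMP_FOREACH_* / ENUMERATE / expandName above.
// FOREACH_COLOR, color_e and colorName are invented for this example.
#include <cstdio>

#define FOREACH_COLOR(macro, arg) \
    macro(RED,   arg)             \
    macro(GREEN, arg)             \
    macro(BLUE,  arg)             \
    macro(LAST,  arg)

// One expansion builds the enum...
#define ENUM_ENTRY(name, prefix) prefix##name,
enum color_e { FOREACH_COLOR(ENUM_ENTRY, COLOR_) };
#undef ENUM_ENTRY

// ...another builds the matching string table, kept in sync automatically.
#define NAME_ENTRY(name, ignore) #name,
static const char *colorName[] = { FOREACH_COLOR(NAME_ENTRY, 0) };
#undef NAME_ENTRY

int main() {
    for (int c = 0; c < COLOR_LAST; ++c)
        std::printf("%d -> %s\n", c, colorName[c]);
    return 0;
}
// --- end of sketch ---

Adding an entry to FOREACH_COLOR extends both the enum and the string table in one step, which is exactly what the comments above promise for new timers and counters.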
+class counter +{ + uint64_t value; + static const statInfo counterInfo[]; + + public: + counter() : value(0) {} + void increment() { value++; } + uint64_t getValue() const { return value; } + void reset() { value = 0; } + static const char * name(counter_e e) { return counterInfo[e].name; } + static bool masterOnly (counter_e e) { return counterInfo[e].flags & stats_flags_e::onlyInMaster; } +}; + +/* **************************************************************** + Class to implement an event + + There are four components to an event: start time, stop time + nest_level, and timer_name. + The start and stop time should be obvious (recorded in clock ticks). + The nest_level relates to the bar width in the timeline graph. + The timer_name is used to determine which timer event triggered this event. + + the interface to this class is through four read-only operations: + 1) getStart() -- returns the start time as 64 bit integer + 2) getStop() -- returns the stop time as 64 bit integer + 3) getNestLevel() -- returns the nest level of the event + 4) getTimerName() -- returns the timer name that triggered event + + *MORE ON NEST_LEVEL* + The nest level is used in the bar graph that represents the timeline. + Its main purpose is for showing how events are nested inside eachother. + For example, say events, A, B, and C are recorded. If the timeline + looks like this: + +Begin -------------------------------------------------------------> Time + | | | | | | + A B C C B A + start start start end end end + + Then A, B, C will have a nest level of 1, 2, 3 respectively. + These values are then used to calculate the barwidth so you can + see that inside A, B has occured, and inside B, C has occured. + Currently, this is shown with A's bar width being larger than B's + bar width, and B's bar width being larger than C's bar width. + +**************************************************************** */ +class kmp_stats_event { + uint64_t start; + uint64_t stop; + int nest_level; + timer_e timer_name; + public: + kmp_stats_event() : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} + kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} + inline uint64_t getStart() const { return start; } + inline uint64_t getStop() const { return stop; } + inline int getNestLevel() const { return nest_level; } + inline timer_e getTimerName() const { return timer_name; } +}; + +/* **************************************************************** + Class to implement a dynamically expandable array of events + + --------------------------------------------------------- + | event 1 | event 2 | event 3 | event 4 | ... | event N | + --------------------------------------------------------- + + An event is pushed onto the back of this array at every + explicitTimer->stop() call. The event records the thread #, + start time, stop time, and nest level related to the bar width. + + The event vector starts at size INIT_SIZE and grows (doubles in size) + if needed. An implication of this behavior is that log(N) + reallocations are needed (where N is number of events). If you want + to avoid reallocations, then set INIT_SIZE to a large value. 
+ + the interface to this class is through six operations: + 1) reset() -- sets the internal_size back to 0 but does not deallocate any memory + 2) size() -- returns the number of valid elements in the vector + 3) push_back(start, stop, nest, timer_name) -- pushes an event onto + the back of the array + 4) deallocate() -- frees all memory associated with the vector + 5) sort() -- sorts the vector by start time + 6) operator[index] or at(index) -- returns event reference at that index + +**************************************************************** */ +class kmp_stats_event_vector { + kmp_stats_event* events; + int internal_size; + int allocated_size; + static const int INIT_SIZE = 1024; + public: + kmp_stats_event_vector() { + events = (kmp_stats_event*)__kmp_allocate(sizeof(kmp_stats_event)*INIT_SIZE); + internal_size = 0; + allocated_size = INIT_SIZE; + } + ~kmp_stats_event_vector() {} + inline void reset() { internal_size = 0; } + inline int size() const { return internal_size; } + void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) { + int i; + if(internal_size == allocated_size) { + kmp_stats_event* tmp = (kmp_stats_event*)__kmp_allocate(sizeof(kmp_stats_event)*allocated_size*2); + for(i=0;i<internal_size;i++) tmp[i] = events[i]; + __kmp_free(events); + events = tmp; + allocated_size*=2; + } + events[internal_size] = kmp_stats_event(start_time, stop_time, nest_level, name); + internal_size++; + return; + } + void deallocate(); + void sort(); + const kmp_stats_event & operator[](int index) const { return events[index]; } + kmp_stats_event & operator[](int index) { return events[index]; } + const kmp_stats_event & at(int index) const { return events[index]; } + kmp_stats_event & at(int index) { return events[index]; } +}; + +/* **************************************************************** + Class to implement a doubly-linked, circular, statistics list + + |---| ---> |---| ---> |---| ---> |---| ---> ... next + | | | | | | | | + |---| <--- |---| <--- |---| <--- |---| <--- ... prev + Sentinel first second third + Node node node node + + The Sentinel Node is the user handle on the list. + The first node corresponds to thread 0's statistics. + The second node corresponds to thread 1's statistics and so on... + + Each node has a _timers, _counters, and _explicitTimers array to + hold that thread's statistics. The _explicitTimers + point to the correct _timer and update its statistics at every stop() call. + The explicitTimers' pointers are set up in the constructor. + Each node also has an event vector to hold that thread's timing events. + The event vector expands as necessary and records the start-stop times + for each timer. + + The nestLevel variable is for plotting events and is related + to the bar width in the timeline graph. + + Every thread will have a __thread local pointer to its node in + the list. The sentinel node is used by the master thread to + store "dummy" statistics before __kmp_create_worker() is called. 
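The insertion that keeps this ring consistent is kmp_stats_list::push_back() earlier in kmp_stats.cpp: a new node is spliced in just before the sentinel by rewiring four pointers. Below is a standalone sketch of the same splice and of the iteration convention (start at next, stop when the sentinel comes around again), with invented names and plain new/delete instead of __kmp_allocate and placement new.

// --- illustrative sketch, not part of the patch ---
// Sentinel-based circular doubly-linked list, mirroring kmp_stats_list::push_back().
#include <cstdio>

struct Node {
    int gtid = -1;          // -1 marks the sentinel
    Node *next = this;
    Node *prev = this;
};

// Insert a new node just before the sentinel, i.e. at the tail of the ring.
Node *push_back(Node *sentinel, int gtid) {
    Node *n = new Node;
    n->gtid = gtid;
    n->prev = sentinel->prev;
    n->next = sentinel;
    n->prev->next = n;
    n->next->prev = n;
    return n;
}

int main() {
    Node sentinel;                       // empty ring: sentinel points at itself
    for (int t = 0; t < 4; ++t)
        push_back(&sentinel, t);
    // Walk the ring the way kmp_stats_list::iterator does: begin() is the node
    // after the sentinel, end() is the sentinel itself.
    for (Node *p = sentinel.next; p != &sentinel; p = p->next)
        std::printf("thread %d\n", p->gtid);
    for (Node *p = sentinel.next; p != &sentinel; ) {   // cleanup
        Node *d = p; p = p->next; delete d;
    }
    return 0;
}
// --- end of sketch ---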
+ +**************************************************************** */ +class kmp_stats_list { + int gtid; + timeStat _timers[TIMER_LAST+1]; + counter _counters[COUNTER_LAST+1]; + explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST+1]; + int _nestLevel; // one per thread + kmp_stats_event_vector _event_vector; + kmp_stats_list* next; + kmp_stats_list* prev; + public: + kmp_stats_list() : next(this) , prev(this) , _event_vector(), _nestLevel(0) { +#define doInit(name,ignore1,ignore2) \ + getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name)); + KMP_FOREACH_EXPLICIT_TIMER(doInit,0); +#undef doInit + } + ~kmp_stats_list() { } + inline timeStat * getTimer(timer_e idx) { return &_timers[idx]; } + inline counter * getCounter(counter_e idx) { return &_counters[idx]; } + inline explicitTimer * getExplicitTimer(explicit_timer_e idx) { return &_explicitTimers[idx]; } + inline timeStat * getTimers() { return _timers; } + inline counter * getCounters() { return _counters; } + inline explicitTimer * getExplicitTimers() { return _explicitTimers; } + inline kmp_stats_event_vector & getEventVector() { return _event_vector; } + inline void resetEventVector() { _event_vector.reset(); } + inline void incrementNestValue() { _nestLevel++; } + inline int getNestValue() { return _nestLevel; } + inline void decrementNestValue() { _nestLevel--; } + inline int getGtid() const { return gtid; } + inline void setGtid(int newgtid) { gtid = newgtid; } + kmp_stats_list* push_back(int gtid); // returns newly created list node + inline void push_event(uint64_t start_time, uint64_t stop_time, int nest_level, timer_e name) { + _event_vector.push_back(start_time, stop_time, nest_level, name); + } + void deallocate(); + class iterator; + kmp_stats_list::iterator begin(); + kmp_stats_list::iterator end(); + int size(); + class iterator { + kmp_stats_list* ptr; + friend kmp_stats_list::iterator kmp_stats_list::begin(); + friend kmp_stats_list::iterator kmp_stats_list::end(); + public: + iterator(); + ~iterator(); + iterator operator++(); + iterator operator++(int dummy); + iterator operator--(); + iterator operator--(int dummy); + bool operator!=(const iterator & rhs); + bool operator==(const iterator & rhs); + kmp_stats_list* operator*() const; // dereference operator + }; +}; + +/* **************************************************************** + Class to encapsulate all output functions and the environment variables + + This module holds filenames for various outputs (normal stats, events, plot file), + as well as coloring information for the plot file. + + The filenames and flags variables are read from environment variables. + These are read once by the constructor of the global variable __kmp_stats_output + which calls init(). + + During this init() call, event flags for the timeStat::timerInfo[] global array + are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). + + The only interface function that is public is outputStats(heading). 
This function + should print out everything it needs to, either to files or stderr, + depending on the environment variables described below + + ENVIRONMENT VARIABLES: + KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this file, + otherwise, print to stderr + KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to either + KMP_STATS_FILE or stderr + KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, + otherwise, the plot file is sent to "events.plt" + KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log events + KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, + otherwise, output is sent to "events.dat" + +**************************************************************** */ +class kmp_stats_output_module { + + public: + struct rgb_color { + float r; + float g; + float b; + }; + + private: + static const char* outputFileName; + static const char* eventsFileName; + static const char* plotFileName; + static int printPerThreadFlag; + static int printPerThreadEventsFlag; + static const rgb_color globalColorArray[]; + static rgb_color timerColorInfo[]; + + void init(); + static void setupEventColors(); + static void printPloticusFile(); + static void printStats(FILE *statsOut, statistic const * theStats, bool areTimers); + static void printCounters(FILE * statsOut, counter const * theCounters); + static void printEvents(FILE * eventsOut, kmp_stats_event_vector* theEvents, int gtid); + static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } + static void windupExplicitTimers(); + bool eventPrintingEnabled() { + if(printPerThreadEventsFlag) return true; + else return false; + } + bool perThreadPrintingEnabled() { + if(printPerThreadFlag) return true; + else return false; + } + + public: + kmp_stats_output_module() { init(); } + void outputStats(const char* heading); +}; + +#ifdef __cplusplus +extern "C" { +#endif +void __kmp_stats_init(); +void __kmp_reset_stats(); +void __kmp_output_stats(const char *); +void __kmp_accumulate_stats_at_exit(void); +// thread local pointer to stats node within list +extern __thread kmp_stats_list* __kmp_stats_thread_ptr; +// head to stats list. +extern kmp_stats_list __kmp_stats_list; +// lock for __kmp_stats_list +extern kmp_tas_lock_t __kmp_stats_lock; +// reference start time +extern tsc_tick_count __kmp_stats_start_time; +// interface to output +extern kmp_stats_output_module __kmp_stats_output; + +#ifdef __cplusplus +} +#endif + +// Simple, standard interfaces that drop out completely if stats aren't enabled + + +/*! + * \brief Uses specified timer (name) to time code block. + * + * @param name timer name as specified under the KMP_FOREACH_TIMER() macro + * + * \details Use KMP_TIME_BLOCK(name) macro to time a code block. This will record the time taken in the block + * and use the destructor to stop the timer. Convenient! + * With this definition you can't have more than one KMP_TIME_BLOCK in the same code block. + * I don't think that's a problem. + * + * @ingroup STATS_GATHERING +*/ +#define KMP_TIME_BLOCK(name) \ + blockTimer __BLOCKTIME__(__kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name) + +/*! + * \brief Adds value to specified timer (name). + * + * @param name timer name as specified under the KMP_FOREACH_TIMER() macro + * @param value double precision sample value to add to statistics for the timer + * + * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to a timer statistics. 
+ * + * @ingroup STATS_GATHERING +*/ +#define KMP_COUNT_VALUE(name, value) \ + __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value) + +/*! + * \brief Increments specified counter (name). + * + * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro + * + * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics counter for the executing thread. + * + * @ingroup STATS_GATHERING +*/ +#define KMP_COUNT_BLOCK(name) \ + __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() + +/*! + * \brief "Starts" an explicit timer which will need a corresponding KMP_STOP_EXPLICIT_TIMER() macro. + * + * @param name explicit timer name as specified under the KMP_FOREACH_EXPLICIT_TIMER() macro + * + * \details Use to start a timer. This will need a corresponding KMP_STOP_EXPLICIT_TIMER() + * macro to stop the timer unlike the KMP_TIME_BLOCK(name) macro which has an implicit stopping macro at the end + * of the code block. All explicit timers are stopped at library exit time before the final statistics are outputted. + * + * @ingroup STATS_GATHERING +*/ +#define KMP_START_EXPLICIT_TIMER(name) \ + __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)->start(TIMER_##name) + +/*! + * \brief "Stops" an explicit timer. + * + * @param name explicit timer name as specified under the KMP_FOREACH_EXPLICIT_TIMER() macro + * + * \details Use KMP_STOP_EXPLICIT_TIMER(name) to stop a timer. When this is done, the time between the last KMP_START_EXPLICIT_TIMER(name) + * and this KMP_STOP_EXPLICIT_TIMER(name) will be added to the timer's stat value. The timer will then be reset. + * After the KMP_STOP_EXPLICIT_TIMER(name) macro is called, another call to KMP_START_EXPLICIT_TIMER(name) will start the timer once again. + * + * @ingroup STATS_GATHERING +*/ +#define KMP_STOP_EXPLICIT_TIMER(name) \ + __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)->stop(TIMER_##name) + +/*! + * \brief Outputs the current thread statistics and reset them. + * + * @param heading_string heading put above the final stats output + * + * \details Explicitly stops all timers and outputs all stats. + * Environment variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a filename instead of stderr + * Environment variable, `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific stats + * For now the `OMPTB_STATSTHREADS` environment variable can either be defined with any value, which will print out thread + * specific stats, or it can be undefined (not specified in the environment) and thread specific stats won't be printed + * It should be noted that all statistics are reset when this macro is called. + * + * @ingroup STATS_GATHERING +*/ +#define KMP_OUTPUT_STATS(heading_string) \ + __kmp_output_stats(heading_string) + +/*! + * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) + * + * \details Reset all stats for all threads. 
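Putting the macros above together, an instrumented runtime function needs one line per block timer or counter, while explicit timers bracket regions that have no convenient C++ scope. The fragment below is only a hypothetical usage sketch: the functions are invented, the timer and counter names are taken from the KMP_FOREACH_* lists in this header, and the macros assume KMP_STATS_ENABLED is set and __kmp_stats_thread_ptr has been initialized for the calling thread.

// --- illustrative usage sketch, not part of the patch ---
#include "kmp.h"
#include "kmp_stats.h"

static void hypothetical_barrier_path()
{
    KMP_TIME_BLOCK(KMP_barrier);     // RAII: starts here, stops when the scope ends
    KMP_COUNT_BLOCK(OMP_BARRIER);    // one more barrier executed by this thread
    // ... gather/release work would go here ...
}

static void hypothetical_fork_boundary()
{
    // Explicit timers have no enclosing scope, so they are started and stopped
    // by hand; every start needs a matching stop before the stats are printed.
    KMP_STOP_EXPLICIT_TIMER(OMP_serial);    // leaving thread zero's serial code
    // ... parallel region executes ...
    KMP_START_EXPLICIT_TIMER(OMP_serial);   // back in serial code
}
// --- end of sketch ---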
+ * + * @ingroup STATS_GATHERING +*/ +#define KMP_RESET_STATS() __kmp_reset_stats() + +#else // KMP_STATS_ENABLED + +// Null definitions +#define KMP_TIME_BLOCK(n) ((void)0) +#define KMP_COUNT_VALUE(n,v) ((void)0) +#define KMP_COUNT_BLOCK(n) ((void)0) +#define KMP_START_EXPLICIT_TIMER(n) ((void)0) +#define KMP_STOP_EXPLICIT_TIMER(n) ((void)0) + +#define KMP_OUTPUT_STATS(heading_string) ((void)0) +#define KMP_RESET_STATS() ((void)0) + +#endif // KMP_STATS_ENABLED + +#endif // KMP_STATS_H diff --git a/openmp/runtime/src/kmp_stats_timing.cpp b/openmp/runtime/src/kmp_stats_timing.cpp new file mode 100644 index 00000000000..987ea4f764d --- /dev/null +++ b/openmp/runtime/src/kmp_stats_timing.cpp @@ -0,0 +1,167 @@ +/** @file kmp_stats_timing.cpp + * Timing functions + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + + +#include <stdlib.h> +#include <unistd.h> + +#include <iostream> +#include <iomanip> +#include <sstream> + +#include "kmp_stats_timing.h" + +using namespace std; + +#if KMP_OS_LINUX +# if KMP_MIC +double tsc_tick_count::tick_time() +{ + // pretty bad assumption of 1GHz clock for MIC + return 1/((double)1000*1.e6); +} +# else +# include <string.h> +// Extract the value from the CPUID information +double tsc_tick_count::tick_time() +{ + static double result = 0.0; + + if (result == 0.0) + { + int cpuinfo[4]; + char brand[256]; + + __cpuid(cpuinfo, 0x80000000); + memset(brand, 0, sizeof(brand)); + int ids = cpuinfo[0]; + + for (unsigned int i=2; i<(ids^0x80000000)+2; i++) + __cpuid(brand+(i-2)*sizeof(cpuinfo), i | 0x80000000); + + char * start = &brand[0]; + for (;*start == ' '; start++) + ; + + char * end = brand + strlen(brand) - 3; + uint64_t multiplier; + + if (*end == 'M') multiplier = 1000LL*1000LL; + else if (*end == 'G') multiplier = 1000LL*1000LL*1000LL; + else if (*end == 'T') multiplier = 1000LL*1000LL*1000LL*1000LL; + else + { + cout << "Error determining multiplier '" << *end << "'\n"; + exit (-1); + } + *end = 0; + while (*end != ' ') end--; + end++; + + double freq = strtod(end, &start); + if (freq == 0.0) + { + cout << "Error calculating frequency " << end << "\n"; + exit (-1); + } + + result = ((double)1.0)/(freq * multiplier); + } + return result; +} +# endif +#endif + +static bool useSI = true; + +// Return a formatted string after normalising the value into +// engineering style and using a suitable unit prefix (e.g. ms, us, ns). +std::string formatSI(double interval, int width, char unit) +{ + std::stringstream os; + + if (useSI) + { + // Preserve accuracy for small numbers, since we only multiply and the positive powers + // of ten are precisely representable. 
+ static struct { double scale; char prefix; } ranges[] = { + {1.e12,'f'}, + {1.e9, 'p'}, + {1.e6, 'n'}, + {1.e3, 'u'}, + {1.0, 'm'}, + {1.e-3,' '}, + {1.e-6,'k'}, + {1.e-9,'M'}, + {1.e-12,'G'}, + {1.e-15,'T'}, + {1.e-18,'P'}, + {1.e-21,'E'}, + {1.e-24,'Z'}, + {1.e-27,'Y'} + }; + + if (interval == 0.0) + { + os << std::setw(width-3) << std::right << "0.00" << std::setw(3) << unit; + return os.str(); + } + + bool negative = false; + if (interval < 0.0) + { + negative = true; + interval = -interval; + } + + for (int i=0; i<(int)(sizeof(ranges)/sizeof(ranges[0])); i++) + { + if (interval*ranges[i].scale < 1.e0) + { + interval = interval * 1000.e0 * ranges[i].scale; + os << std::fixed << std::setprecision(2) << std::setw(width-3) << std::right << + (negative ? -interval : interval) << std::setw(2) << ranges[i].prefix << std::setw(1) << unit; + + return os.str(); + } + } + } + os << std::setprecision(2) << std::fixed << std::right << std::setw(width-3) << interval << std::setw(3) << unit; + + return os.str(); +} + +tsc_tick_count::tsc_interval_t computeLastInLastOutInterval(timePair * times, int nTimes) +{ + timePair lastTimes = times[0]; + tsc_tick_count * startp = lastTimes.get_startp(); + tsc_tick_count * endp = lastTimes.get_endp(); + + for (int i=1; i<nTimes; i++) + { + (*startp) = startp->later(times[i].get_start()); + (*endp) = endp->later (times[i].get_end()); + } + + return lastTimes.duration(); +} + +std::string timePair::format() const +{ + std::ostringstream oss; + + oss << start.getValue() << ":" << end.getValue() << " = " << (end-start).getValue(); + + return oss.str(); +} diff --git a/openmp/runtime/src/kmp_stats_timing.h b/openmp/runtime/src/kmp_stats_timing.h new file mode 100644 index 00000000000..2bdfdeadfd0 --- /dev/null +++ b/openmp/runtime/src/kmp_stats_timing.h @@ -0,0 +1,104 @@ +#ifndef KMP_STATS_TIMING_H +#define KMP_STATS_TIMING_H + +/** @file kmp_stats_timing.h + * Access to real time clock and timers. + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + + + +#include <stdint.h> +#include <string> +#include <limits> +#include "kmp_os.h" + +class tsc_tick_count { + private: + int64_t my_count; + + public: + class tsc_interval_t { + int64_t value; + explicit tsc_interval_t(int64_t _value) : value(_value) {} + public: + tsc_interval_t() : value(0) {}; // Construct 0 time duration + double seconds() const; // Return the length of a time interval in seconds + double ticks() const { return double(value); } + int64_t getValue() const { return value; } + + friend class tsc_tick_count; + + friend tsc_interval_t operator-( + const tsc_tick_count t1, const tsc_tick_count t0); + }; + + tsc_tick_count() : my_count(static_cast<int64_t>(__rdtsc())) {}; + tsc_tick_count(int64_t value) : my_count(value) {}; + int64_t getValue() const { return my_count; } + tsc_tick_count later (tsc_tick_count const other) const { + return my_count > other.my_count ? (*this) : other; + } + tsc_tick_count earlier(tsc_tick_count const other) const { + return my_count < other.my_count ? 
(*this) : other; + } + static double tick_time(); // returns seconds per cycle (period) of clock + static tsc_tick_count now() { return tsc_tick_count(); } // returns the rdtsc register value + friend tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0); +}; + +inline tsc_tick_count::tsc_interval_t operator-(const tsc_tick_count t1, const tsc_tick_count t0) +{ + return tsc_tick_count::tsc_interval_t( t1.my_count-t0.my_count ); +} + +inline double tsc_tick_count::tsc_interval_t::seconds() const +{ + return value*tick_time(); +} + +extern std::string formatSI(double interval, int width, char unit); + +inline std::string formatSeconds(double interval, int width) +{ + return formatSI(interval, width, 'S'); +} + +inline std::string formatTicks(double interval, int width) +{ + return formatSI(interval, width, 'T'); +} + +class timePair +{ + tsc_tick_count KMP_ALIGN_CACHE start; + tsc_tick_count end; + +public: + timePair() : start(-std::numeric_limits<int64_t>::max()), end(-std::numeric_limits<int64_t>::max()) {} + tsc_tick_count get_start() const { return start; } + tsc_tick_count get_end() const { return end; } + tsc_tick_count * get_startp() { return &start; } + tsc_tick_count * get_endp() { return &end; } + + void markStart() { start = tsc_tick_count::now(); } + void markEnd() { end = tsc_tick_count::now(); } + void set_start(tsc_tick_count s) { start = s; } + void set_end (tsc_tick_count e) { end = e; } + + tsc_tick_count::tsc_interval_t duration() const { return end-start; } + std::string format() const; + +}; + +extern tsc_tick_count::tsc_interval_t computeLastInLastOutInterval(timePair * times, int nTimes); +#endif // KMP_STATS_TIMING_H diff --git a/openmp/runtime/src/kmp_str.c b/openmp/runtime/src/kmp_str.c index 9c0469fd19d..d9e8d26a498 100644 --- a/openmp/runtime/src/kmp_str.c +++ b/openmp/runtime/src/kmp_str.c @@ -1,7 +1,7 @@ /* * kmp_str.c -- String manipulation routines. - * $Revision: 42810 $ - * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ + * $Revision: 43084 $ + * $Date: 2014-04-15 09:15:14 -0500 (Tue, 15 Apr 2014) $ */ diff --git a/openmp/runtime/src/kmp_str.h b/openmp/runtime/src/kmp_str.h index 7de3a2ec228..1c4fb984323 100644 --- a/openmp/runtime/src/kmp_str.h +++ b/openmp/runtime/src/kmp_str.h @@ -1,7 +1,7 @@ /* * kmp_str.h -- String manipulation routines. - * $Revision: 42613 $ - * $Date: 2013-08-23 13:29:50 -0500 (Fri, 23 Aug 2013) $ + * $Revision: 43435 $ + * $Date: 2014-09-04 15:16:08 -0500 (Thu, 04 Sep 2014) $ */ diff --git a/openmp/runtime/src/kmp_stub.c b/openmp/runtime/src/kmp_stub.c index c1914f4f570..aef8eb1f9c4 100644 --- a/openmp/runtime/src/kmp_stub.c +++ b/openmp/runtime/src/kmp_stub.c @@ -1,7 +1,7 @@ /* * kmp_stub.c -- stub versions of user-callable OpenMP RT functions. - * $Revision: 42826 $ - * $Date: 2013-11-20 03:39:45 -0600 (Wed, 20 Nov 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ @@ -15,13 +15,13 @@ //===----------------------------------------------------------------------===// -#include "kmp_stub.h" - #include <stdlib.h> #include <limits.h> #include <errno.h> -#include "kmp_os.h" // KMP_OS_* +#include "omp.h" // Function renamings. +#include "kmp.h" // KMP_DEFAULT_STKSIZE +#include "kmp_stub.h" #if KMP_OS_WINDOWS #include <windows.h> @@ -29,20 +29,12 @@ #include <sys/time.h> #endif -#include "omp.h" // Function renamings. 
-#include "kmp.h" // KMP_DEFAULT_STKSIZE -#include "kmp_version.h" - // Moved from omp.h -#if OMP_30_ENABLED - #define omp_set_max_active_levels ompc_set_max_active_levels #define omp_set_schedule ompc_set_schedule #define omp_get_ancestor_thread_num ompc_get_ancestor_thread_num #define omp_get_team_size ompc_get_team_size -#endif // OMP_30_ENABLED - #define omp_set_num_threads ompc_set_num_threads #define omp_set_dynamic ompc_set_dynamic #define omp_set_nested ompc_set_nested @@ -95,15 +87,13 @@ static size_t __kmps_init() { void omp_set_num_threads( omp_int_t num_threads ) { i; } void omp_set_dynamic( omp_int_t dynamic ) { i; __kmps_set_dynamic( dynamic ); } void omp_set_nested( omp_int_t nested ) { i; __kmps_set_nested( nested ); } -#if OMP_30_ENABLED - void omp_set_max_active_levels( omp_int_t max_active_levels ) { i; } - void omp_set_schedule( omp_sched_t kind, omp_int_t modifier ) { i; __kmps_set_schedule( (kmp_sched_t)kind, modifier ); } - int omp_get_ancestor_thread_num( omp_int_t level ) { i; return ( level ) ? ( -1 ) : ( 0 ); } - int omp_get_team_size( omp_int_t level ) { i; return ( level ) ? ( -1 ) : ( 1 ); } - int kmpc_set_affinity_mask_proc( int proc, void **mask ) { i; return -1; } - int kmpc_unset_affinity_mask_proc( int proc, void **mask ) { i; return -1; } - int kmpc_get_affinity_mask_proc( int proc, void **mask ) { i; return -1; } -#endif // OMP_30_ENABLED +void omp_set_max_active_levels( omp_int_t max_active_levels ) { i; } +void omp_set_schedule( omp_sched_t kind, omp_int_t modifier ) { i; __kmps_set_schedule( (kmp_sched_t)kind, modifier ); } +int omp_get_ancestor_thread_num( omp_int_t level ) { i; return ( level ) ? ( -1 ) : ( 0 ); } +int omp_get_team_size( omp_int_t level ) { i; return ( level ) ? ( -1 ) : ( 1 ); } +int kmpc_set_affinity_mask_proc( int proc, void **mask ) { i; return -1; } +int kmpc_unset_affinity_mask_proc( int proc, void **mask ) { i; return -1; } +int kmpc_get_affinity_mask_proc( int proc, void **mask ) { i; return -1; } /* kmp API functions */ void kmp_set_stacksize( omp_int_t arg ) { i; __kmps_set_stacksize( arg ); } @@ -178,8 +168,6 @@ int __kmps_get_stacksize( void ) { return __kmps_stacksize; } // __kmps_get_stacksize -#if OMP_30_ENABLED - static kmp_sched_t __kmps_sched_kind = kmp_sched_default; static int __kmps_sched_modifier = 0; @@ -195,8 +183,6 @@ static int __kmps_sched_modifier = 0; *modifier = __kmps_sched_modifier; } // __kmps_get_schedule -#endif // OMP_30_ENABLED - #if OMP_40_ENABLED static kmp_proc_bind_t __kmps_proc_bind = proc_bind_false; diff --git a/openmp/runtime/src/kmp_stub.h b/openmp/runtime/src/kmp_stub.h index 4b3e3ec0c41..44bd1f764f7 100644 --- a/openmp/runtime/src/kmp_stub.h +++ b/openmp/runtime/src/kmp_stub.h @@ -1,7 +1,7 @@ /* * kmp_stub.h - * $Revision: 42061 $ - * $Date: 2013-02-28 16:36:24 -0600 (Thu, 28 Feb 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ @@ -33,7 +33,6 @@ int __kmps_get_nested( void ); void __kmps_set_stacksize( int arg ); int __kmps_get_stacksize(); -#if OMP_30_ENABLED #ifndef KMP_SCHED_TYPE_DEFINED #define KMP_SCHED_TYPE_DEFINED typedef enum kmp_sched { @@ -46,11 +45,10 @@ typedef enum kmp_sched { #endif void __kmps_set_schedule( kmp_sched_t kind, int modifier ); void __kmps_get_schedule( kmp_sched_t *kind, int *modifier ); -#endif // OMP_30_ENABLED #if OMP_40_ENABLED -void __kmps_set_proc_bind( enum kmp_proc_bind_t arg ); -enum kmp_proc_bind_t __kmps_get_proc_bind( void ); +void __kmps_set_proc_bind( kmp_proc_bind_t arg ); +kmp_proc_bind_t 
__kmps_get_proc_bind( void ); #endif /* OMP_40_ENABLED */ double __kmps_get_wtime(); diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp index ac51de2b55d..be426171890 100644 --- a/openmp/runtime/src/kmp_taskdeps.cpp +++ b/openmp/runtime/src/kmp_taskdeps.cpp @@ -19,6 +19,7 @@ #include "kmp.h" #include "kmp_io.h" +#include "kmp_wait_release.h" #if OMP_40_ENABLED @@ -88,20 +89,20 @@ static kmp_dephash_t * __kmp_dephash_create ( kmp_info_t *thread ) { kmp_dephash_t *h; - + kmp_int32 size = kmp_dephash_size * sizeof(kmp_dephash_entry_t) + sizeof(kmp_dephash_t); - + #if USE_FAST_MEMORY h = (kmp_dephash_t *) __kmp_fast_allocate( thread, size ); #else h = (kmp_dephash_t *) __kmp_thread_malloc( thread, size ); #endif -#ifdef KMP_DEBUG +#ifdef KMP_DEBUG h->nelements = 0; #endif h->buckets = (kmp_dephash_entry **)(h+1); - + for ( kmp_int32 i = 0; i < kmp_dephash_size; i++ ) h->buckets[i] = 0; @@ -137,11 +138,11 @@ static kmp_dephash_entry * __kmp_dephash_find ( kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr ) { kmp_int32 bucket = __kmp_dephash_hash(addr); - + kmp_dephash_entry_t *entry; for ( entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket ) if ( entry->addr == addr ) break; - + if ( entry == NULL ) { // create entry. This is only done by one thread so no locking required #if USE_FAST_MEMORY @@ -212,6 +213,8 @@ static inline kmp_int32 __kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, bool dep_barrier,kmp_int32 ndeps, kmp_depend_info_t *dep_list) { + KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d depencies : dep_barrier = %d\n", filter, gtid, ndeps, dep_barrier ) ); + kmp_info_t *thread = __kmp_threads[ gtid ]; kmp_int32 npredecessors=0; for ( kmp_int32 i = 0; i < ndeps ; i++ ) { @@ -232,6 +235,8 @@ __kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, if ( indep->dn.task ) { __kmp_track_dependence(indep,node); indep->dn.successors = __kmp_add_node(thread, indep->dn.successors, node); + KA_TRACE(40,("__kmp_process_deps<%d>: T#%d adding dependence from %p to %p", + filter,gtid, KMP_TASK_TO_TASKDATA(indep->dn.task), KMP_TASK_TO_TASKDATA(node->dn.task))); npredecessors++; } KMP_RELEASE_DEPNODE(gtid,indep); @@ -246,13 +251,16 @@ __kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, if ( last_out->dn.task ) { __kmp_track_dependence(last_out,node); last_out->dn.successors = __kmp_add_node(thread, last_out->dn.successors, node); + KA_TRACE(40,("__kmp_process_deps<%d>: T#%d adding dependence from %p to %p", + filter,gtid, KMP_TASK_TO_TASKDATA(last_out->dn.task), KMP_TASK_TO_TASKDATA(node->dn.task))); + npredecessors++; } KMP_RELEASE_DEPNODE(gtid,last_out); } if ( dep_barrier ) { - // if this is a sync point in the serial sequence and previous outputs are guaranteed to be completed after + // if this is a sync point in the serial sequence, then the previous outputs are guaranteed to be completed after // the execution of this task so the previous output nodes can be cleared. 
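The __kmp_process_deps hunks above add tracing around the core bookkeeping: every prior dependence node gains the new node as a successor, and each such link bumps the new node's predecessor count. Below is a minimal single-threaded model of that scheme, using std::atomic and an illustrative DepNode type rather than the runtime's kmp_depnode_t.

// Minimal model of predecessor/successor dependence bookkeeping (illustrative only).
#include <atomic>
#include <cstdio>
#include <vector>

struct DepNode {
    std::atomic<int> npredecessors{0};
    std::vector<DepNode*> successors;   // updated under a per-node lock in the real runtime
    const char *name;
    explicit DepNode(const char *n) : name(n) {}
};

// While building a new task's dependences: link 'pred' -> 'node' and count the edge locally.
static void add_dependence(DepNode *pred, DepNode *node, int *npredecessors) {
    pred->successors.push_back(node);
    ++*npredecessors;
}

// When 'pred' completes: notify successors, report any whose count drops to zero.
static void release_successors(DepNode *pred) {
    for (DepNode *s : pred->successors)
        if (s->npredecessors.fetch_sub(1) - 1 == 0)
            std::printf("%s is now ready to run\n", s->name);
}

int main() {
    DepNode writer1("writer1"), writer2("writer2"), reader("reader");
    int npred = 0;
    add_dependence(&writer1, &reader, &npred);   // reader depends on both writers
    add_dependence(&writer2, &reader, &npred);
    reader.npredecessors.store(npred);           // publish the accumulated count once
    release_successors(&writer1);                // not ready yet
    release_successors(&writer2);                // prints: reader is now ready to run
}

In the diff, the accumulated count is published with a single atomic add (after accounting for the initial placeholder value) so that releasing predecessors cannot queue the task before dependence processing has finished.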
__kmp_node_deref(thread,last_out); info->last_out = NULL; @@ -265,6 +273,9 @@ __kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, } } + + KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter, gtid, npredecessors ) ); + return npredecessors; } @@ -278,7 +289,10 @@ __kmp_check_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_task_t *task, kmp_de kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ) { int i; - + + kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task); + KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependencies for task %p : %d possibly aliased dependencies, %d non-aliased depedencies : dep_barrier=%d .\n", gtid, taskdata, ndeps, ndeps_noalias, dep_barrier ) ); + // Filter deps in dep_list // TODO: Different algorithm for large dep_list ( > 10 ? ) for ( i = 0; i < ndeps; i ++ ) { @@ -292,8 +306,8 @@ __kmp_check_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_task_t *task, kmp_de } // doesn't need to be atomic as no other thread is going to be accessing this node just yet - // npredecessors is set 1 to ensure that none of the releasing tasks queues this task before we have finished processing all the dependencies - node->dn.npredecessors = 1; + // npredecessors is set -1 to ensure that none of the releasing tasks queues this task before we have finished processing all the dependencies + node->dn.npredecessors = -1; // used to pack all npredecessors additions into a single atomic operation at the end int npredecessors; @@ -301,12 +315,16 @@ __kmp_check_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_task_t *task, kmp_de npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps, dep_list); npredecessors += __kmp_process_deps<false>(gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list); - KMP_TEST_THEN_ADD32(&node->dn.npredecessors, npredecessors); - - // Remove the fake predecessor and find out if there's any outstanding dependence (some tasks may have finished while we processed the dependences) node->dn.task = task; KMP_MB(); - npredecessors = KMP_TEST_THEN_DEC32(&node->dn.npredecessors) - 1; + + // Account for our initial fake value + npredecessors++; + + // Update predecessors and obtain current value to check if there are still any outstandig dependences (some tasks may have finished while we processed the dependences) + npredecessors = KMP_TEST_THEN_ADD32(&node->dn.npredecessors, npredecessors) + npredecessors; + + KA_TRACE(20, ("__kmp_check_deps: T#%d found %d predecessors for task %p \n", gtid, npredecessors, taskdata ) ); // beyond this point the task could be queued (and executed) by a releasing task... return npredecessors > 0 ? 
true : false; @@ -318,11 +336,15 @@ __kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task ) kmp_info_t *thread = __kmp_threads[ gtid ]; kmp_depnode_t *node = task->td_depnode; - if ( task->td_dephash ) + if ( task->td_dephash ) { + KA_TRACE(40, ("__kmp_realease_deps: T#%d freeing dependencies hash of task %p.\n", gtid, task ) ); __kmp_dephash_free(thread,task->td_dephash); + } if ( !node ) return; + KA_TRACE(20, ("__kmp_realease_deps: T#%d notifying succesors of task %p.\n", gtid, task ) ); + KMP_ACQUIRE_DEPNODE(gtid,node); node->dn.task = NULL; // mark this task as finished, so no new dependencies are generated KMP_RELEASE_DEPNODE(gtid,node); @@ -335,9 +357,10 @@ __kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task ) // successor task can be NULL for wait_depends or because deps are still being processed if ( npredecessors == 0 ) { KMP_MB(); - if ( successor->dn.task ) - // loc_ref was already stored in successor's task_data - __kmpc_omp_task(NULL,gtid,successor->dn.task); + if ( successor->dn.task ) { + KA_TRACE(20, ("__kmp_realease_deps: T#%d successor %p of %p scheduled for execution.\n", gtid, successor->dn.task, task ) ); + __kmp_omp_task(gtid,successor->dn.task,false); + } } next = p->next; @@ -350,6 +373,8 @@ __kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task ) } __kmp_node_deref(thread,node); + + KA_TRACE(20, ("__kmp_realease_deps: T#%d all successors of %p notified of completation\n", gtid, task ) ); } /*! @@ -368,15 +393,20 @@ Schedule a non-thread-switchable task with dependences for execution */ kmp_int32 __kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task, - kmp_int32 ndeps, kmp_depend_info_t *dep_list, - kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ) + kmp_int32 ndeps, kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ) { + + kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task); + KA_TRACE(10, ("__kmpc_omp_task_with_deps(enter): T#%d loc=%p task=%p\n", + gtid, loc_ref, new_taskdata ) ); + kmp_info_t *thread = __kmp_threads[ gtid ]; kmp_taskdata_t * current_task = thread->th.th_current_task; bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final; - if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) { + if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) { /* if no dependencies have been tracked yet, create the dependence hash */ if ( current_task->td_dephash == NULL ) current_task->td_dephash = __kmp_dephash_create(thread); @@ -388,13 +418,21 @@ __kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_ta #endif __kmp_init_node(node); - KMP_TASK_TO_TASKDATA(new_task)->td_depnode = node; + new_taskdata->td_depnode = node; if ( __kmp_check_deps( gtid, node, new_task, current_task->td_dephash, NO_DEP_BARRIER, - ndeps, dep_list, ndeps_noalias,noalias_dep_list ) ) + ndeps, dep_list, ndeps_noalias,noalias_dep_list ) ) { + KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had blocking dependencies: " + "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n", gtid, loc_ref, + new_taskdata ) ); return TASK_CURRENT_NOT_QUEUED; + } } + KA_TRACE(10, ("__kmpc_omp_task_with_deps(exit): T#%d task had no blocking dependencies : " + "loc=%p task=%p, transferring to __kmpc_omp_task\n", gtid, loc_ref, + new_taskdata ) ); + return __kmpc_omp_task(loc_ref,gtid,new_task); } @@ -413,35 +451,44 @@ void __kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t 
*dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list ) { - if ( ndeps == 0 && ndeps_noalias == 0 ) return; + KA_TRACE(10, ("__kmpc_omp_wait_deps(enter): T#%d loc=%p\n", gtid, loc_ref) ); + + if ( ndeps == 0 && ndeps_noalias == 0 ) { + KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no dependencies to wait upon : loc=%p\n", gtid, loc_ref) ); + return; + } kmp_info_t *thread = __kmp_threads[ gtid ]; kmp_taskdata_t * current_task = thread->th.th_current_task; - // dependences are not computed in serial teams - if ( current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final) + // We can return immediately as: + // - dependences are not computed in serial teams + // - if the dephash is not yet created it means we have nothing to wait for + if ( current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final || current_task->td_dephash == NULL ) { + KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking dependencies : loc=%p\n", gtid, loc_ref) ); return; - - // if the dephash is not yet created it means we have nothing to wait for - if ( current_task->td_dephash == NULL ) return; + } kmp_depnode_t node; __kmp_init_node(&node); if (!__kmp_check_deps( gtid, &node, NULL, current_task->td_dephash, DEP_BARRIER, - ndeps, dep_list, ndeps_noalias, noalias_dep_list )) + ndeps, dep_list, ndeps_noalias, noalias_dep_list )) { + KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d has no blocking dependencies : loc=%p\n", gtid, loc_ref) ); return; + } int thread_finished = FALSE; + kmp_flag_32 flag((volatile kmp_uint32 *)&(node.dn.npredecessors), 0U); while ( node.dn.npredecessors > 0 ) { - __kmp_execute_tasks( thread, gtid, (volatile kmp_uint32 *)&(node.dn.npredecessors), - 0, FALSE, &thread_finished, + flag.execute_tasks(thread, gtid, FALSE, &thread_finished, #if USE_ITT_BUILD - NULL, + NULL, #endif - __kmp_task_stealing_constraint ); + __kmp_task_stealing_constraint ); } + KA_TRACE(10, ("__kmpc_omp_wait_deps(exit): T#%d finished waiting : loc=%p\n", gtid, loc_ref) ); } #endif /* OMP_40_ENABLED */ diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c index 6607577e69d..9db1565193b 100644 --- a/openmp/runtime/src/kmp_tasking.c +++ b/openmp/runtime/src/kmp_tasking.c @@ -1,7 +1,7 @@ /* * kmp_tasking.c -- OpenMP 3.0 tasking support. 
- * $Revision: 42852 $ - * $Date: 2013-12-04 10:50:49 -0600 (Wed, 04 Dec 2013) $ + * $Revision: 43389 $ + * $Date: 2014-08-11 10:54:01 -0500 (Mon, 11 Aug 2014) $ */ @@ -18,9 +18,9 @@ #include "kmp.h" #include "kmp_i18n.h" #include "kmp_itt.h" +#include "kmp_wait_release.h" -#if OMP_30_ENABLED /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ @@ -31,26 +31,12 @@ static void __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_t static void __kmp_alloc_task_deque( kmp_info_t *thread, kmp_thread_data_t *thread_data ); static int __kmp_realloc_task_threads_data( kmp_info_t *thread, kmp_task_team_t *task_team ); -#ifndef KMP_DEBUG -# define __kmp_static_delay( arg ) /* nothing to do */ -#else - -static void -__kmp_static_delay( int arg ) -{ -/* Work around weird code-gen bug that causes assert to trip */ -# if KMP_ARCH_X86_64 && KMP_OS_LINUX - KMP_ASSERT( arg != 0 ); -# else - KMP_ASSERT( arg >= 0 ); -# endif -} -#endif /* KMP_DEBUG */ - -static void -__kmp_static_yield( int arg ) -{ - __kmp_yield( arg ); +static inline void __kmp_null_resume_wrapper(int gtid, volatile void *flag) { + switch (((kmp_flag_64 *)flag)->get_type()) { + case flag32: __kmp_resume_32(gtid, NULL); break; + case flag64: __kmp_resume_64(gtid, NULL); break; + case flag_oncore: __kmp_resume_oncore(gtid, NULL); break; + } } #ifdef BUILD_TIED_TASK_STACK @@ -605,9 +591,7 @@ __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_tas } #endif /* BUILD_TIED_TASK_STACK */ - KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 ); KMP_DEBUG_ASSERT( taskdata -> td_flags.complete == 0 ); - taskdata -> td_flags.executing = 0; // suspend the finishing task taskdata -> td_flags.complete = 1; // mark the task as completed KMP_DEBUG_ASSERT( taskdata -> td_flags.started == 1 ); KMP_DEBUG_ASSERT( taskdata -> td_flags.freed == 0 ); @@ -624,6 +608,12 @@ __kmp_task_finish( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_tas #endif } + // td_flags.executing must be marked as 0 after __kmp_release_deps has been called + // Othertwise, if a task is executed immediately from the release_deps code + // the flag will be reset to 1 again by this same function + KMP_DEBUG_ASSERT( taskdata -> td_flags.executing == 1 ); + taskdata -> td_flags.executing = 0; // suspend the finishing task + KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n", gtid, taskdata, children) ); @@ -908,7 +898,7 @@ __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags, taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task taskdata->td_dephash = NULL; taskdata->td_depnode = NULL; -#endif +#endif // Only need to keep track of child task counts if team parallel and tasking not serialized if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) { KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) ); @@ -1047,25 +1037,19 @@ __kmpc_omp_task_parts( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task) return TASK_CURRENT_NOT_QUEUED; } - //--------------------------------------------------------------------- -// __kmpc_omp_task: Schedule a non-thread-switchable task for execution -// loc_ref: location of original task pragma (ignored) +// __kmp_omp_task: Schedule a non-thread-switchable task for execution // gtid: Global Thread ID of encountering thread // new_task: 
non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() +// serialize_immediate: if TRUE then if the task is executed immediately its execution will be serialized // returns: // // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later. // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later. - kmp_int32 -__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task) +__kmp_omp_task( kmp_int32 gtid, kmp_task_t * new_task, bool serialize_immediate ) { kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task); - kmp_int32 rc; - - KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", - gtid, loc_ref, new_taskdata ) ); /* Should we execute the new task or queue it? For now, let's just always try to queue it. If the queue fills up, then we'll execute it. */ @@ -1073,16 +1057,41 @@ __kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task) if ( __kmp_push_task( gtid, new_task ) == TASK_NOT_PUSHED ) // if cannot defer { // Execute this task immediately kmp_taskdata_t * current_task = __kmp_threads[ gtid ] -> th.th_current_task; - new_taskdata -> td_flags.task_serial = 1; + if ( serialize_immediate ) + new_taskdata -> td_flags.task_serial = 1; __kmp_invoke_task( gtid, new_task, current_task ); } - KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", - gtid, loc_ref, new_taskdata ) ); return TASK_CURRENT_NOT_QUEUED; } +//--------------------------------------------------------------------- +// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a non-thread-switchable task from +// the parent thread only! +// loc_ref: location of original task pragma (ignored) +// gtid: Global Thread ID of encountering thread +// new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc() +// returns: +// +// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to be resumed later. +// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be resumed later. + +kmp_int32 +__kmpc_omp_task( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task) +{ + kmp_taskdata_t * new_taskdata = KMP_TASK_TO_TASKDATA(new_task); + kmp_int32 res; + + KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", + gtid, loc_ref, new_taskdata ) ); + + res = __kmp_omp_task(gtid,new_task,true); + + KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n", + gtid, loc_ref, new_taskdata ) ); + return res; +} //------------------------------------------------------------------------------------- // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are complete @@ -1117,11 +1126,10 @@ __kmpc_omp_taskwait( ident_t *loc_ref, kmp_int32 gtid ) if ( ! taskdata->td_flags.team_serial ) { // GEH: if team serialized, avoid reading the volatile variable below. 
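Just below, __kmpc_omp_taskwait stops passing a raw spinner/checker pair and instead constructs a kmp_flag_32 over the incomplete-child-task counter with a checker of 0, looping on flag.execute_tasks until the counter drains. Here is a standalone model of that counter-plus-checker wait idiom; CounterFlag and try_execute_one_task are stand-ins, not runtime API.

// Spin until a shared counter reaches the checker value, doing useful work meanwhile.
#include <atomic>
#include <cstdio>
#include <thread>

struct CounterFlag {                       // stand-in for kmp_flag_32(&counter, 0U)
    std::atomic<unsigned> *loc;
    unsigned checker;
    bool done_check() const { return loc->load() == checker; }
};

static bool try_execute_one_task(std::atomic<unsigned> *incomplete_children) {
    // The real code would pop a task from a deque or steal one; here we just retire one child.
    if (incomplete_children->load() == 0) return false;
    incomplete_children->fetch_sub(1);
    return true;
}

int main() {
    std::atomic<unsigned> incomplete_children{3};
    CounterFlag flag{&incomplete_children, 0U};
    while (!flag.done_check()) {
        if (!try_execute_one_task(&incomplete_children))
            std::this_thread::yield();     // nothing local to run: yield, as KMP_YIELD would
    }
    std::printf("all child tasks complete\n");
}

The same pattern appears in __kmpc_end_taskgroup and __kmpc_omp_wait_deps above, just with different counters behind the flag.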
+ kmp_flag_32 flag(&(taskdata->td_incomplete_child_tasks), 0U); while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) { - __kmp_execute_tasks( thread, gtid, &(taskdata->td_incomplete_child_tasks), - 0, FALSE, &thread_finished - USE_ITT_BUILD_ARG(itt_sync_obj), - __kmp_task_stealing_constraint ); + flag.execute_tasks(thread, gtid, FALSE, &thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint ); } } #if USE_ITT_BUILD @@ -1153,7 +1161,7 @@ __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part ) KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n", gtid, loc_ref, end_part) ); - if ( __kmp_tasking_mode != tskm_immediate_exec ) { + if ( __kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel ) { // GEH TODO: shouldn't we have some sort of OMPRAP API calls here to mark begin wait? thread = __kmp_threads[ gtid ]; @@ -1172,11 +1180,14 @@ __kmpc_omp_taskyield( ident_t *loc_ref, kmp_int32 gtid, int end_part ) __kmp_itt_taskwait_starting( gtid, itt_sync_obj ); #endif /* USE_ITT_BUILD */ if ( ! taskdata->td_flags.team_serial ) { - __kmp_execute_tasks( thread, gtid, NULL, 0, FALSE, &thread_finished - USE_ITT_BUILD_ARG(itt_sync_obj), - __kmp_task_stealing_constraint ); + kmp_task_team_t * task_team = thread->th.th_task_team; + if (task_team != NULL) { + if (KMP_TASKING_ENABLED(task_team, thread->th.th_task_state)) { + __kmp_execute_tasks_32( thread, gtid, NULL, FALSE, &thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint ); + } + } } - #if USE_ITT_BUILD if ( itt_sync_obj != NULL ) __kmp_itt_taskwait_finished( gtid, itt_sync_obj ); @@ -1236,11 +1247,10 @@ __kmpc_end_taskgroup( ident_t* loc, int gtid ) #endif /* USE_ITT_BUILD */ if ( ! taskdata->td_flags.team_serial ) { + kmp_flag_32 flag(&(taskgroup->count), 0U); while ( TCR_4(taskgroup->count) != 0 ) { - __kmp_execute_tasks( thread, gtid, &(taskgroup->count), - 0, FALSE, &thread_finished - USE_ITT_BUILD_ARG(itt_sync_obj), - __kmp_task_stealing_constraint ); + flag.execute_tasks(thread, gtid, FALSE, &thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), __kmp_task_stealing_constraint ); } } @@ -1433,7 +1443,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team __kmp_release_bootstrap_lock( & victim_td -> td.td_deque_lock ); - KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#d: task_team=%p " + KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d stole task %p from T#%d: task_team=%p " "ntasks=%d head=%u tail=%u\n", gtid, taskdata, __kmp_gtid_from_thread( victim ), task_team, victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head, @@ -1445,7 +1455,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team //----------------------------------------------------------------------------- -// __kmp_execute_tasks: Choose and execute tasks until either the condition +// __kmp_execute_tasks_template: Choose and execute tasks until either the condition // is statisfied (return true) or there are none left (return false). // final_spin is TRUE if this is the spin at the release barrier. // thread_finished indicates whether the thread is finished executing all @@ -1453,16 +1463,10 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team // spinner is the location on which to spin. // spinner == NULL means only execute a single task and return. // checker is the value to check to terminate the spin. 
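The spinner/checker parameters described above are folded into the flag object itself, and __kmp_execute_tasks becomes a template over the flag class with thin typed wrappers (added further below). The following is a simplified sketch of that shape, with basic_flag standing in for kmp_flag_32/kmp_flag_64; only the exit test is modeled.

// Sketch of the refactoring: one template does the work, typed wrappers keep distinct entry points.
#include <cstdint>
#include <cstdio>

template <typename FlagValue>
struct basic_flag {
    volatile FlagValue *loc;
    FlagValue checker;
    bool done_check() const { return *loc == checker; }
};
using flag_32 = basic_flag<uint32_t>;
using flag_64 = basic_flag<uint64_t>;

template <class C>
static inline int execute_tasks_template(C *flag, int final_spin) {
    // A real implementation would pop or steal tasks here; this only shows the spin-exit test.
    if (flag == nullptr || (!final_spin && flag->done_check()))
        return 1;                       // spin condition satisfied
    return 0;                           // caller keeps spinning
}

int execute_tasks_32(flag_32 *flag, int final_spin) { return execute_tasks_template(flag, final_spin); }
int execute_tasks_64(flag_64 *flag, int final_spin) { return execute_tasks_template(flag, final_spin); }

int main() {
    volatile uint32_t counter = 0;
    flag_32 f{&counter, 0u};
    std::printf("%d\n", execute_tasks_32(&f, /*final_spin=*/0));   // prints 1: already done
}

The real wrappers (__kmp_execute_tasks_32, __kmp_execute_tasks_64, __kmp_execute_tasks_oncore, defined later in kmp_tasking.c) forward to __kmp_execute_tasks_template in the same way.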
- -int -__kmp_execute_tasks( kmp_info_t *thread, - kmp_int32 gtid, - volatile kmp_uint *spinner, - kmp_uint checker, - int final_spin, - int *thread_finished - USE_ITT_BUILD_ARG(void * itt_sync_obj), - kmp_int32 is_constrained ) +template <class C> +static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, + int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) { kmp_task_team_t * task_team; kmp_team_t * team; @@ -1478,7 +1482,7 @@ __kmp_execute_tasks( kmp_info_t *thread, task_team = thread -> th.th_task_team; KMP_DEBUG_ASSERT( task_team != NULL ); - KA_TRACE(15, ("__kmp_execute_tasks(enter): T#%d final_spin=%d *thread_finished=%d\n", + KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n", gtid, final_spin, *thread_finished) ); threads_data = (kmp_thread_data_t *)TCR_PTR(task_team -> tt.tt_threads_data); @@ -1512,8 +1516,8 @@ __kmp_execute_tasks( kmp_info_t *thread, // If this thread is in the last spin loop in the barrier, waiting to be // released, we know that the termination condition will not be satisified, // so don't waste any cycles checking it. - if ((spinner == NULL) || ((!final_spin) && (TCR_4(*spinner) == checker))) { - KA_TRACE(15, ("__kmp_execute_tasks(exit #1): T#%d spin condition satisfied\n", gtid) ); + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #1): T#%d spin condition satisfied\n", gtid) ); return TRUE; } KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task @@ -1527,7 +1531,7 @@ __kmp_execute_tasks( kmp_info_t *thread, // result in the termination condition being satisfied. if (! *thread_finished) { kmp_uint32 count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1; - KA_TRACE(20, ("__kmp_execute_tasks(dec #1): T#%d dec unfinished_threads to %d task_team=%p\n", + KA_TRACE(20, ("__kmp_execute_tasks_template(dec #1): T#%d dec unfinished_threads to %d task_team=%p\n", gtid, count, task_team) ); *thread_finished = TRUE; } @@ -1537,8 +1541,8 @@ __kmp_execute_tasks( kmp_info_t *thread, // thread to pass through the barrier, where it might reset each thread's // th.th_team field for the next parallel region. // If we can steal more work, we know that this has not happened yet. - if ((spinner != NULL) && (TCR_4(*spinner) == checker)) { - KA_TRACE(15, ("__kmp_execute_tasks(exit #2): T#%d spin condition satisfied\n", gtid) ); + if (flag != NULL && flag->done_check()) { + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #2): T#%d spin condition satisfied\n", gtid) ); return TRUE; } } @@ -1569,8 +1573,8 @@ __kmp_execute_tasks( kmp_info_t *thread, #endif /* USE_ITT_BUILD */ // Check to see if this thread can proceed. - if ((spinner == NULL) || ((!final_spin) && (TCR_4(*spinner) == checker))) { - KA_TRACE(15, ("__kmp_execute_tasks(exit #3): T#%d spin condition satisfied\n", + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #3): T#%d spin condition satisfied\n", gtid) ); return TRUE; } @@ -1579,7 +1583,7 @@ __kmp_execute_tasks( kmp_info_t *thread, // If the execution of the stolen task resulted in more tasks being // placed on our run queue, then restart the whole process. 
if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) { - KA_TRACE(20, ("__kmp_execute_tasks: T#%d stolen task spawned other tasks, restart\n", + KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid) ); goto start; } @@ -1596,7 +1600,7 @@ __kmp_execute_tasks( kmp_info_t *thread, // result in the termination condition being satisfied. if (! *thread_finished) { kmp_uint32 count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1; - KA_TRACE(20, ("__kmp_execute_tasks(dec #2): T#%d dec unfinished_threads to %d " + KA_TRACE(20, ("__kmp_execute_tasks_template(dec #2): T#%d dec unfinished_threads to %d " "task_team=%p\n", gtid, count, task_team) ); *thread_finished = TRUE; } @@ -1607,8 +1611,8 @@ __kmp_execute_tasks( kmp_info_t *thread, // thread to pass through the barrier, where it might reset each thread's // th.th_team field for the next parallel region. // If we can steal more work, we know that this has not happened yet. - if ((spinner != NULL) && (TCR_4(*spinner) == checker)) { - KA_TRACE(15, ("__kmp_execute_tasks(exit #4): T#%d spin condition satisfied\n", + if (flag != NULL && flag->done_check()) { + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #4): T#%d spin condition satisfied\n", gtid) ); return TRUE; } @@ -1640,8 +1644,7 @@ __kmp_execute_tasks( kmp_info_t *thread, (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) && (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) { - __kmp_resume( __kmp_gtid_from_thread( other_thread ), NULL ); - + __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc); // A sleeping thread should not have any tasks on it's queue. // There is a slight possibility that it resumes, steals a task from // another thread, which spawns more tasks, all in the that it takes @@ -1677,8 +1680,8 @@ __kmp_execute_tasks( kmp_info_t *thread, } // Check to see if this thread can proceed. - if ((spinner == NULL) || ((!final_spin) && (TCR_4(*spinner) == checker))) { - KA_TRACE(15, ("__kmp_execute_tasks(exit #5): T#%d spin condition satisfied\n", + if (flag == NULL || (!final_spin && flag->done_check())) { + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #5): T#%d spin condition satisfied\n", gtid) ); return TRUE; } @@ -1687,7 +1690,7 @@ __kmp_execute_tasks( kmp_info_t *thread, // If the execution of the stolen task resulted in more tasks being // placed on our run queue, then restart the whole process. if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) { - KA_TRACE(20, ("__kmp_execute_tasks: T#%d stolen task spawned other tasks, restart\n", + KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid) ); goto start; } @@ -1704,7 +1707,7 @@ __kmp_execute_tasks( kmp_info_t *thread, // result in the termination condition being satisfied. if (! *thread_finished) { kmp_uint32 count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1; - KA_TRACE(20, ("__kmp_execute_tasks(dec #3): T#%d dec unfinished_threads to %d; " + KA_TRACE(20, ("__kmp_execute_tasks_template(dec #3): T#%d dec unfinished_threads to %d; " "task_team=%p\n", gtid, count, task_team) ); *thread_finished = TRUE; @@ -1716,18 +1719,42 @@ __kmp_execute_tasks( kmp_info_t *thread, // thread to pass through the barrier, where it might reset each thread's // th.th_team field for the next parallel region. // If we can steal more work, we know that this has not happened yet. 
- if ((spinner != NULL) && (TCR_4(*spinner) == checker)) { - KA_TRACE(15, ("__kmp_execute_tasks(exit #6): T#%d spin condition satisfied\n", - gtid) ); + if (flag != NULL && flag->done_check()) { + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #6): T#%d spin condition satisfied\n", gtid) ); return TRUE; } } } - KA_TRACE(15, ("__kmp_execute_tasks(exit #7): T#%d can't find work\n", gtid) ); + KA_TRACE(15, ("__kmp_execute_tasks_template(exit #7): T#%d can't find work\n", gtid) ); return FALSE; } +int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin, + int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) +{ + return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + +int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin, + int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) +{ + return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + +int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, + int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) +{ + return __kmp_execute_tasks_template(thread, gtid, flag, final_spin, thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); +} + + //----------------------------------------------------------------------------- // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the @@ -1770,7 +1797,7 @@ __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr ) // tasks and execute them. In extra barrier mode, tasks do not sleep // at the separate tasking barrier, so this isn't a problem. for (i = 0; i < nthreads; i++) { - volatile kmp_uint *sleep_loc; + volatile void *sleep_loc; kmp_info_t *thread = threads_data[i].td.td_thr; if (i == this_thr->th.th_info.ds.ds_tid) { @@ -1779,17 +1806,16 @@ __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr ) // Since we haven't locked the thread's suspend mutex lock at this // point, there is a small window where a thread might be putting // itself to sleep, but hasn't set the th_sleep_loc field yet. - // To work around this, __kmp_execute_tasks() periodically checks + // To work around this, __kmp_execute_tasks_template() periodically checks // see if other threads are sleeping (using the same random // mechanism that is used for task stealing) and awakens them if // they are. - if ( ( sleep_loc = (volatile kmp_uint *) - TCR_PTR( thread -> th.th_sleep_loc) ) != NULL ) + if ( ( sleep_loc = TCR_PTR( thread -> th.th_sleep_loc) ) != NULL ) { KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread( this_thr ), __kmp_gtid_from_thread( thread ) ) ); - __kmp_resume( __kmp_gtid_from_thread( thread ), sleep_loc ); + __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); } else { KF_TRACE( 50, ( "__kmp_enable_tasking: T#%d don't wake up thread T#%d\n", @@ -1805,7 +1831,7 @@ __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr ) /* ------------------------------------------------------------------------ */ -/* +/* // TODO: Check the comment consistency * Utility routines for "task teams". 
A task team (kmp_task_t) is kind of * like a shadow of the kmp_team_t data struct, with a different lifetime. * After a child * thread checks into a barrier and calls __kmp_release() from @@ -1839,6 +1865,7 @@ __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr ) * barriers, when no explicit tasks were spawned (pushed, actually). */ + static kmp_task_team_t *__kmp_free_task_teams = NULL; // Free list for task_team data structures // Lock for task team data structures static kmp_bootstrap_lock_t __kmp_task_team_lock = KMP_BOOTSTRAP_LOCK_INITIALIZER( __kmp_task_team_lock ); @@ -2193,7 +2220,6 @@ __kmp_wait_to_unref_task_teams(void) thread != NULL; thread = thread->th.th_next_pool) { - volatile kmp_uint *sleep_loc; #if KMP_OS_WINDOWS DWORD exit_val; #endif @@ -2218,11 +2244,12 @@ __kmp_wait_to_unref_task_teams(void) __kmp_gtid_from_thread( thread ) ) ); if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) { + volatile void *sleep_loc; // If the thread is sleeping, awaken it. - if ( ( sleep_loc = (volatile kmp_uint *) TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) { + if ( ( sleep_loc = TCR_PTR( thread->th.th_sleep_loc) ) != NULL ) { KA_TRACE( 10, ( "__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n", __kmp_gtid_from_thread( thread ), __kmp_gtid_from_thread( thread ) ) ); - __kmp_resume( __kmp_gtid_from_thread( thread ), sleep_loc ); + __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc); } } } @@ -2350,9 +2377,9 @@ __kmp_task_team_wait( kmp_info_t *this_thr, // contention, only the master thread checks for the // termination condition. // - __kmp_wait_sleep( this_thr, &task_team->tt.tt_unfinished_threads, 0, TRUE - USE_ITT_BUILD_ARG(itt_sync_obj) - ); + kmp_flag_32 flag(&task_team->tt.tt_unfinished_threads, 0U); + flag.wait(this_thr, TRUE + USE_ITT_BUILD_ARG(itt_sync_obj)); // // Kill the old task team, so that the worker threads will @@ -2390,8 +2417,9 @@ __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid ) #if USE_ITT_BUILD KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL ); #endif /* USE_ITT_BUILD */ - while (! __kmp_execute_tasks( thread, gtid, spin, 0, TRUE, &flag - USE_ITT_BUILD_ARG(NULL), 0 ) ) { + kmp_flag_32 spin_flag(spin, 0U); + while (! spin_flag.execute_tasks(thread, gtid, TRUE, &flag + USE_ITT_BUILD_ARG(NULL), 0 ) ) { #if USE_ITT_BUILD // TODO: What about itt_sync_obj?? KMP_FSYNC_SPIN_PREPARE( spin ); @@ -2409,5 +2437,3 @@ __kmp_tasking_barrier( kmp_team_t *team, kmp_info_t *thread, int gtid ) #endif /* USE_ITT_BUILD */ } -#endif // OMP_30_ENABLED - diff --git a/openmp/runtime/src/kmp_taskq.c b/openmp/runtime/src/kmp_taskq.c index 79ba3f3e72a..3a276b55ad5 100644 --- a/openmp/runtime/src/kmp_taskq.c +++ b/openmp/runtime/src/kmp_taskq.c @@ -1,7 +1,7 @@ /* * kmp_taskq.c -- TASKQ support for OpenMP. - * $Revision: 42582 $ - * $Date: 2013-08-09 06:30:22 -0500 (Fri, 09 Aug 2013) $ + * $Revision: 43389 $ + * $Date: 2014-08-11 10:54:01 -0500 (Mon, 11 Aug 2014) $ */ @@ -33,23 +33,6 @@ #define THREAD_ALLOC_FOR_TASKQ -static void -__kmp_static_delay( int arg ) -{ -/* Work around weird code-gen bug that causes assert to trip */ -#if KMP_ARCH_X86_64 && KMP_OS_LINUX - KMP_ASSERT( arg != 0 ); -#else - KMP_ASSERT( arg >= 0 ); -#endif -} - -static void -__kmp_static_yield( int arg ) -{ - __kmp_yield( arg ); -} - static int in_parallel_context( kmp_team_t *team ) { @@ -790,7 +773,7 @@ __kmp_dequeue_task (kmp_int32 global_tid, kmpc_task_queue_t *queue, int in_paral * 1. 
Walk up the task queue tree from the current queue's parent and look * on the way up (for loop, below). * 2. Do a depth-first search back down the tree from the root and - * look (find_task_in_descandent_queue()). + * look (find_task_in_descendant_queue()). * * Here are the rules for deciding which task to take from a queue * (__kmp_find_task_in_queue ()): @@ -1608,7 +1591,6 @@ __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *taskq_thunk) && (! __kmp_taskq_has_any_children(queue) ) && (! (queue->tq_flags & TQF_ALL_TASKS_QUEUED) ) ) { - __kmp_static_delay( 1 ); KMP_YIELD_WHEN( TRUE, spins ); } diff --git a/openmp/runtime/src/kmp_threadprivate.c b/openmp/runtime/src/kmp_threadprivate.c index 6ab79690eb5..4bf2701f245 100644 --- a/openmp/runtime/src/kmp_threadprivate.c +++ b/openmp/runtime/src/kmp_threadprivate.c @@ -1,7 +1,7 @@ /* * kmp_threadprivate.c -- OpenMP threadprivate support library - * $Revision: 42618 $ - * $Date: 2013-08-27 09:15:45 -0500 (Tue, 27 Aug 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_utility.c b/openmp/runtime/src/kmp_utility.c index faf3916b2d1..180fceaeba1 100644 --- a/openmp/runtime/src/kmp_utility.c +++ b/openmp/runtime/src/kmp_utility.c @@ -1,7 +1,7 @@ /* * kmp_utility.c -- Utility routines for the OpenMP support library. - * $Revision: 42588 $ - * $Date: 2013-08-13 01:26:00 -0500 (Tue, 13 Aug 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_version.c b/openmp/runtime/src/kmp_version.c index b836c39d192..77a24e8aa00 100644 --- a/openmp/runtime/src/kmp_version.c +++ b/openmp/runtime/src/kmp_version.c @@ -1,7 +1,7 @@ /* * kmp_version.c - * $Revision: 42806 $ - * $Date: 2013-11-05 16:16:45 -0600 (Tue, 05 Nov 2013) $ + * $Revision: 43435 $ + * $Date: 2014-09-04 15:16:08 -0500 (Thu, 04 Sep 2014) $ */ @@ -20,7 +20,7 @@ #include "kmp_version.h" // Replace with snapshot date YYYYMMDD for promotion build. -#define KMP_VERSION_BUILD 00000000 +#define KMP_VERSION_BUILD 20140926 // Helper macros to convert value of macro to string literal. #define _stringer( x ) #x @@ -46,6 +46,8 @@ #define KMP_COMPILER "Intel C++ Compiler 14.0" #elif __INTEL_COMPILER == 1410 #define KMP_COMPILER "Intel C++ Compiler 14.1" + #elif __INTEL_COMPILER == 1500 + #define KMP_COMPILER "Intel C++ Compiler 15.0" #elif __INTEL_COMPILER == 9999 #define KMP_COMPILER "Intel C++ Compiler mainline" #endif @@ -54,7 +56,7 @@ #elif KMP_COMPILER_GCC #define KMP_COMPILER "GCC " stringer( __GNUC__ ) "." stringer( __GNUC_MINOR__ ) #elif KMP_COMPILER_MSVC - #define KMP_COMPILER "MSVC " stringer( __MSC_FULL_VER ) + #define KMP_COMPILER "MSVC " stringer( _MSC_FULL_VER ) #endif #ifndef KMP_COMPILER #warning "Unknown compiler" @@ -77,7 +79,7 @@ // Finally, define strings. #define KMP_LIBRARY KMP_LIB_TYPE " library (" KMP_LINK_TYPE ")" -#define KMP_COPYRIGHT "Copyright (C) 1997-2013, Intel Corporation. All Rights Reserved." +#define KMP_COPYRIGHT "" int const __kmp_version_major = KMP_VERSION_MAJOR; int const __kmp_version_minor = KMP_VERSION_MINOR; @@ -85,10 +87,8 @@ int const __kmp_version_build = KMP_VERSION_BUILD; int const __kmp_openmp_version = #if OMP_40_ENABLED 201307; - #elif OMP_30_ENABLED - 201107; #else - 200505; + 201107; #endif /* Do NOT change the format of this string! 
Intel(R) Thread Profiler checks for a @@ -128,7 +128,6 @@ __kmp_print_version_1( void ) kmp_str_buf_t buffer; __kmp_str_buf_init( & buffer ); // Print version strings skipping initial magic. - __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_copyright[ KMP_VERSION_MAGIC_LEN ] ); __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_lib_ver[ KMP_VERSION_MAGIC_LEN ] ); __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_lib_type[ KMP_VERSION_MAGIC_LEN ] ); __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_link_type[ KMP_VERSION_MAGIC_LEN ] ); @@ -164,8 +163,6 @@ __kmp_print_version_1( void ) ); // __kmp_str_buf_print }; // for i __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_lock[ KMP_VERSION_MAGIC_LEN ] ); - __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_perf_v19[ KMP_VERSION_MAGIC_LEN ] ); - __kmp_str_buf_print( & buffer, "%s\n", & __kmp_version_perf_v106[ KMP_VERSION_MAGIC_LEN ] ); #endif __kmp_str_buf_print( & buffer, diff --git a/openmp/runtime/src/kmp_version.h b/openmp/runtime/src/kmp_version.h index 0bd2c4235bd..76e1e631b67 100644 --- a/openmp/runtime/src/kmp_version.h +++ b/openmp/runtime/src/kmp_version.h @@ -1,7 +1,7 @@ /* * kmp_version.h -- version number for this release - * $Revision: 42181 $ - * $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ + * $Revision: 42982 $ + * $Date: 2014-02-12 10:11:02 -0600 (Wed, 12 Feb 2014) $ */ @@ -55,8 +55,6 @@ extern char const __kmp_version_alt_comp[]; extern char const __kmp_version_omp_api[]; // ??? extern char const __kmp_version_debug[]; extern char const __kmp_version_lock[]; -extern char const __kmp_version_perf_v19[]; -extern char const __kmp_version_perf_v106[]; extern char const __kmp_version_nested_stats_reporting[]; extern char const __kmp_version_ftnstdcall[]; extern char const __kmp_version_ftncdecl[]; diff --git a/openmp/runtime/src/kmp_wait_release.cpp b/openmp/runtime/src/kmp_wait_release.cpp new file mode 100644 index 00000000000..c3e228f3a61 --- /dev/null +++ b/openmp/runtime/src/kmp_wait_release.cpp @@ -0,0 +1,52 @@ +/* + * kmp_wait_release.cpp -- Wait/Release implementation + * $Revision: 43417 $ + * $Date: 2014-08-26 14:06:38 -0500 (Tue, 26 Aug 2014) $ + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. 
+// +//===----------------------------------------------------------------------===// + +#include "kmp_wait_release.h" + +void __kmp_wait_32(kmp_info_t *this_thr, kmp_flag_32 *flag, int final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + __kmp_wait_template(this_thr, flag, final_spin + USE_ITT_BUILD_ARG(itt_sync_obj) ); +} + +void __kmp_wait_64(kmp_info_t *this_thr, kmp_flag_64 *flag, int final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + __kmp_wait_template(this_thr, flag, final_spin + USE_ITT_BUILD_ARG(itt_sync_obj) ); +} + +void __kmp_wait_oncore(kmp_info_t *this_thr, kmp_flag_oncore *flag, int final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + __kmp_wait_template(this_thr, flag, final_spin + USE_ITT_BUILD_ARG(itt_sync_obj) ); +} + + + +void __kmp_release_32(kmp_flag_32 *flag) { + __kmp_release_template(flag); +} + +void __kmp_release_64(kmp_flag_64 *flag) { + __kmp_release_template(flag); +} + +void __kmp_release_oncore(kmp_flag_oncore *flag) { + __kmp_release_template(flag); +} diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h new file mode 100644 index 00000000000..599dcf0d772 --- /dev/null +++ b/openmp/runtime/src/kmp_wait_release.h @@ -0,0 +1,496 @@ +/* + * kmp_wait_release.h -- Wait/Release implementation + * $Revision: 43417 $ + * $Date: 2014-08-26 14:06:38 -0500 (Tue, 26 Aug 2014) $ + */ + + +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// + + +#ifndef KMP_WAIT_RELEASE_H +#define KMP_WAIT_RELEASE_H + +#include "kmp.h" +#include "kmp_itt.h" + +/*! +@defgroup WAIT_RELEASE Wait/Release operations + +The definitions and functions here implement the lowest level thread +synchronizations of suspending a thread and awaking it. They are used +to build higher level operations such as barriers and fork/join. +*/ + +/*! +@ingroup WAIT_RELEASE +@{ +*/ + +/*! + * The flag_type describes the storage used for the flag. + */ +enum flag_type { + flag32, /**< 32 bit flags */ + flag64, /**< 64 bit flags */ + flag_oncore /**< special 64-bit flag for on-core barrier (hierarchical) */ +}; + +/*! + * Base class for wait/release volatile flag + */ +template <typename P> +class kmp_flag { + volatile P * loc; /**< Pointer to the flag storage that is modified by another thread */ + flag_type t; /**< "Type" of the flag in loc */ + public: + typedef P flag_t; + kmp_flag(volatile P *p, flag_type ft) : loc(p), t(ft) {} + /*! + * @result the pointer to the actual flag + */ + volatile P * get() { return loc; } + /*! + * @result the flag_type + */ + flag_type get_type() { return t; } + // Derived classes must provide the following: + /* + kmp_info_t * get_waiter(kmp_uint32 i); + kmp_uint32 get_num_waiters(); + bool done_check(); + bool done_check_val(P old_loc); + bool notdone_check(); + P internal_release(); + P set_sleeping(); + P unset_sleeping(); + bool is_sleeping(); + bool is_sleeping_val(P old_loc); + */ +}; + +/* Spin wait loop that first does pause, then yield, then sleep. A thread that calls __kmp_wait_* + must make certain that another thread calls __kmp_release to wake it back up to prevent deadlocks! 
*/ +template <class C> +static inline void __kmp_wait_template(kmp_info_t *this_thr, C *flag, int final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj) ) +{ + // NOTE: We may not belong to a team at this point. + volatile typename C::flag_t *spin = flag->get(); + kmp_uint32 spins; + kmp_uint32 hibernate; + int th_gtid; + int tasks_completed = FALSE; + + KMP_FSYNC_SPIN_INIT(spin, NULL); + if (flag->done_check()) { + KMP_FSYNC_SPIN_ACQUIRED(spin); + return; + } + th_gtid = this_thr->th.th_info.ds.ds_gtid; + KA_TRACE(20, ("__kmp_wait_sleep: T#%d waiting for flag(%p)\n", th_gtid, flag)); + + // Setup for waiting + KMP_INIT_YIELD(spins); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // The worker threads cannot rely on the team struct existing at this point. + // Use the bt values cached in the thread struct instead. +#ifdef KMP_ADJUST_BLOCKTIME + if (__kmp_zero_bt && !this_thr->th.th_team_bt_set) + // Force immediate suspend if not set by user and more threads than available procs + hibernate = 0; + else + hibernate = this_thr->th.th_team_bt_intervals; +#else + hibernate = this_thr->th.th_team_bt_intervals; +#endif /* KMP_ADJUST_BLOCKTIME */ + + /* If the blocktime is nonzero, we want to make sure that we spin wait for the entirety + of the specified #intervals, plus up to one interval more. This increment make + certain that this thread doesn't go to sleep too soon. */ + if (hibernate != 0) + hibernate++; + + // Add in the current time value. + hibernate += TCR_4(__kmp_global.g.g_time.dt.t_value); + KF_TRACE(20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n", + th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate, + hibernate - __kmp_global.g.g_time.dt.t_value)); + } + KMP_MB(); + + // Main wait spin loop + while (flag->notdone_check()) { + int in_pool; + + /* If the task team is NULL, it means one of things: + 1) A newly-created thread is first being released by __kmp_fork_barrier(), and + its task team has not been set up yet. + 2) All tasks have been executed to completion, this thread has decremented the task + team's ref ct and possibly deallocated it, and should no longer reference it. + 3) Tasking is off for this region. This could be because we are in a serialized region + (perhaps the outer one), or else tasking was manually disabled (KMP_TASKING=0). */ + kmp_task_team_t * task_team = NULL; + if (__kmp_tasking_mode != tskm_immediate_exec) { + task_team = this_thr->th.th_task_team; + if (task_team != NULL) { + if (!TCR_SYNC_4(task_team->tt.tt_active)) { + KMP_DEBUG_ASSERT(!KMP_MASTER_TID(this_thr->th.th_info.ds.ds_tid)); + __kmp_unref_task_team(task_team, this_thr); + } else if (KMP_TASKING_ENABLED(task_team, this_thr->th.th_task_state)) { + flag->execute_tasks(this_thr, th_gtid, final_spin, &tasks_completed + USE_ITT_BUILD_ARG(itt_sync_obj), 0); + } + } // if + } // if + + KMP_FSYNC_SPIN_PREPARE(spin); + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } + + // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput), then yield + KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc); + // TODO: Should it be number of cores instead of thread contexts? Like: + // KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores); + // Need performance improvement data to make the change... + KMP_YIELD_SPIN(spins); + + // Check if this thread was transferred from a team + // to the thread pool (or vice-versa) while spinning. 
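The first half of __kmp_wait_template above computes a hibernate deadline: the cached block-time interval count, plus one extra interval so the thread never sleeps early, added to the current value of the global time counter; the spin loop only suspends once that counter passes the deadline. Below is a toy model of just that deadline logic, where g_time stands in for __kmp_global.g.g_time.dt.t_value (advanced by a monitor thread in the real runtime) and the names are illustrative only.

// Toy model of the hibernate deadline used by the wait template.
#include <atomic>
#include <cstdio>

static std::atomic<int> g_time{0};          // advanced periodically by a monitor thread

int main() {
    int bt_intervals = 4;                   // per-thread cached block-time intervals
    int hibernate = bt_intervals;
    if (hibernate != 0)
        hibernate++;                        // spin the full #intervals, plus up to one more
    hibernate += g_time.load();             // turn it into an absolute deadline

    for (bool done = false; !done; ) {      // stand-in for while (flag->notdone_check())
        g_time.fetch_add(1);                // pretend the monitor ticked
        if (g_time.load() < hibernate)
            continue;                       // keep spinning (executing tasks, yielding)
        std::printf("tick %d reached deadline %d: suspend on the flag\n",
                    g_time.load(), hibernate);
        done = true;                        // the real code calls flag->suspend(th_gtid) here
    }
}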
+ in_pool = !!TCR_4(this_thr->th.th_in_pool); + if (in_pool != !!this_thr->th.th_active_in_pool) { + if (in_pool) { // Recently transferred from team to pool + KMP_TEST_THEN_INC32((kmp_int32 *)&__kmp_thread_pool_active_nth); + this_thr->th.th_active_in_pool = TRUE; + /* Here, we cannot assert that: + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) <= __kmp_thread_pool_nth); + __kmp_thread_pool_nth is inc/dec'd by the master thread while the fork/join + lock is held, whereas __kmp_thread_pool_active_nth is inc/dec'd asynchronously + by the workers. The two can get out of sync for brief periods of time. */ + } + else { // Recently transferred from pool to team + KMP_TEST_THEN_DEC32((kmp_int32 *) &__kmp_thread_pool_active_nth); + KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); + this_thr->th.th_active_in_pool = FALSE; + } + } + + // Don't suspend if KMP_BLOCKTIME is set to "infinite" + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) + continue; + + // Don't suspend if there is a likelihood of new tasks being spawned. + if ((task_team != NULL) && TCR_4(task_team->tt.tt_found_tasks)) + continue; + + // If we have waited a bit more, fall asleep + if (TCR_4(__kmp_global.g.g_time.dt.t_value) < hibernate) + continue; + + KF_TRACE(50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid)); + + flag->suspend(th_gtid); + + if (TCR_4(__kmp_global.g.g_done)) { + if (__kmp_global.g.g_abort) + __kmp_abort_thread(); + break; + } + // TODO: If thread is done with work and times out, disband/free + } + KMP_FSYNC_SPIN_ACQUIRED(spin); +} + +/* Release any threads specified as waiting on the flag by releasing the flag and resume the waiting thread + if indicated by the sleep bit(s). A thread that calls __kmp_wait_template must call this function to wake + up the potentially sleeping thread and prevent deadlocks! */ +template <class C> +static inline void __kmp_release_template(C *flag) +{ +#ifdef KMP_DEBUG + // FIX ME + kmp_info_t * wait_thr = flag->get_waiter(0); + int target_gtid = wait_thr->th.th_info.ds.ds_gtid; + int gtid = TCR_4(__kmp_init_gtid) ? 
__kmp_get_gtid() : -1; +#endif + KF_TRACE(20, ("__kmp_release: T#%d releasing T#%d spin(%p)\n", gtid, target_gtid, flag->get())); + KMP_DEBUG_ASSERT(flag->get()); + KMP_FSYNC_RELEASING(flag->get()); + + typename C::flag_t old_spin = flag->internal_release(); + + KF_TRACE(100, ("__kmp_release: T#%d old spin(%p)=%d, set new spin=%d\n", + gtid, flag->get(), old_spin, *(flag->get()))); + + if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { + // Only need to check sleep stuff if infinite block time not set + if (flag->is_sleeping_val(old_spin)) { + for (unsigned int i=0; i<flag->get_num_waiters(); ++i) { + kmp_info_t * waiter = flag->get_waiter(i); + int wait_gtid = waiter->th.th_info.ds.ds_gtid; + // Wake up thread if needed + KF_TRACE(50, ("__kmp_release: T#%d waking up thread T#%d since sleep spin(%p) set\n", + gtid, wait_gtid, flag->get())); + flag->resume(wait_gtid); + } + } else { + KF_TRACE(50, ("__kmp_release: T#%d don't wake up thread T#%d since sleep spin(%p) not set\n", + gtid, target_gtid, flag->get())); + } + } +} + +template <typename FlagType> +struct flag_traits {}; + +template <> +struct flag_traits<kmp_uint32> { + typedef kmp_uint32 flag_t; + static const flag_type t = flag32; + static inline flag_t tcr(flag_t f) { return TCR_4(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { return KMP_TEST_THEN_ADD4_32((volatile kmp_int32 *)f); } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_OR32((volatile kmp_int32 *)f, v); } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_AND32((volatile kmp_int32 *)f, v); } +}; + +template <> +struct flag_traits<kmp_uint64> { + typedef kmp_uint64 flag_t; + static const flag_type t = flag64; + static inline flag_t tcr(flag_t f) { return TCR_8(f); } + static inline flag_t test_then_add4(volatile flag_t *f) { return KMP_TEST_THEN_ADD4_64((volatile kmp_int64 *)f); } + static inline flag_t test_then_or(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_OR64((volatile kmp_int64 *)f, v); } + static inline flag_t test_then_and(volatile flag_t *f, flag_t v) { return KMP_TEST_THEN_AND64((volatile kmp_int64 *)f, v); } +}; + +template <typename FlagType> +class kmp_basic_flag : public kmp_flag<FlagType> { + typedef flag_traits<FlagType> traits_type; + FlagType checker; /**< Value to compare flag to to check if flag has been released. */ + kmp_info_t * waiting_threads[1]; /**< Array of threads sleeping on this thread. */ + kmp_uint32 num_waiting_threads; /**< Number of threads sleeping on this thread. */ +public: + kmp_basic_flag(volatile FlagType *p) : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(0) {} + kmp_basic_flag(volatile FlagType *p, kmp_info_t *thr) : kmp_flag<FlagType>(p, traits_type::t), num_waiting_threads(1) { + waiting_threads[0] = thr; + } + kmp_basic_flag(volatile FlagType *p, FlagType c) : kmp_flag<FlagType>(p, traits_type::t), checker(c), num_waiting_threads(0) {} + /*! + * param i in index into waiting_threads + * @result the thread that is waiting at index i + */ + kmp_info_t * get_waiter(kmp_uint32 i) { + KMP_DEBUG_ASSERT(i<num_waiting_threads); + return waiting_threads[i]; + } + /*! + * @result num_waiting_threads + */ + kmp_uint32 get_num_waiters() { return num_waiting_threads; } + /*! + * @param thr in the thread which is now waiting + * + * Insert a waiting thread at index 0. + */ + void set_waiter(kmp_info_t *thr) { + waiting_threads[0] = thr; + num_waiting_threads = 1; + } + /*! 
+ * @result true if the flag object has been released. + */ + bool done_check() { return traits_type::tcr(*(this->get())) == checker; } + /*! + * @param old_loc in old value of flag + * @result true if the flag's old value indicates it was released. + */ + bool done_check_val(FlagType old_loc) { return old_loc == checker; } + /*! + * @result true if the flag object is not yet released. + * Used in __kmp_wait_template like: + * @code + * while (flag.notdone_check()) { pause(); } + * @endcode + */ + bool notdone_check() { return traits_type::tcr(*(this->get())) != checker; } + /*! + * @result Actual flag value before release was applied. + * Trigger all waiting threads to run by modifying flag to release state. + */ + FlagType internal_release() { + return traits_type::test_then_add4((volatile FlagType *)this->get()); + } + /*! + * @result Actual flag value before sleep bit(s) set. + * Notes that there is at least one thread sleeping on the flag by setting sleep bit(s). + */ + FlagType set_sleeping() { + return traits_type::test_then_or((volatile FlagType *)this->get(), KMP_BARRIER_SLEEP_STATE); + } + /*! + * @result Actual flag value before sleep bit(s) cleared. + * Notes that there are no longer threads sleeping on the flag by clearing sleep bit(s). + */ + FlagType unset_sleeping() { + return traits_type::test_then_and((volatile FlagType *)this->get(), ~KMP_BARRIER_SLEEP_STATE); + } + /*! + * @param old_loc in old value of flag + * Test whether there are threads sleeping on the flag's old value in old_loc. + */ + bool is_sleeping_val(FlagType old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; } + /*! + * Test whether there are threads sleeping on the flag. + */ + bool is_sleeping() { return is_sleeping_val(*(this->get())); } +}; + +class kmp_flag_32 : public kmp_basic_flag<kmp_uint32> { +public: + kmp_flag_32(volatile kmp_uint32 *p) : kmp_basic_flag<kmp_uint32>(p) {} + kmp_flag_32(volatile kmp_uint32 *p, kmp_info_t *thr) : kmp_basic_flag<kmp_uint32>(p, thr) {} + kmp_flag_32(volatile kmp_uint32 *p, kmp_uint32 c) : kmp_basic_flag<kmp_uint32>(p, c) {} + void suspend(int th_gtid) { __kmp_suspend_32(th_gtid, this); } + void resume(int th_gtid) { __kmp_resume_32(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) { + return __kmp_execute_tasks_32(this_thr, gtid, this, final_spin, thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + void wait(kmp_info_t *this_thr, int final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj)) { + __kmp_wait_template(this_thr, this, final_spin + USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } +}; + +class kmp_flag_64 : public kmp_basic_flag<kmp_uint64> { +public: + kmp_flag_64(volatile kmp_uint64 *p) : kmp_basic_flag<kmp_uint64>(p) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_info_t *thr) : kmp_basic_flag<kmp_uint64>(p, thr) {} + kmp_flag_64(volatile kmp_uint64 *p, kmp_uint64 c) : kmp_basic_flag<kmp_uint64>(p, c) {} + void suspend(int th_gtid) { __kmp_suspend_64(th_gtid, this); } + void resume(int th_gtid) { __kmp_resume_64(th_gtid, this); } + int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) { + return __kmp_execute_tasks_64(this_thr, gtid, this, final_spin, thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } + void wait(kmp_info_t *this_thr, int 
final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj)) { + __kmp_wait_template(this_thr, this, final_spin + USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } +}; + +// Hierarchical 64-bit on-core barrier instantiation +class kmp_flag_oncore : public kmp_flag<kmp_uint64> { + kmp_uint64 checker; + kmp_info_t * waiting_threads[1]; + kmp_uint32 num_waiting_threads; + kmp_uint32 offset; /**< Portion of flag that is of interest for an operation. */ + bool flag_switch; /**< Indicates a switch in flag location. */ + enum barrier_type bt; /**< Barrier type. */ + kmp_info_t * this_thr; /**< Thread that may be redirected to different flag location. */ +#if USE_ITT_BUILD + void *itt_sync_obj; /**< ITT object that must be passed to new flag location. */ +#endif + char& byteref(volatile kmp_uint64* loc, size_t offset) { return ((char *)loc)[offset]; } +public: + kmp_flag_oncore(volatile kmp_uint64 *p) + : kmp_flag<kmp_uint64>(p, flag_oncore), num_waiting_threads(0), flag_switch(false) {} + kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint32 idx) + : kmp_flag<kmp_uint64>(p, flag_oncore), offset(idx), num_waiting_threads(0), flag_switch(false) {} + kmp_flag_oncore(volatile kmp_uint64 *p, kmp_uint64 c, kmp_uint32 idx, enum barrier_type bar_t, + kmp_info_t * thr +#if USE_ITT_BUILD + , void *itt +#endif + ) + : kmp_flag<kmp_uint64>(p, flag_oncore), checker(c), offset(idx), bt(bar_t), this_thr(thr) +#if USE_ITT_BUILD + , itt_sync_obj(itt) +#endif + , num_waiting_threads(0), flag_switch(false) {} + kmp_info_t * get_waiter(kmp_uint32 i) { + KMP_DEBUG_ASSERT(i<num_waiting_threads); + return waiting_threads[i]; + } + kmp_uint32 get_num_waiters() { return num_waiting_threads; } + void set_waiter(kmp_info_t *thr) { + waiting_threads[0] = thr; + num_waiting_threads = 1; + } + bool done_check_val(kmp_uint64 old_loc) { return byteref(&old_loc,offset) == checker; } + bool done_check() { return done_check_val(*get()); } + bool notdone_check() { + // Calculate flag_switch + if (this_thr->th.th_bar[bt].bb.wait_flag == KMP_BARRIER_SWITCH_TO_OWN_FLAG) + flag_switch = true; + if (byteref(get(),offset) != 1 && !flag_switch) + return true; + else if (flag_switch) { + this_thr->th.th_bar[bt].bb.wait_flag = KMP_BARRIER_SWITCHING; + kmp_flag_64 flag(&this_thr->th.th_bar[bt].bb.b_go, (kmp_uint64)KMP_BARRIER_STATE_BUMP); + __kmp_wait_64(this_thr, &flag, TRUE +#if USE_ITT_BUILD + , itt_sync_obj +#endif + ); + } + return false; + } + kmp_uint64 internal_release() { + kmp_uint64 old_val; + if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { + old_val = *get(); + byteref(get(),offset) = 1; + } + else { + kmp_uint64 mask=0; + byteref(&mask,offset) = 1; + old_val = KMP_TEST_THEN_OR64((volatile kmp_int64 *)get(), mask); + } + return old_val; + } + kmp_uint64 set_sleeping() { + return KMP_TEST_THEN_OR64((kmp_int64 volatile *)get(), KMP_BARRIER_SLEEP_STATE); + } + kmp_uint64 unset_sleeping() { + return KMP_TEST_THEN_AND64((kmp_int64 volatile *)get(), ~KMP_BARRIER_SLEEP_STATE); + } + bool is_sleeping_val(kmp_uint64 old_loc) { return old_loc & KMP_BARRIER_SLEEP_STATE; } + bool is_sleeping() { return is_sleeping_val(*get()); } + void wait(kmp_info_t *this_thr, int final_spin + USE_ITT_BUILD_ARG(void * itt_sync_obj)) { + __kmp_wait_template(this_thr, this, final_spin + USE_ITT_BUILD_ARG(itt_sync_obj)); + } + void release() { __kmp_release_template(this); } + void suspend(int th_gtid) { __kmp_suspend_oncore(th_gtid, this); } + void resume(int th_gtid) { __kmp_resume_oncore(th_gtid, this); } + int 
execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished + USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained) { + return __kmp_execute_tasks_oncore(this_thr, gtid, this, final_spin, thread_finished + USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained); + } +}; + +/*! +@} +*/ + +#endif // KMP_WAIT_RELEASE_H diff --git a/openmp/runtime/src/kmp_wrapper_getpid.h b/openmp/runtime/src/kmp_wrapper_getpid.h index 5cdc1699198..35e4d179f62 100644 --- a/openmp/runtime/src/kmp_wrapper_getpid.h +++ b/openmp/runtime/src/kmp_wrapper_getpid.h @@ -1,7 +1,7 @@ /* * kmp_wrapper_getpid.h -- getpid() declaration. - * $Revision: 42181 $ - * $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/kmp_wrapper_malloc.h b/openmp/runtime/src/kmp_wrapper_malloc.h index b2e3dbf9e90..b0564b9a3f7 100644 --- a/openmp/runtime/src/kmp_wrapper_malloc.h +++ b/openmp/runtime/src/kmp_wrapper_malloc.h @@ -1,8 +1,8 @@ /* * kmp_wrapper_malloc.h -- Wrappers for memory allocation routines * (malloc(), free(), and others). - * $Revision: 42181 $ - * $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ + * $Revision: 43084 $ + * $Date: 2014-04-15 09:15:14 -0500 (Tue, 15 Apr 2014) $ */ diff --git a/openmp/runtime/src/libiomp.rc.var b/openmp/runtime/src/libiomp.rc.var index 619f99baf97..5e221b69622 100644 --- a/openmp/runtime/src/libiomp.rc.var +++ b/openmp/runtime/src/libiomp.rc.var @@ -1,6 +1,6 @@ // libiomp.rc.var -// $Revision: 42219 $ -// $Date: 2013-03-29 13:36:05 -0500 (Fri, 29 Mar 2013) $ +// $Revision: 42994 $ +// $Date: 2014-03-04 02:22:15 -0600 (Tue, 04 Mar 2014) $ // ////===----------------------------------------------------------------------===// @@ -41,8 +41,6 @@ VS_VERSION_INFO VERSIONINFO // FileDescription and LegalCopyright should be short. VALUE "FileDescription", "Intel(R) OpenMP* Runtime Library${{ our $MESSAGE_CATALOG; $MESSAGE_CATALOG ? " Message Catalog" : "" }}\0" - VALUE "LegalCopyright", "Copyright (C) 1997-2013, Intel Corporation. All rights reserved.\0" - // Following values may be relatively long. VALUE "CompanyName", "Intel Corporation\0" // VALUE "LegalTrademarks", "\0" // Not used for now. diff --git a/openmp/runtime/src/makefile.mk b/openmp/runtime/src/makefile.mk index b4ed8e58067..553c2f8332e 100644 --- a/openmp/runtime/src/makefile.mk +++ b/openmp/runtime/src/makefile.mk @@ -1,6 +1,6 @@ # makefile.mk # -# $Revision: 42820 $ -# $Date: 2013-11-13 16:53:44 -0600 (Wed, 13 Nov 2013) $ +# $Revision: 43473 $ +# $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ # #//===----------------------------------------------------------------------===// @@ -221,6 +221,18 @@ ifeq "$(filter gcc clang,$(c))" "" endif endif +# On Linux and Windows Intel64 we need offload attribute for all Fortran entries +# in order to support OpenMP function calls inside Device constructs +ifeq "$(fort)" "ifort" + ifeq "$(os)_$(arch)" "lin_32e" + # TODO: change to -qoffload... 
when we stop supporting 14.0 compiler (-offload is deprecated) + fort-flags += -offload-attribute-target=mic + endif + ifeq "$(os)_$(arch)" "win_32e" + fort-flags += /Qoffload-attribute-target:mic + endif +endif + ifeq "$(os)" "lrb" c-flags += -mmic cxx-flags += -mmic @@ -361,6 +373,7 @@ ifeq "$(os)" "lin" # to remove dependency on libgcc_s: ifeq "$(c)" "gcc" ld-flags-dll += -static-libgcc + # omp_os is non-empty only in the open-source code ifneq "$(omp_os)" "freebsd" ld-flags-extra += -Wl,-ldl endif @@ -417,11 +430,15 @@ ifeq "$(os)" "lrb" ld-flags += -ldl endif endif + # include the c++ library for stats-gathering code + ifeq "$(stats)" "on" + ld-flags-extra += -Wl,-lstdc++ + endif endif endif ifeq "$(os)" "mac" - ifeq "$(c)" "icc" + ifeq "$(ld)" "icc" ld-flags += -no-intel-extensions endif ld-flags += -single_module @@ -483,6 +500,13 @@ endif cpp-flags += -D KMP_ADJUST_BLOCKTIME=1 cpp-flags += -D BUILD_PARALLEL_ORDERED cpp-flags += -D KMP_ASM_INTRINS +cpp-flags += -D KMP_USE_INTERNODE_ALIGNMENT=0 +# Linux and MIC compile with version symbols +ifneq "$(filter lin lrb,$(os))" "" +ifeq "$(filter ppc64,$(arch))" "" + cpp-flags += -D KMP_USE_VERSION_SYMBOLS +endif +endif ifneq "$(os)" "lrb" cpp-flags += -D USE_LOAD_BALANCE endif @@ -506,43 +530,52 @@ else # 5 cpp-flags += -D KMP_GOMP_COMPAT endif endif - +cpp-flags += -D KMP_NESTED_HOT_TEAMS ifneq "$(filter 32 32e,$(arch))" "" cpp-flags += -D KMP_USE_ADAPTIVE_LOCKS=1 -D KMP_DEBUG_ADAPTIVE_LOCKS=0 endif +# is the std c++ library needed? (for stats-gathering, it is) +std_cpp_lib=0 +ifneq "$(filter lin lrb,$(os))" "" + ifeq "$(stats)" "on" + cpp-flags += -D KMP_STATS_ENABLED=1 + std_cpp_lib=1 + else + cpp-flags += -D KMP_STATS_ENABLED=0 + endif +else # no mac or windows support for stats-gathering + ifeq "$(stats)" "on" + $(error Statistics-gathering functionality not available on $(os) platform) + endif + cpp-flags += -D KMP_STATS_ENABLED=0 +endif + # define compatibility with different OpenMP versions have_omp_50=0 have_omp_41=0 have_omp_40=0 -have_omp_30=0 ifeq "$(OMP_VERSION)" "50" have_omp_50=1 have_omp_41=1 have_omp_40=1 - have_omp_30=1 endif ifeq "$(OMP_VERSION)" "41" have_omp_50=0 have_omp_41=1 have_omp_40=1 - have_omp_30=1 endif ifeq "$(OMP_VERSION)" "40" have_omp_50=0 have_omp_41=0 have_omp_40=1 - have_omp_30=1 endif ifeq "$(OMP_VERSION)" "30" have_omp_50=0 have_omp_41=0 have_omp_40=0 - have_omp_30=1 endif -cpp-flags += -D OMP_50_ENABLED=$(have_omp_50) -D OMP_41_ENABLED=$(have_omp_41) -cpp-flags += -D OMP_40_ENABLED=$(have_omp_40) -D OMP_30_ENABLED=$(have_omp_30) - +cpp-flags += -D OMP_50_ENABLED=$(have_omp_50) -D OMP_41_ENABLED=$(have_omp_41) -D OMP_40_ENABLED=$(have_omp_40) # Using ittnotify is enabled by default. USE_ITT_NOTIFY = 1 @@ -598,8 +631,8 @@ ifneq "$(os)" "win" z_Linux_asm$(obj) : \ cpp-flags += -D KMP_ARCH_PPC64 else - z_Linux_asm$(obj) : \ - cpp-flags += -D KMP_ARCH_X86$(if $(filter 32e,$(arch)),_64) + z_Linux_asm$(obj) : \ + cpp-flags += -D KMP_ARCH_X86$(if $(filter 32e,$(arch)),_64) endif endif @@ -699,6 +732,8 @@ else # norm or prof kmp_i18n \ kmp_io \ kmp_runtime \ + kmp_wait_release \ + kmp_barrier \ kmp_settings \ kmp_str \ kmp_tasking \ @@ -715,6 +750,10 @@ ifeq "$(OMP_VERSION)" "40" lib_cpp_items += kmp_taskdeps lib_cpp_items += kmp_cancel endif +ifeq "$(stats)" "on" + lib_cpp_items += kmp_stats + lib_cpp_items += kmp_stats_timing +endif # OS-specific files. 
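# A condensed view of the stats-gathering knobs above, assuming $(stats) is set
# on the make command line (e.g. stats=on): on lin/lrb it selects
# -D KMP_STATS_ENABLED=1, sets std_cpp_lib=1 so the C++ runtime is linked, and
# appends kmp_stats and kmp_stats_timing to lib_cpp_items; any other value
# compiles with -D KMP_STATS_ENABLED=0, and stats=on on an unsupported OS stops
# the build via $(error ...).  The conditionals above remain the authoritative logic.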
ifeq "$(os)" "win" @@ -1272,8 +1311,20 @@ ifneq "$(os)" "lrb" # On Linux* OS and OS X* the test is good enough because GNU compiler knows nothing # about libirc and Intel compiler private lib directories, but we will grep verbose linker # output just in case. - tt-c = cc - ifeq "$(os)" "lin" # GCC on OS X* does not recognize -pthread. + # Using clang on OS X* because of discontinued support of GNU compilers. + ifeq "$(os)" "mac" + ifeq "$(std_cpp_lib)" "1" + tt-c = clang++ + else + tt-c = clang + endif + else # lin + ifeq "$(std_cpp_lib)" "1" + tt-c = g++ + else + tt-c = gcc + endif + # GCC on OS X* does not recognize -pthread. tt-c-flags += -pthread endif tt-c-flags += -o $(tt-exe-file) @@ -1416,6 +1467,10 @@ ifneq "$(filter %-dyna win-%,$(os)-$(LINK_TYPE))" "" td_exp += libc.so.6 td_exp += ld64.so.1 endif + ifeq "$(std_cpp_lib)" "1" + td_exp += libstdc++.so.6 + endif + td_exp += libdl.so.2 td_exp += libgcc_s.so.1 ifeq "$(filter 32 32e 64 ppc64,$(arch))" "" @@ -1428,6 +1483,9 @@ ifneq "$(filter %-dyna win-%,$(os)-$(LINK_TYPE))" "" endif ifeq "$(os)" "lrb" ifeq "$(MIC_OS)" "lin" + ifeq "$(std_cpp_lib)" "1" + td_exp += libstdc++.so.6 + endif ifeq "$(MIC_ARCH)" "knf" td_exp += "ld-linux-l1om.so.2" td_exp += libc.so.6 @@ -1459,8 +1517,9 @@ ifneq "$(filter %-dyna win-%,$(os)-$(LINK_TYPE))" "" td_exp += uuid endif endif + ifeq "$(omp_os)" "freebsd" - td_exp = + td_exp = td_exp += libc.so.7 td_exp += libthr.so.3 td_exp += libunwind.so.5 diff --git a/openmp/runtime/src/rules.mk b/openmp/runtime/src/rules.mk index 3d407356fdf..09f30ba26b9 100644 --- a/openmp/runtime/src/rules.mk +++ b/openmp/runtime/src/rules.mk @@ -1,6 +1,6 @@ # rules.mk # -# $Revision: 42423 $ -# $Date: 2013-06-07 09:25:21 -0500 (Fri, 07 Jun 2013) $ +# $Revision: 42951 $ +# $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ # #//===----------------------------------------------------------------------===// diff --git a/openmp/runtime/src/test-touch.c b/openmp/runtime/src/test-touch.c index 3470f7ca6f9..6ce529ae23a 100644 --- a/openmp/runtime/src/test-touch.c +++ b/openmp/runtime/src/test-touch.c @@ -11,9 +11,15 @@ //===----------------------------------------------------------------------===// +#ifdef __cplusplus +extern "C" { +#endif extern double omp_get_wtime(); extern int omp_get_num_threads(); extern int omp_get_max_threads(); +#ifdef __cplusplus +} +#endif int main() { omp_get_wtime(); diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h index a9988db002f..d05d8b73c36 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify.h @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// - #ifndef _ITTNOTIFY_H_ #define _ITTNOTIFY_H_ @@ -141,7 +140,7 @@ The same ID may not be reused for different instances, unless a previous # if ITT_PLATFORM==ITT_PLATFORM_WIN # define CDECL __cdecl # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# if defined _M_IX86 || defined __i386__ +# if defined _M_IX86 || defined __i386__ # define CDECL __attribute__ ((cdecl)) # else /* _M_IX86 || __i386__ */ # define CDECL /* actual only on x86 platform */ @@ -154,7 +153,7 @@ The same ID may not be reused for different instances, unless a previous # define STDCALL __stdcall # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ -# define STDCALL __attribute__ ((stdcall)) +# define STDCALL __attribute__ ((stdcall)) # else /* _M_IX86 || __i386__ */ # define 
STDCALL /* supported only on x86 platform */ # endif /* _M_IX86 || __i386__ */ @@ -416,19 +415,19 @@ ITT_STUBV(ITTAPI, void, thread_ignore, (void)) *********************************************************************/ /** @{ */ /** - * @hideinitializer + * @hideinitializer * @brief possible value for suppression mask */ #define __itt_suppress_all_errors 0x7fffffff /** - * @hideinitializer + * @hideinitializer * @brief possible value for suppression mask (suppresses errors from threading analysis) */ #define __itt_suppress_threading_errors 0x000000ff /** - * @hideinitializer + * @hideinitializer * @brief possible value for suppression mask (suppresses errors from memory analysis) */ #define __itt_suppress_memory_errors 0x0000ff00 @@ -454,7 +453,7 @@ ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask)) /** @endcond */ /** - * @brief Undo the effects of the matching call to __itt_suppress_push + * @brief Undo the effects of the matching call to __itt_suppress_push */ void ITTAPI __itt_suppress_pop(void); @@ -1584,13 +1583,13 @@ ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void)) * @brief Specify the type of heap detection/reporting to modify. */ /** - * @hideinitializer + * @hideinitializer * @brief Report on memory leaks. */ #define __itt_heap_leaks 0x00000001 /** - * @hideinitializer + * @hideinitializer * @brief Report on memory growth. */ #define __itt_heap_growth 0x00000002 @@ -1667,7 +1666,7 @@ typedef struct ___itt_domain * @ingroup domains * @brief Create a domain. * Create domain using some domain name: the URI naming style is recommended. - * Because the set of domains is expected to be static over the application's + * Because the set of domains is expected to be static over the application's * execution time, there is no mechanism to destroy a domain. * Any domain can be accessed by any thread in the process, regardless of * which thread created the domain. This call is thread-safe. @@ -1802,7 +1801,7 @@ ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id)) * @brief Destroy an instance of identifier. * This ends the lifetime of the current instance of the given ID value in the trace. * Any relationships that are established after this lifetime ends are invalid. - * This call must be performed before the given ID value can be reused for a different + * This call must be performed before the given ID value can be reused for a different * named entity instance. * @param[in] domain The domain controlling the execution of this call. * @param[in] id The ID to destroy. @@ -1926,10 +1925,10 @@ static const __itt_timestamp __itt_timestamp_none = (__itt_timestamp)-1LL; /** * @ingroup timestamps - * @brief Return timestamp corresponding to current moment. - * This returns the timestamp in format that is most relevant for the current - * host or platform. Do not rely that it's RDTSC value. It is possible - * to compare __itt_timestamp values with "<" operator. + * @brief Return timestamp corresponding to the current moment. + * This returns the timestamp in the format that is the most relevant for the current + * host or platform (RDTSC, QPC, and others). You can use the "<" operator to + * compare __itt_timestamp values. 
*/ __itt_timestamp ITTAPI __itt_get_timestamp(void); @@ -2053,13 +2052,13 @@ void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id, /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id)) -ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id)) +ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id)) +ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id)) ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end)) -#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x) -#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3) -#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x) -#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3) +#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x) +#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3) +#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x) +#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3) #define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e) #define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3) #else /* INTEL_NO_ITTNOTIFY_API */ @@ -2361,7 +2360,7 @@ ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task * @param[in] key The name of the metadata * @param[in] data The metadata itself - * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated + * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated */ #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length); @@ -2397,9 +2396,9 @@ ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_metadata_str_addA(d,x,y,z,a) +#define __itt_metadata_str_addA(d,x,y,z,a) #define __itt_metadata_str_addA_ptr 0 -#define __itt_metadata_str_addW(d,x,y,z,a) +#define __itt_metadata_str_addW(d,x,y,z,a) #define __itt_metadata_str_addW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_metadata_str_add(d,x,y,z,a) @@ -2423,7 +2422,7 @@ ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id * @param[in] scope The scope of the instance to which the metadata is to be added * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task - + * @param[in] key The name of the metadata * @param[in] type The type of the metadata * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added. 
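A minimal usage sketch of the timestamp and frame entry points touched in the hunks above, assuming the usual __itt_domain_create entry point from the domains section of this header; the domain name and helper function below are illustrative only, and passing a NULL id submits an anonymous frame instance.

#include "ittnotify.h"

/* Sketch: time one frame of work and hand it to the collector. */
static void submit_one_frame(void)
{
    const __itt_domain *domain = __itt_domain_create("com.example.frames"); /* URI-style name, as recommended */
    __itt_timestamp begin = __itt_get_timestamp();   /* platform-relevant clock; values compare with "<" */
    /* ... the work that makes up one frame ... */
    __itt_timestamp end = __itt_get_timestamp();
    if (begin < end)                                 /* ordering documented in the hunk above */
        __itt_frame_submit_v3(domain, NULL, begin, end);
}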
@@ -2456,7 +2455,7 @@ ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __ * @param[in] key The name of the metadata * @param[in] data The metadata itself - * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated + * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated */ #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length); @@ -2492,9 +2491,9 @@ ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) +#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) #define __itt_metadata_str_add_with_scopeA_ptr 0 -#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) +#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) #define __itt_metadata_str_add_with_scopeW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_metadata_str_add_with_scope(d,x,y,z,a) @@ -3080,9 +3079,9 @@ ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event)) /** * @enum __itt_av_data_type - * @brief Defines types of arrays data (for C/C++ intrinsic types) + * @brief Defines types of arrays data (for C/C++ intrinsic types) */ -typedef enum +typedef enum { __itt_e_first = 0, __itt_e_char = 0, /* 1-byte integer */ @@ -3102,8 +3101,8 @@ typedef enum * @brief Save an array data to a file. * Output format is defined by the file extension. The csv and bmp formats are supported (bmp - for 2-dimensional array only). * @param[in] data - pointer to the array data - * @param[in] rank - the rank of the array - * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions. + * @param[in] rank - the rank of the array + * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions. 
* The size of dimensions must be equal to the rank * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types) * @param[in] filePath - the file path; the output format is defined by the file extension diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h index 9e7b36b5890..863ab956d63 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h @@ -74,7 +74,7 @@ # if ITT_PLATFORM==ITT_PLATFORM_WIN # define CDECL __cdecl # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# if defined _M_IX86 || defined __i386__ +# if defined _M_IX86 || defined __i386__ # define CDECL __attribute__ ((cdecl)) # else /* _M_IX86 || __i386__ */ # define CDECL /* actual only on x86 platform */ @@ -87,7 +87,7 @@ # define STDCALL __stdcall # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ -# define STDCALL __attribute__ ((stdcall)) +# define STDCALL __attribute__ ((stdcall)) # else /* _M_IX86 || __i386__ */ # define STDCALL /* supported only on x86 platform */ # endif /* _M_IX86 || __i386__ */ @@ -267,7 +267,7 @@ ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) #ifdef __INTEL_COMPILER #define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val) #else /* __INTEL_COMPILER */ -/* TODO: Add Support for not Intel compilers for IA-64 architecture */ +/* TODO: Add Support for not Intel compilers for IA-64 */ #endif /* __INTEL_COMPILER */ #elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */ ITT_INLINE long diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c index 4b5f464feb8..1b44011b5e3 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.c @@ -22,14 +22,14 @@ #include <stdarg.h> #include <string.h> -#define INTEL_NO_MACRO_BODY +#define INTEL_NO_MACRO_BODY #define INTEL_ITTNOTIFY_API_PRIVATE #include "ittnotify.h" #include "legacy/ittnotify.h" #include "disable_warnings.h" -static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 42754 $\n"; +static const char api_version[] = API_VERSION "\0\n@(#) $Revision: 43375 $\n"; #define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) @@ -44,13 +44,34 @@ static const char* ittnotify_lib_name = "libittnotify.dylib"; #endif #ifdef __ANDROID__ +#include <android/log.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <linux/limits.h> + +#ifdef ITT_ANDROID_LOG + #define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI" + #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) + #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) + #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) + #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) +#else + #define ITT_ANDROID_LOGI(...) + #define ITT_ANDROID_LOGW(...) + #define ITT_ANDROID_LOGE(...) + #define ITT_ANDROID_LOGD(...) 
+#endif + /* default location of userapi collector on Android */ #define ANDROID_ITTNOTIFY_DEFAULT_PATH "/data/data/com.intel.vtune/intel/libittnotify.so" #endif #ifndef LIB_VAR_NAME -#if ITT_ARCH==ITT_ARCH_IA32 +#if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM #define LIB_VAR_NAME INTEL_LIBITTNOTIFY32 #else #define LIB_VAR_NAME INTEL_LIBITTNOTIFY64 @@ -687,6 +708,92 @@ static const char* __itt_get_lib_name(void) { const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); +#ifdef __ANDROID__ + if (lib_name == NULL) + { + const char* const system_wide_marker_filename = "/data/local/tmp/com.intel.itt.collector_lib"; + int itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY); + ssize_t res = 0; + + if (itt_marker_file_fd == -1) + { + const pid_t my_pid = getpid(); + char cmdline_path[PATH_MAX] = {0}; + char package_name[PATH_MAX] = {0}; + char app_sandbox_file[PATH_MAX] = {0}; + int cmdline_fd = 0; + + ITT_ANDROID_LOGI("Unable to open system-wide marker file."); + snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid); + ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path); + cmdline_fd = open(cmdline_path, O_RDONLY); + if (cmdline_fd == -1) + { + ITT_ANDROID_LOGE("Unable to open %s file!", cmdline_path); + return lib_name; + } + res = read(cmdline_fd, package_name, PATH_MAX - 1); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path); + res = close(cmdline_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); + } + return lib_name; + } + res = close(cmdline_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); + return lib_name; + } + ITT_ANDROID_LOGI("Package name: %s\n", package_name); + snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/com.intel.itt.collector_lib", package_name); + ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file); + itt_marker_file_fd = open(app_sandbox_file, O_RDONLY); + if (itt_marker_file_fd == -1) + { + ITT_ANDROID_LOGE("Unable to open app marker file!"); + return lib_name; + } + } + + { + char itt_lib_name[PATH_MAX] = {0}; + + res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd); + res = close(itt_marker_file_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); + } + return lib_name; + } + ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name); + res = close(itt_marker_file_fd); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); + return lib_name; + } + ITT_ANDROID_LOGI("Set env"); + res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0); + if (res == -1) + { + ITT_ANDROID_LOGE("Unable to set env var!"); + return lib_name; + } + lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); + ITT_ANDROID_LOGI("ITT Lib path from env: %s", itt_lib_name); + } + } +#endif + return lib_name; } diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h index fe1fe3c14f7..a218cc87bf1 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_static.h @@ -105,14 +105,14 @@ ITT_STUBV(ITTAPI, void, model_clear_uses, (void* addr), (I #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name), (ITT_FORMAT name), model_site_beginW, __itt_group_model, "\"%s\"") 
-ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name), (ITT_FORMAT name), model_task_beginW, __itt_group_model, "\"%s\"") +ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name), (ITT_FORMAT name), model_site_beginW, __itt_group_model, "\"%s\"") +ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name), (ITT_FORMAT name), model_task_beginW, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name), (ITT_FORMAT name), model_iteration_taskW, __itt_group_model, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name), (ITT_FORMAT name), model_site_beginA, __itt_group_model, "\"%s\"") -ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL, __itt_group_model, "\"%s\", %d") +ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL, __itt_group_model, "\"%s\", %d") ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name), (ITT_FORMAT name), model_task_beginA, __itt_group_model, "\"%s\"") -ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL, __itt_group_model, "\"%s\", %d") +ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL, __itt_group_model, "\"%s\", %d") ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name), (ITT_FORMAT name), model_iteration_taskA, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model, "\"%s\", %d") ITT_STUBV(ITTAPI, void, model_site_end_2, (void), (ITT_NO_PARAMS), model_site_end_2, __itt_group_model, "no args") @@ -154,8 +154,8 @@ ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), region_end, __itt_group_structure, "%p, %lu") #ifndef __ITT_INTERNAL_BODY -ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_begin_v3, __itt_group_structure, "%p, %p") -ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_end_v3, __itt_group_structure, "%p, %p") +ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_begin_v3, __itt_group_structure, "%p, %p") +ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_end_v3, __itt_group_structure, "%p, %p") ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end), (ITT_FORMAT domain, id, begin, end), frame_submit_v3, __itt_group_structure, "%p, %p, %lu, %lu") #endif /* __ITT_INTERNAL_BODY */ diff --git a/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h b/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h index 08e5bb90883..4d87bd3ded9 100644 --- a/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h +++ b/openmp/runtime/src/thirdparty/ittnotify/legacy/ittnotify.h @@ -79,7 +79,7 @@ # if ITT_PLATFORM==ITT_PLATFORM_WIN # define CDECL __cdecl # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# if defined _M_IX86 || defined __i386__ +# if defined _M_IX86 || defined __i386__ # 
define CDECL __attribute__ ((cdecl)) # else /* _M_IX86 || __i386__ */ # define CDECL /* actual only on x86 platform */ @@ -92,7 +92,7 @@ # define STDCALL __stdcall # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ -# define STDCALL __attribute__ ((stdcall)) +# define STDCALL __attribute__ ((stdcall)) # else /* _M_IX86 || __i386__ */ # define STDCALL /* supported only on x86 platform */ # endif /* _M_IX86 || __i386__ */ diff --git a/openmp/runtime/src/z_Linux_asm.s b/openmp/runtime/src/z_Linux_asm.s index 64c80522614..2b982234307 100644 --- a/openmp/runtime/src/z_Linux_asm.s +++ b/openmp/runtime/src/z_Linux_asm.s @@ -1,7 +1,7 @@ // z_Linux_asm.s: - microtasking routines specifically // written for Intel platforms running Linux* OS -// $Revision: 42810 $ -// $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ +// $Revision: 43473 $ +// $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ // ////===----------------------------------------------------------------------===// @@ -489,118 +489,6 @@ __kmp_unnamed_critical_addr: //------------------------------------------------------------------------ // -// FUNCTION __kmp_test_then_add_real32 -// -// kmp_real32 -// __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data ); -// - - PROC __kmp_test_then_add_real32 - -_addr = 8 -_data = 12 -_old_value = -4 -_new_value = -8 - - pushl %ebp - movl %esp, %ebp - subl $8, %esp - pushl %esi - pushl %ebx - movl _addr(%ebp), %esi -L22: - flds (%esi) - // load <addr> - fsts _old_value(%ebp) - // store into old_value - fadds _data(%ebp) - fstps _new_value(%ebp) - // new_value = old_value + data - - movl _old_value(%ebp), %eax - // load old_value - movl _new_value(%ebp), %ebx - // load new_value - - lock - cmpxchgl %ebx,(%esi) - // Compare %EAX with <addr>. If equal set - // ZF and load %EBX into <addr>. Else, clear - // ZF and load <addr> into %EAX. - jnz L22 - - - flds _old_value(%ebp) - // return old_value - popl %ebx - popl %esi - movl %ebp, %esp - popl %ebp - ret - - DEBUG_INFO __kmp_test_then_add_real32 - -//------------------------------------------------------------------------ -// -// FUNCTION __kmp_test_then_add_real64 -// -// kmp_real64 -// __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data ); -// - PROC __kmp_test_then_add_real64 - -_addr = 8 -_data = 12 -_old_value = -8 -_new_value = -16 - - pushl %ebp - movl %esp, %ebp - subl $16, %esp - pushl %esi - pushl %ebx - pushl %ecx - pushl %edx - movl _addr(%ebp), %esi -L44: - fldl (%esi) - // load <addr> - fstl _old_value(%ebp) - // store into old_value - faddl _data(%ebp) - fstpl _new_value(%ebp) - // new_value = old_value + data - - movl _old_value+4(%ebp), %edx - movl _old_value(%ebp), %eax - // load old_value - movl _new_value+4(%ebp), %ecx - movl _new_value(%ebp), %ebx - // load new_value - - lock - cmpxchg8b (%esi) - // Compare %EDX:%EAX with <addr>. If equal set - // ZF and load %ECX:%EBX into <addr>. Else, clear - // ZF and load <addr> into %EDX:%EAX. - jnz L44 - - - fldl _old_value(%ebp) - // return old_value - popl %edx - popl %ecx - popl %ebx - popl %esi - movl %ebp, %esp - popl %ebp - ret - - DEBUG_INFO __kmp_test_then_add_real64 - - -//------------------------------------------------------------------------ -// // FUNCTION __kmp_load_x87_fpu_control_word // // void @@ -758,30 +646,7 @@ L44: .data ALIGN 4 -// AC: The following #if hiden the .text thus moving the rest of code into .data section on MIC. 
-// To prevent this in future .text added to every routine definition for x86_64. -# if __MIC__ || __MIC2__ - -# else - -//------------------------------------------------------------------------ -// -// FUNCTION __kmp_x86_pause -// -// void -// __kmp_x86_pause( void ); -// - - .text - PROC __kmp_x86_pause - - pause_op - ret - - DEBUG_INFO __kmp_x86_pause - -# endif // __MIC__ || __MIC2__ - +// To prevent getting our code into .data section .text added to every routine definition for x86_64. //------------------------------------------------------------------------ // // FUNCTION __kmp_x86_cpuid @@ -1176,79 +1041,6 @@ L44: # if ! (__MIC__ || __MIC2__) -//------------------------------------------------------------------------ -// -// FUNCTION __kmp_test_then_add_real32 -// -// kmp_real32 -// __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data ); -// -// parameters: -// addr: %rdi -// data: %xmm0 (lower 4 bytes) -// -// return: %xmm0 (lower 4 bytes) - - .text - PROC __kmp_test_then_add_real32 -1: - movss (%rdi), %xmm1 // load value of <addr> - movd %xmm1, %eax // save old value of <addr> - - addss %xmm0, %xmm1 // new value = old value + <data> - movd %xmm1, %ecx // move new value to GP reg. - - lock - cmpxchgl %ecx, (%rdi) // Compare %EAX with <addr>. If equal set - // ZF and exchange %ECX with <addr>. Else, - // clear ZF and load <addr> into %EAX. - jz 2f - pause_op - jmp 1b -2: - movd %eax, %xmm0 // load old value into return register - ret - - DEBUG_INFO __kmp_test_then_add_real32 - - -//------------------------------------------------------------------------ -// -// FUNCTION __kmp_test_then_add_real64 -// -// kmp_real64 -// __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data ); -// -// parameters: -// addr: %rdi -// data: %xmm0 (lower 8 bytes) -// return: %xmm0 (lower 8 bytes) -// - - .text - PROC __kmp_test_then_add_real64 -1: - movlpd (%rdi), %xmm1 // load value of <addr> - movd %xmm1, %rax // save old value of <addr> - - addsd %xmm0, %xmm1 // new value = old value + <data> - movd %xmm1, %rcx // move new value to GP reg. - - lock - cmpxchgq %rcx, (%rdi) // Compare %RAX with <addr>. If equal set - // ZF and exchange %RCX with <addr>. Else, - // clear ZF and load <addr> into %RAX. - jz 2f - pause_op - jmp 1b - -2: - movd %rax, %xmm0 // load old value into return register - ret - - DEBUG_INFO __kmp_test_then_add_real64 - - # if !KMP_ASM_INTRINS //------------------------------------------------------------------------ @@ -1382,7 +1174,7 @@ L44: // typedef void (*microtask_t)( int *gtid, int *tid, ... ); // // int -// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...), +// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...), // int gtid, int tid, // int argc, void *p_argv[] ) { // (*pkfn)( & gtid, & tid, argv[0], ... ); @@ -1597,5 +1389,9 @@ __kmp_unnamed_critical_addr: #endif /* KMP_ARCH_PPC64 */ #if defined(__linux__) +# if KMP_ARCH_ARM +.section .note.GNU-stack,"",%progbits +# else .section .note.GNU-stack,"",@progbits +# endif #endif diff --git a/openmp/runtime/src/z_Linux_util.c b/openmp/runtime/src/z_Linux_util.c index 01f463e4c50..d66d2a4557c 100644 --- a/openmp/runtime/src/z_Linux_util.c +++ b/openmp/runtime/src/z_Linux_util.c @@ -1,7 +1,7 @@ /* * z_Linux_util.c -- platform specific routines. 
- * $Revision: 42847 $ - * $Date: 2013-11-26 09:10:01 -0600 (Tue, 26 Nov 2013) $ + * $Revision: 43473 $ + * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $ */ @@ -21,6 +21,8 @@ #include "kmp_str.h" #include "kmp_i18n.h" #include "kmp_io.h" +#include "kmp_stats.h" +#include "kmp_wait_release.h" #if !KMP_OS_FREEBSD # include <alloca.h> @@ -654,7 +656,6 @@ __kmp_set_stack_info( int gtid, kmp_info_t *th ) return TRUE; } #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */ - /* Use incremental refinement starting from initial conservative estimate */ TCW_PTR(th->th.th_info.ds.ds_stacksize, 0); TCW_PTR(th -> th.th_info.ds.ds_stackbase, &stack_data); @@ -679,6 +680,10 @@ __kmp_launch_worker( void *thr ) #ifdef KMP_TDATA_GTID __kmp_gtid = gtid; #endif +#if KMP_STATS_ENABLED + // set __thread local index to point to thread-specific stats + __kmp_stats_thread_ptr = ((kmp_info_t*)thr)->th.th_stats; +#endif #if USE_ITT_BUILD __kmp_itt_thread_name( gtid ); @@ -815,6 +820,7 @@ __kmp_launch_monitor( void *thr ) ); }; // if }; // if + TCW_4( __kmp_global.g.g_time.dt.t_value, 0 ); // AC: free thread that waits for monitor started } #endif // KMP_REAL_TIME_FIX @@ -942,6 +948,25 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) th->th.th_info.ds.ds_gtid = gtid; +#if KMP_STATS_ENABLED + // sets up worker thread stats + __kmp_acquire_tas_lock(&__kmp_stats_lock, gtid); + + // th->th.th_stats is used to transfer thread specific stats-pointer to __kmp_launch_worker + // So when thread is created (goes into __kmp_launch_worker) it will + // set it's __thread local pointer to th->th.th_stats + th->th.th_stats = __kmp_stats_list.push_back(gtid); + if(KMP_UBER_GTID(gtid)) { + __kmp_stats_start_time = tsc_tick_count::now(); + __kmp_stats_thread_ptr = th->th.th_stats; + __kmp_stats_init(); + KMP_START_EXPLICIT_TIMER(OMP_serial); + KMP_START_EXPLICIT_TIMER(OMP_start_end); + } + __kmp_release_tas_lock(&__kmp_stats_lock, gtid); + +#endif // KMP_STATS_ENABLED + if ( KMP_UBER_GTID(gtid) ) { KA_TRACE( 10, ("__kmp_create_worker: uber thread (%d)\n", gtid ) ); th -> th.th_info.ds.ds_thread = pthread_self(); @@ -1088,6 +1113,8 @@ __kmp_create_monitor( kmp_info_t *th ) th->th.th_info.ds.ds_gtid = KMP_GTID_MONITOR; #if KMP_REAL_TIME_FIX TCW_4( __kmp_global.g.g_time.dt.t_value, -1 ); // Will use it for synchronization a bit later. + #else + TCW_4( __kmp_global.g.g_time.dt.t_value, 0 ); #endif // KMP_REAL_TIME_FIX #ifdef KMP_THREAD_ATTR @@ -1156,8 +1183,6 @@ __kmp_create_monitor( kmp_info_t *th ) }; // if #endif /* _POSIX_THREAD_ATTR_STACKSIZE */ - TCW_4( __kmp_global.g.g_time.dt.t_value, 0 ); - status = pthread_create( &handle, & thread_attr, __kmp_launch_monitor, (void *) th ); if ( status != 0 ) { @@ -1229,7 +1254,7 @@ void __kmp_exit_thread( int exit_status ) { - pthread_exit( (void *) exit_status ); + pthread_exit( (void *)(intptr_t) exit_status ); } // __kmp_exit_thread void __kmp_resume_monitor(); @@ -1642,62 +1667,54 @@ __kmp_suspend_uninitialize_thread( kmp_info_t *th ) } } -/* - * This routine puts the calling thread to sleep after setting the - * sleep bit for the indicated spin variable to true. +/* This routine puts the calling thread to sleep after setting the + * sleep bit for the indicated flag variable to true. 
*/ - -void -__kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) +template <class C> +static inline void __kmp_suspend_template( int th_gtid, C *flag ) { + KMP_TIME_BLOCK(USER_suspend); kmp_info_t *th = __kmp_threads[th_gtid]; int status; - kmp_uint old_spin; + typename C::flag_t old_spin; - KF_TRACE( 30, ("__kmp_suspend: T#%d enter for spin = %p\n", th_gtid, spinner ) ); + KF_TRACE( 30, ("__kmp_suspend_template: T#%d enter for flag = %p\n", th_gtid, flag->get() ) ); __kmp_suspend_initialize_thread( th ); status = pthread_mutex_lock( &th->th.th_suspend_mx.m_mutex ); KMP_CHECK_SYSFAIL( "pthread_mutex_lock", status ); - KF_TRACE( 10, ( "__kmp_suspend: T#%d setting sleep bit for spin(%p)\n", - th_gtid, spinner ) ); + KF_TRACE( 10, ( "__kmp_suspend_template: T#%d setting sleep bit for spin(%p)\n", + th_gtid, flag->get() ) ); /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread gets called first? */ - old_spin = KMP_TEST_THEN_OR32( (volatile kmp_int32 *) spinner, - KMP_BARRIER_SLEEP_STATE ); - - KF_TRACE( 5, ( "__kmp_suspend: T#%d set sleep bit for spin(%p)==%d\n", - th_gtid, spinner, *spinner ) ); + old_spin = flag->set_sleeping(); - if ( old_spin == checker ) { - KMP_TEST_THEN_AND32( (volatile kmp_int32 *) spinner, ~(KMP_BARRIER_SLEEP_STATE) ); + KF_TRACE( 5, ( "__kmp_suspend_template: T#%d set sleep bit for spin(%p)==%d\n", + th_gtid, flag->get(), *(flag->get()) ) ); - KF_TRACE( 5, ( "__kmp_suspend: T#%d false alarm, reset sleep bit for spin(%p)\n", - th_gtid, spinner) ); + if ( flag->done_check_val(old_spin) ) { + old_spin = flag->unset_sleeping(); + KF_TRACE( 5, ( "__kmp_suspend_template: T#%d false alarm, reset sleep bit for spin(%p)\n", + th_gtid, flag->get()) ); } else { - /* Encapsulate in a loop as the documentation states that this may * "with low probability" return when the condition variable has * not been signaled or broadcast */ int deactivated = FALSE; - TCW_PTR(th->th.th_sleep_loc, spinner); - while ( TCR_4( *spinner ) & KMP_BARRIER_SLEEP_STATE ) { + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + while ( flag->is_sleeping() ) { #ifdef DEBUG_SUSPEND char buffer[128]; __kmp_suspend_count++; __kmp_print_cond( buffer, &th->th.th_suspend_cv ); - __kmp_printf( "__kmp_suspend: suspending T#%d: %s\n", th_gtid, buffer ); + __kmp_printf( "__kmp_suspend_template: suspending T#%d: %s\n", th_gtid, buffer ); #endif - - // - // Mark the thread as no longer active - // (only in the first iteration of the loop). - // + // Mark the thread as no longer active (only in the first iteration of the loop). if ( ! 
deactivated ) { th->th.th_active = FALSE; if ( th->th.th_active_in_pool ) { @@ -1724,11 +1741,11 @@ __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) now.tv_sec += msecs / 1000; now.tv_nsec += (msecs % 1000)*1000; - KF_TRACE( 15, ( "__kmp_suspend: T#%d about to perform pthread_cond_timedwait\n", + KF_TRACE( 15, ( "__kmp_suspend_template: T#%d about to perform pthread_cond_timedwait\n", th_gtid ) ); status = pthread_cond_timedwait( &th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex, & now ); #else - KF_TRACE( 15, ( "__kmp_suspend: T#%d about to perform pthread_cond_wait\n", + KF_TRACE( 15, ( "__kmp_suspend_template: T#%d about to perform pthread_cond_wait\n", th_gtid ) ); status = pthread_cond_wait( &th->th.th_suspend_cv.c_cond, &th->th.th_suspend_mx.m_mutex ); @@ -1739,28 +1756,23 @@ __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) } #ifdef KMP_DEBUG if (status == ETIMEDOUT) { - if ( (*spinner) & KMP_BARRIER_SLEEP_STATE ) { - KF_TRACE( 100, ( "__kmp_suspend: T#%d timeout wakeup\n", th_gtid ) ); + if ( flag->is_sleeping() ) { + KF_TRACE( 100, ( "__kmp_suspend_template: T#%d timeout wakeup\n", th_gtid ) ); } else { - KF_TRACE( 2, ( "__kmp_suspend: T#%d timeout wakeup, sleep bit not set!\n", + KF_TRACE( 2, ( "__kmp_suspend_template: T#%d timeout wakeup, sleep bit not set!\n", th_gtid ) ); } - } else if ( (*spinner) & KMP_BARRIER_SLEEP_STATE ) { - KF_TRACE( 100, ( "__kmp_suspend: T#%d spurious wakeup\n", th_gtid ) ); + } else if ( flag->is_sleeping() ) { + KF_TRACE( 100, ( "__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid ) ); } #endif - } // while - // - // Mark the thread as active again - // (if it was previous marked as inactive) - // + // Mark the thread as active again (if it was previous marked as inactive) if ( deactivated ) { th->th.th_active = TRUE; if ( TCR_4(th->th.th_in_pool) ) { - KMP_TEST_THEN_INC32( - (kmp_int32 *) &__kmp_thread_pool_active_nth ); + KMP_TEST_THEN_INC32( (kmp_int32 *) &__kmp_thread_pool_active_nth ); th->th.th_active_in_pool = TRUE; } } @@ -1770,7 +1782,7 @@ __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) { char buffer[128]; __kmp_print_cond( buffer, &th->th.th_suspend_cv); - __kmp_printf( "__kmp_suspend: T#%d has awakened: %s\n", th_gtid, buffer ); + __kmp_printf( "__kmp_suspend_template: T#%d has awakened: %s\n", th_gtid, buffer ); } #endif @@ -1778,69 +1790,76 @@ __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex ); KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status ); - KF_TRACE( 30, ("__kmp_suspend: T#%d exit\n", th_gtid ) ); + KF_TRACE( 30, ("__kmp_suspend_template: T#%d exit\n", th_gtid ) ); +} + +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_suspend_template(th_gtid, flag); } /* This routine signals the thread specified by target_gtid to wake up - * after setting the sleep bit indicated by the spin argument to FALSE. - * The target thread must already have called __kmp_suspend() + * after setting the sleep bit indicated by the flag argument to FALSE. 
+ * The target thread must already have called __kmp_suspend_template() */ - -void -__kmp_resume( int target_gtid, volatile kmp_uint *spin ) +template <class C> +static inline void __kmp_resume_template( int target_gtid, C *flag ) { kmp_info_t *th = __kmp_threads[target_gtid]; int status; - kmp_uint old_spin; #ifdef KMP_DEBUG int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; #endif - KF_TRACE( 30, ( "__kmp_resume: T#%d wants to wakeup T#%d enter\n", - gtid, target_gtid ) ); - + KF_TRACE( 30, ( "__kmp_resume_template: T#%d wants to wakeup T#%d enter\n", gtid, target_gtid ) ); KMP_DEBUG_ASSERT( gtid != target_gtid ); __kmp_suspend_initialize_thread( th ); status = pthread_mutex_lock( &th->th.th_suspend_mx.m_mutex ); KMP_CHECK_SYSFAIL( "pthread_mutex_lock", status ); - if ( spin == NULL ) { - spin = (volatile kmp_uint *)TCR_PTR(th->th.th_sleep_loc); - if ( spin == NULL ) { - KF_TRACE( 5, ( "__kmp_resume: T#%d exiting, thread T#%d already awake - spin(%p)\n", - gtid, target_gtid, spin ) ); - status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex ); - KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status ); - return; - } + if (!flag) { + flag = (C *)th->th.th_sleep_loc; } - old_spin = KMP_TEST_THEN_AND32( (kmp_int32 volatile *) spin, - ~( KMP_BARRIER_SLEEP_STATE ) ); - if ( ( old_spin & KMP_BARRIER_SLEEP_STATE ) == 0 ) { - KF_TRACE( 5, ( "__kmp_resume: T#%d exiting, thread T#%d already awake - spin(%p): " - "%u => %u\n", - gtid, target_gtid, spin, old_spin, *spin ) ); - + if (!flag) { + KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag(%p)\n", + gtid, target_gtid, NULL ) ); status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex ); KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status ); return; } + else { + typename C::flag_t old_spin = flag->unset_sleeping(); + if ( ! 
flag->is_sleeping_val(old_spin) ) { + KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag(%p): " + "%u => %u\n", + gtid, target_gtid, flag->get(), old_spin, *flag->get() ) ); + + status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex ); + KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status ); + return; + } + KF_TRACE( 5, ( "__kmp_resume_template: T#%d about to wakeup T#%d, reset sleep bit for flag's loc(%p): " + "%u => %u\n", + gtid, target_gtid, flag->get(), old_spin, *flag->get() ) ); + } TCW_PTR(th->th.th_sleep_loc, NULL); - KF_TRACE( 5, ( "__kmp_resume: T#%d about to wakeup T#%d, reset sleep bit for spin(%p): " - "%u => %u\n", - gtid, target_gtid, spin, old_spin, *spin ) ); #ifdef DEBUG_SUSPEND { char buffer[128]; __kmp_print_cond( buffer, &th->th.th_suspend_cv ); - __kmp_printf( "__kmp_resume: T#%d resuming T#%d: %s\n", gtid, target_gtid, buffer ); + __kmp_printf( "__kmp_resume_template: T#%d resuming T#%d: %s\n", gtid, target_gtid, buffer ); } #endif @@ -1849,13 +1868,24 @@ __kmp_resume( int target_gtid, volatile kmp_uint *spin ) KMP_CHECK_SYSFAIL( "pthread_cond_signal", status ); status = pthread_mutex_unlock( &th->th.th_suspend_mx.m_mutex ); KMP_CHECK_SYSFAIL( "pthread_mutex_unlock", status ); - KF_TRACE( 30, ( "__kmp_resume: T#%d exiting after signaling wake up for T#%d\n", + KF_TRACE( 30, ( "__kmp_resume_template: T#%d exiting after signaling wake up for T#%d\n", gtid, target_gtid ) ); } +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { + __kmp_resume_template(target_gtid, flag); +} +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { + __kmp_resume_template(target_gtid, flag); +} +void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { + __kmp_resume_template(target_gtid, flag); +} + void __kmp_resume_monitor() { + KMP_TIME_BLOCK(USER_resume); int status; #ifdef KMP_DEBUG int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1; @@ -1899,7 +1929,7 @@ __kmp_gtid_set_specific( int gtid ) { int status; KMP_ASSERT( __kmp_init_runtime ); - status = pthread_setspecific( __kmp_gtid_threadprivate_key, (void*)(gtid+1) ); + status = pthread_setspecific( __kmp_gtid_threadprivate_key, (void*)(intptr_t)(gtid+1) ); KMP_CHECK_SYSFAIL( "pthread_setspecific", status ); } @@ -2052,9 +2082,9 @@ __kmp_get_xproc( void ) { int mib[] = { CTL_HW, HW_NCPU }; size_t len = sizeof( r ); if ( sysctl( mib, 2, &r, &len, NULL, 0 ) < 0 ) { - r = 0; - KMP_WARNING( CantGetNumAvailCPU ); - KMP_INFORM( AssumedNumCPU ); + r = 0; + KMP_WARNING( CantGetNumAvailCPU ); + KMP_INFORM( AssumedNumCPU ); } #else @@ -2298,7 +2328,7 @@ __kmp_is_address_mapped( void * addr ) { #elif KMP_OS_FREEBSD - // FIXME(FreeBSD): Implement this. 
+ // FIXME(FreeBSD*): Implement this found = 1; #else diff --git a/openmp/runtime/src/z_Windows_NT-586_asm.asm b/openmp/runtime/src/z_Windows_NT-586_asm.asm index feb0b1cd1f3..385e3263d2b 100644 --- a/openmp/runtime/src/z_Windows_NT-586_asm.asm +++ b/openmp/runtime/src/z_Windows_NT-586_asm.asm @@ -1,7 +1,7 @@ ; z_Windows_NT-586_asm.asm: - microtasking routines specifically ; written for IA-32 architecture and Intel(R) 64 running Windows* OS -; $Revision: 42487 $ -; $Date: 2013-07-08 08:11:23 -0500 (Mon, 08 Jul 2013) $ +; $Revision: 43373 $ +; $Date: 2014-08-07 09:17:32 -0500 (Thu, 07 Aug 2014) $ ; ;//===----------------------------------------------------------------------===// @@ -495,121 +495,6 @@ lock cmpxchg8b QWORD PTR [edi] ___kmp_compare_and_store_ret64 ENDP _TEXT ENDS - -;------------------------------------------------------------------------ -; -; FUNCTION ___kmp_test_then_add_real32 -; -; kmp_real32 -; __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data ); -; - -PUBLIC ___kmp_test_then_add_real32 -_TEXT SEGMENT - ALIGN 16 -_addr$ = 8 -_data$ = 12 -_old_value$ = -4 -_new_value$ = -8 - -___kmp_test_then_add_real32 PROC NEAR - push ebp - mov ebp, esp - sub esp, 8 - push esi - push ebx - mov esi, DWORD PTR _addr$[ebp] -$L22: - fld DWORD PTR [esi] - ;; load <addr> - fst DWORD PTR _old_value$[ebp] - ;; store into old_value - fadd DWORD PTR _data$[ebp] - fstp DWORD PTR _new_value$[ebp] - ;; new_value = old_value + data - - mov eax, DWORD PTR _old_value$[ebp] - ;; load old_value - mov ebx, DWORD PTR _new_value$[ebp] - ;; load new_value - -lock cmpxchg DWORD PTR [esi], ebx - ;; Compare EAX with <addr>. If equal set - ;; ZF and load EBX into <addr>. Else, clear - ;; ZF and load <addr> into EAX. - jnz SHORT $L22 - - - fld DWORD PTR _old_value$[ebp] - ;; return old_value - pop ebx - pop esi - mov esp, ebp - pop ebp - ret 0 -___kmp_test_then_add_real32 ENDP -_TEXT ENDS - -;------------------------------------------------------------------------ -; -; FUNCTION ___kmp_test_then_add_real64 -; -; kmp_real64 -; __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data ); -; - -PUBLIC ___kmp_test_then_add_real64 -_TEXT SEGMENT - ALIGN 16 -_addr$ = 8 -_data$ = 12 -_old_value$ = -8 -_new_value$ = -16 - -___kmp_test_then_add_real64 PROC NEAR - push ebp - mov ebp, esp - sub esp, 16 - push esi - push ebx - push ecx - push edx - mov esi, DWORD PTR _addr$[ebp] -$L44: - fld QWORD PTR [esi] - ;; load <addr> - fst QWORD PTR _old_value$[ebp] - ;; store into old_value - fadd QWORD PTR _data$[ebp] - fstp QWORD PTR _new_value$[ebp] - ;; new_value = old_value + data - - mov edx, DWORD PTR _old_value$[ebp+4] - mov eax, DWORD PTR _old_value$[ebp] - ;; load old_value - mov ecx, DWORD PTR _new_value$[ebp+4] - mov ebx, DWORD PTR _new_value$[ebp] - ;; load new_value - -lock cmpxchg8b QWORD PTR [esi] - ;; Compare EDX:EAX with <addr>. If equal set - ;; ZF and load ECX:EBX into <addr>. Else, clear - ;; ZF and load <addr> into EDX:EAX. 
- jnz SHORT $L44 - - - fld QWORD PTR _old_value$[ebp] - ;; return old_value - pop edx - pop ecx - pop ebx - pop esi - mov esp, ebp - pop ebp - ret 0 -___kmp_test_then_add_real64 ENDP -_TEXT ENDS - ;------------------------------------------------------------------------ ; ; FUNCTION ___kmp_load_x87_fpu_control_word @@ -788,27 +673,6 @@ ifdef _M_AMD64 ;------------------------------------------------------------------------ ; -; FUNCTION __kmp_x86_pause -; -; void -; __kmp_x86_pause( void ) -; - -PUBLIC __kmp_x86_pause -_TEXT SEGMENT - ALIGN 16 -__kmp_x86_pause PROC ;NEAR - - db 0f3H - db 090H ; pause - ret - -__kmp_x86_pause ENDP -_TEXT ENDS - - -;------------------------------------------------------------------------ -; ; FUNCTION __kmp_x86_cpuid ; ; void @@ -1340,93 +1204,6 @@ lock xchg QWORD PTR [rcx], rax __kmp_xchg_real64 ENDP _TEXT ENDS - -;------------------------------------------------------------------------ -; -; FUNCTION __kmp_test_then_add_real32 -; -; kmp_real32 -; __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data ); -; -; parameters: -; addr: rcx -; data: xmm1 (lower 4 bytes) -; -; return: xmm0 (lower 4 bytes) - -PUBLIC __kmp_test_then_add_real32 -_TEXT SEGMENT - ALIGN 16 - -__kmp_test_then_add_real32 PROC ;NEAR -$__kmp_real32_loop: - movss xmm0, DWORD PTR [rcx] ; load value at <addr> - movd eax, xmm0 ; save old value at <addr> - - addss xmm0, xmm1 ; new value = old value + <data> - movd edx, xmm0 ; move new value to GP reg. - -lock cmpxchg DWORD PTR [rcx], edx - ; Compare EAX with <addr>. If equal set - ; ZF and exchange EDX with <addr>. Else, clear - ; ZF and load <addr> into EAX. - jz SHORT $__kmp_real32_success - - db 0f3H - db 090H ; pause - - jmp SHORT $__kmp_real32_loop - -$__kmp_real32_success: - movd xmm0, eax ; load old value into return register - ret -__kmp_test_then_add_real32 ENDP -_TEXT ENDS - - -;------------------------------------------------------------------------ -; -; FUNCTION __kmp_test_then_add_real64 -; -; kmp_real64 -; __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data ); -; -; parameters: -; addr: rcx -; data: xmm1 (lower 8 bytes) -; -; return: xmm0 (lower 8 bytes) - -PUBLIC __kmp_test_then_add_real64 -_TEXT SEGMENT - ALIGN 16 - -__kmp_test_then_add_real64 PROC ;NEAR -$__kmp_real64_loop: - movlpd xmm0, QWORD PTR [rcx] ; load value at <addr> - movd rax, xmm0 ; save old value at <addr> - - addsd xmm0, xmm1 ; new value = old value + <data> - movd rdx, xmm0 ; move new value to GP reg. - -lock cmpxchg QWORD PTR [rcx], rdx - ; Compare RAX with <addr>. If equal set - ; ZF and exchange RDX with <addr>. Else, clear - ; ZF and load <addr> into RAX. - jz SHORT $__kmp_real64_success - - db 0f3H - db 090H ; pause - - jmp SHORT $__kmp_real64_loop - -$__kmp_real64_success: - movd xmm0, rax ; load old value into return register - ret -__kmp_test_then_add_real64 ENDP -_TEXT ENDS - - ;------------------------------------------------------------------------ ; ; FUNCTION __kmp_load_x87_fpu_control_word diff --git a/openmp/runtime/src/z_Windows_NT-586_util.c b/openmp/runtime/src/z_Windows_NT-586_util.c index 19bd96b4401..9e35df45d41 100644 --- a/openmp/runtime/src/z_Windows_NT-586_util.c +++ b/openmp/runtime/src/z_Windows_NT-586_util.c @@ -1,7 +1,7 @@ /* * z_Windows_NT-586_util.c -- platform specific routines. 
- * $Revision: 42181 $ - * $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $ + * $Revision: 42951 $ + * $Date: 2014-01-21 14:41:41 -0600 (Tue, 21 Jan 2014) $ */ diff --git a/openmp/runtime/src/z_Windows_NT_util.c b/openmp/runtime/src/z_Windows_NT_util.c index 97e7fd20222..9442444ac9f 100644 --- a/openmp/runtime/src/z_Windows_NT_util.c +++ b/openmp/runtime/src/z_Windows_NT_util.c @@ -1,7 +1,7 @@ /* * z_Windows_NT_util.c -- platform specific routines. - * $Revision: 42816 $ - * $Date: 2013-11-11 15:33:37 -0600 (Mon, 11 Nov 2013) $ + * $Revision: 43389 $ + * $Date: 2014-08-11 10:54:01 -0500 (Mon, 11 Aug 2014) $ */ @@ -19,14 +19,15 @@ #include "kmp_itt.h" #include "kmp_i18n.h" #include "kmp_io.h" +#include "kmp_wait_release.h" /* ----------------------------------------------------------------------------------- */ /* ----------------------------------------------------------------------------------- */ -/* This code is related to NtQuerySystemInformation() function. This function - is used in the Load balance algorithm for OMP_DYNAMIC=true to find the +/* This code is related to NtQuerySystemInformation() function. This function + is used in the Load balance algorithm for OMP_DYNAMIC=true to find the number of running threads in the system. */ #include <ntstatus.h> @@ -140,32 +141,6 @@ static HMODULE kernel32 = NULL; /* ----------------------------------------------------------------------------------- */ /* ----------------------------------------------------------------------------------- */ - -// Why do we have multiple copies of __kmp_static_delay() and __kmp_static_yield() in many files? -#ifdef KMP_DEBUG - -static void -__kmp_static_delay( int arg ) { - /* Work around weird code-gen bug that causes assert to trip */ - #if KMP_ARCH_X86_64 && KMP_OS_LINUX - KMP_ASSERT( arg != 0 ); - #else - KMP_ASSERT( arg >= 0 ); - #endif -} - -#else - - #define __kmp_static_delay( arg ) /* nothing to do */ - -#endif /* KMP_DEBUG */ - -static void -__kmp_static_yield( int arg ) -{ - __kmp_yield( arg ); -} - #if KMP_HANDLE_SIGNALS typedef void (* sig_func_t )( int ); static sig_func_t __kmp_sighldrs[ NSIG ]; @@ -367,62 +342,50 @@ __kmp_suspend_uninitialize_thread( kmp_info_t *th ) } } -/* - * This routine puts the calling thread to sleep after setting the - * sleep bit for the indicated spin variable to true. +/* This routine puts the calling thread to sleep after setting the + * sleep bit for the indicated flag variable to true. */ - -void -__kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) +template <class C> +static inline void __kmp_suspend_template( int th_gtid, C *flag ) { kmp_info_t *th = __kmp_threads[th_gtid]; int status; - kmp_uint old_spin; + typename C::flag_t old_spin; - KF_TRACE( 30, ("__kmp_suspend: T#%d enter for spin = %p\n", th_gtid, spinner ) ); + KF_TRACE( 30, ("__kmp_suspend_template: T#%d enter for flag's loc(%p)\n", th_gtid, flag->get() ) ); __kmp_suspend_initialize_thread( th ); - __kmp_win32_mutex_lock( &th->th.th_suspend_mx ); - KF_TRACE( 10, ( "__kmp_suspend: T#%d setting sleep bit for spin(%p)\n", - th_gtid, spinner ) ); + KF_TRACE( 10, ( "__kmp_suspend_template: T#%d setting sleep bit for flag's loc(%p)\n", + th_gtid, flag->get() ) ); /* TODO: shouldn't this use release semantics to ensure that __kmp_suspend_initialize_thread gets called first? 
*/ - old_spin = KMP_TEST_THEN_OR32( (volatile kmp_int32 *) spinner, - KMP_BARRIER_SLEEP_STATE ); + old_spin = flag->set_sleeping(); - KF_TRACE( 5, ( "__kmp_suspend: T#%d set sleep bit for spin(%p)==%d\n", - th_gtid, spinner, *spinner ) ); + KF_TRACE( 5, ( "__kmp_suspend_template: T#%d set sleep bit for flag's loc(%p)==%d\n", + th_gtid, flag->get(), *(flag->get()) ) ); - if ( old_spin == checker ) { - KMP_TEST_THEN_AND32( (volatile kmp_int32 *) spinner, ~(KMP_BARRIER_SLEEP_STATE) ); - - KF_TRACE( 5, ( "__kmp_suspend: T#%d false alarm, reset sleep bit for spin(%p)\n", - th_gtid, spinner) ); + if ( flag->done_check_val(old_spin) ) { + old_spin = flag->unset_sleeping(); + KF_TRACE( 5, ( "__kmp_suspend_template: T#%d false alarm, reset sleep bit for flag's loc(%p)\n", + th_gtid, flag->get()) ); } else { #ifdef DEBUG_SUSPEND __kmp_suspend_count++; #endif - /* Encapsulate in a loop as the documentation states that this may * "with low probability" return when the condition variable has * not been signaled or broadcast */ int deactivated = FALSE; - TCW_PTR(th->th.th_sleep_loc, spinner); - while ( TCR_4( *spinner ) & KMP_BARRIER_SLEEP_STATE ) { - - KF_TRACE( 15, ("__kmp_suspend: T#%d about to perform kmp_win32_cond_wait()\n", + TCW_PTR(th->th.th_sleep_loc, (void *)flag); + while ( flag->is_sleeping() ) { + KF_TRACE( 15, ("__kmp_suspend_template: T#%d about to perform kmp_win32_cond_wait()\n", th_gtid ) ); - - - // - // Mark the thread as no longer active - // (only in the first iteration of the loop). - // + // Mark the thread as no longer active (only in the first iteration of the loop). if ( ! deactivated ) { th->th.th_active = FALSE; if ( th->th.th_active_in_pool ) { @@ -441,17 +404,14 @@ __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) } #ifdef KMP_DEBUG - if( (*spinner) & KMP_BARRIER_SLEEP_STATE ) { - KF_TRACE( 100, ("__kmp_suspend: T#%d spurious wakeup\n", th_gtid )); + if( flag->is_sleeping() ) { + KF_TRACE( 100, ("__kmp_suspend_template: T#%d spurious wakeup\n", th_gtid )); } #endif /* KMP_DEBUG */ - } // while + } // while - // - // Mark the thread as active again - // (if it was previous marked as inactive) - // + // Mark the thread as active again (if it was previous marked as inactive) if ( deactivated ) { th->th.th_active = TRUE; if ( TCR_4(th->th.th_in_pool) ) { @@ -465,66 +425,82 @@ __kmp_suspend( int th_gtid, volatile kmp_uint *spinner, kmp_uint checker ) __kmp_win32_mutex_unlock( &th->th.th_suspend_mx ); - KF_TRACE( 30, ("__kmp_suspend: T#%d exit\n", th_gtid ) ); + KF_TRACE( 30, ("__kmp_suspend_template: T#%d exit\n", th_gtid ) ); } +void __kmp_suspend_32(int th_gtid, kmp_flag_32 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +void __kmp_suspend_64(int th_gtid, kmp_flag_64 *flag) { + __kmp_suspend_template(th_gtid, flag); +} +void __kmp_suspend_oncore(int th_gtid, kmp_flag_oncore *flag) { + __kmp_suspend_template(th_gtid, flag); +} + + /* This routine signals the thread specified by target_gtid to wake up - * after setting the sleep bit indicated by the spin argument to FALSE + * after setting the sleep bit indicated by the flag argument to FALSE */ -void -__kmp_resume( int target_gtid, volatile kmp_uint *spin ) +template <class C> +static inline void __kmp_resume_template( int target_gtid, C *flag ) { kmp_info_t *th = __kmp_threads[target_gtid]; int status; - kmp_uint32 old_spin; #ifdef KMP_DEBUG int gtid = TCR_4(__kmp_init_gtid) ? 
__kmp_get_gtid() : -1; #endif - KF_TRACE( 30, ( "__kmp_resume: T#%d wants to wakeup T#%d enter\n", - gtid, target_gtid ) ); + KF_TRACE( 30, ( "__kmp_resume_template: T#%d wants to wakeup T#%d enter\n", gtid, target_gtid ) ); __kmp_suspend_initialize_thread( th ); - __kmp_win32_mutex_lock( &th->th.th_suspend_mx ); - if ( spin == NULL ) { - spin = (volatile kmp_uint *)TCR_PTR(th->th.th_sleep_loc); - if ( spin == NULL ) { - KF_TRACE( 5, ( "__kmp_resume: T#%d exiting, thread T#%d already awake - spin(%p)\n", - gtid, target_gtid, spin ) ); - - __kmp_win32_mutex_unlock( &th->th.th_suspend_mx ); - return; - } + if (!flag) { + flag = (C *)th->th.th_sleep_loc; } - TCW_PTR(th->th.th_sleep_loc, NULL); - old_spin = KMP_TEST_THEN_AND32( (kmp_int32 volatile *) spin, ~( KMP_BARRIER_SLEEP_STATE ) ); - - if ( ( old_spin & KMP_BARRIER_SLEEP_STATE ) == 0 ) { - KF_TRACE( 5, ( "__kmp_resume: T#%d exiting, thread T#%d already awake - spin(%p): " - "%u => %u\n", - gtid, target_gtid, spin, old_spin, *spin ) ); - + if (!flag) { + KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag's loc(%p)\n", + gtid, target_gtid, NULL ) ); __kmp_win32_mutex_unlock( &th->th.th_suspend_mx ); return; } + else { + typename C::flag_t old_spin = flag->unset_sleeping(); + if ( !flag->is_sleeping_val(old_spin) ) { + KF_TRACE( 5, ( "__kmp_resume_template: T#%d exiting, thread T#%d already awake: flag's loc(%p): " + "%u => %u\n", + gtid, target_gtid, flag->get(), old_spin, *(flag->get()) ) ); + __kmp_win32_mutex_unlock( &th->th.th_suspend_mx ); + return; + } + } TCW_PTR(th->th.th_sleep_loc, NULL); - KF_TRACE( 5, ( "__kmp_resume: T#%d about to wakeup T#%d, reset sleep bit for spin(%p)\n", - gtid, target_gtid, spin) ); + KF_TRACE( 5, ( "__kmp_resume_template: T#%d about to wakeup T#%d, reset sleep bit for flag's loc(%p)\n", + gtid, target_gtid, flag->get() ) ); __kmp_win32_cond_signal( &th->th.th_suspend_cv ); - __kmp_win32_mutex_unlock( &th->th.th_suspend_mx ); - KF_TRACE( 30, ( "__kmp_resume: T#%d exiting after signaling wake up for T#%d\n", + KF_TRACE( 30, ( "__kmp_resume_template: T#%d exiting after signaling wake up for T#%d\n", gtid, target_gtid ) ); } +void __kmp_resume_32(int target_gtid, kmp_flag_32 *flag) { + __kmp_resume_template(target_gtid, flag); +} +void __kmp_resume_64(int target_gtid, kmp_flag_64 *flag) { + __kmp_resume_template(target_gtid, flag); +} +void __kmp_resume_oncore(int target_gtid, kmp_flag_oncore *flag) { + __kmp_resume_template(target_gtid, flag); +} + + /* ------------------------------------------------------------------------ */ /* ------------------------------------------------------------------------ */ @@ -582,7 +558,6 @@ __kmp_get_proc_group( kmp_affin_mask_t const *mask ) { int i; int group = -1; - struct GROUP_AFFINITY new_ga, prev_ga; for (i = 0; i < __kmp_num_proc_groups; i++) { if (mask[i] == 0) { continue; @@ -607,7 +582,7 @@ __kmp_set_system_affinity( kmp_affin_mask_t const *mask, int abort_on_error ) // // Check for a valid mask. // - struct GROUP_AFFINITY ga; + GROUP_AFFINITY ga; int group = __kmp_get_proc_group( mask ); if (group < 0) { if (abort_on_error) { @@ -620,9 +595,9 @@ __kmp_set_system_affinity( kmp_affin_mask_t const *mask, int abort_on_error ) // Transform the bit vector into a GROUP_AFFINITY struct // and make the system call to set affinity. 
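// [editorial note, not in the original source] ga.group / ga.mask / ga.reserved
// below were members of what was evidently a runtime-local "struct GROUP_AFFINITY"
// declaration; these hunks switch to the Windows SDK typedef GROUP_AFFINITY
// (winnt.h, Windows 7 / Server 2008 R2 and later), whose members are capitalized
// (Group, Mask, Reserved), hence the renames in this and the following hunks.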
// - ga.group = group; - ga.mask = mask[group]; - ga.reserved[0] = ga.reserved[1] = ga.reserved[2] = 0; + ga.Group = group; + ga.Mask = mask[group]; + ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0; KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL); if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) { @@ -667,7 +642,7 @@ __kmp_get_system_affinity( kmp_affin_mask_t *mask, int abort_on_error ) if (__kmp_num_proc_groups > 1) { KMP_CPU_ZERO(mask); - struct GROUP_AFFINITY ga; + GROUP_AFFINITY ga; KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL); if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) { @@ -683,12 +658,12 @@ __kmp_get_system_affinity( kmp_affin_mask_t *mask, int abort_on_error ) return error; } - if ((ga.group < 0) || (ga.group > __kmp_num_proc_groups) - || (ga.mask == 0)) { + if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) + || (ga.Mask == 0)) { return -1; } - mask[ga.group] = ga.mask; + mask[ga.Group] = ga.Mask; } else @@ -750,12 +725,12 @@ __kmp_affinity_bind_thread( int proc ) // Form the GROUP_AFFINITY struct directly, rather than filling // out a bit vector and calling __kmp_set_system_affinity(). // - struct GROUP_AFFINITY ga; + GROUP_AFFINITY ga; KMP_DEBUG_ASSERT((proc >= 0) && (proc < (__kmp_num_proc_groups * CHAR_BIT * sizeof(DWORD_PTR)))); - ga.group = proc / (CHAR_BIT * sizeof(DWORD_PTR)); - ga.mask = (unsigned long long)1 << (proc % (CHAR_BIT * sizeof(DWORD_PTR))); - ga.reserved[0] = ga.reserved[1] = ga.reserved[2] = 0; + ga.Group = proc / (CHAR_BIT * sizeof(DWORD_PTR)); + ga.Mask = (unsigned long long)1 << (proc % (CHAR_BIT * sizeof(DWORD_PTR))); + ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0; KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL); if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) { @@ -875,7 +850,7 @@ __kmp_runtime_initialize( void ) #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ /* Set up minimum number of threads to switch to TLS gtid */ - #if KMP_OS_WINDOWS && ! defined GUIDEDLL_EXPORTS + #if KMP_OS_WINDOWS && ! defined GUIDEDLL_EXPORTS // Windows* OS, static library. /* New thread may use stack space previously used by another thread, currently terminated. @@ -977,7 +952,7 @@ __kmp_runtime_initialize( void ) // See if group affinity is supported on this system. // If so, calculate the #groups and #procs. // - // Group affinity was introduced with Windows* 7 OS and + // Group affinity was introduced with Windows* 7 OS and // Windows* Server 2008 R2 OS. // if ( ( __kmp_GetActiveProcessorCount != NULL ) @@ -1368,11 +1343,11 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) { kmp_thread_t handle; DWORD idThread; - + KA_TRACE( 10, ("__kmp_create_worker: try to create thread (%d)\n", gtid ) ); - + th->th.th_info.ds.ds_gtid = gtid; - + if ( KMP_UBER_GTID(gtid) ) { int stack_data; @@ -1411,12 +1386,12 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) /* Set stack size for this thread now. 
*/ KA_TRACE( 10, ( "__kmp_create_worker: stack_size = %" KMP_SIZE_T_SPEC " bytes\n", stack_size ) ); - + stack_size += gtid * __kmp_stkoffset; - + TCW_PTR(th->th.th_info.ds.ds_stacksize, stack_size); TCW_4(th->th.th_info.ds.ds_stackgrow, FALSE); - + KA_TRACE( 10, ( "__kmp_create_worker: (before) stack_size = %" KMP_SIZE_T_SPEC " bytes, &__kmp_launch_worker = %p, th = %p, " @@ -1424,13 +1399,13 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) (SIZE_T) stack_size, (LPTHREAD_START_ROUTINE) & __kmp_launch_worker, (LPVOID) th, &idThread ) ); - + { handle = CreateThread( NULL, (SIZE_T) stack_size, (LPTHREAD_START_ROUTINE) __kmp_launch_worker, (LPVOID) th, STACK_SIZE_PARAM_IS_A_RESERVATION, &idThread ); } - + KA_TRACE( 10, ( "__kmp_create_worker: (after) stack_size = %" KMP_SIZE_T_SPEC " bytes, &__kmp_launch_worker = %p, th = %p, " @@ -1438,7 +1413,7 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) (SIZE_T) stack_size, (LPTHREAD_START_ROUTINE) & __kmp_launch_worker, (LPVOID) th, idThread, handle ) ); - + { if ( handle == 0 ) { DWORD error = GetLastError(); @@ -1454,7 +1429,7 @@ __kmp_create_worker( int gtid, kmp_info_t *th, size_t stack_size ) } KMP_MB(); /* Flush all pending memory write invalidates. */ } - + KA_TRACE( 10, ("__kmp_create_worker: done creating thread (%d)\n", gtid ) ); } @@ -1601,7 +1576,6 @@ __kmp_reap_common( kmp_info_t * th ) KMP_FSYNC_SPIN_PREPARE( obj ); #endif /* USE_ITT_BUILD */ __kmp_is_thread_alive( th, &exit_val ); - __kmp_static_delay( TRUE ); KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc ); KMP_YIELD_SPIN( spins ); } while ( exit_val == STILL_ACTIVE && TCR_4( th->th.th_info.ds.ds_alive ) ); |
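
[Editor's note] The z_Windows_NT_util.c hunks above replace the old spin-word based __kmp_suspend()/__kmp_resume() pair with __kmp_suspend_template()/__kmp_resume_template(), parameterized over the flag classes (kmp_flag_32, kmp_flag_64, kmp_flag_oncore) declared in the newly included kmp_wait_release.h. The sketch below is only a minimal, self-contained illustration of that sleep-bit pattern: the names demo_flag32, demo_waiter, and SLEEP_BIT, and the use of portable C++11 mutex/condition-variable primitives, are invented stand-ins, not the runtime's actual kmp_wait_release.h implementation or its Win32 wrappers.

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>

// Simplified stand-in for the kmp_flag_32 pattern: the high bit of the watched
// word doubles as the "a thread is sleeping on this" marker.
struct demo_flag32 {
    using flag_t = uint32_t;
    static constexpr flag_t SLEEP_BIT = 0x80000000u;   // plays the role of KMP_BARRIER_SLEEP_STATE

    std::atomic<flag_t> loc;   // the word being waited on (payload + sleep bit)
    flag_t checker;            // payload value that means "done"

    demo_flag32(flag_t initial, flag_t done) : loc(initial), checker(done) {}

    flag_t set_sleeping()   { return loc.fetch_or(SLEEP_BIT); }    // returns the old value
    flag_t unset_sleeping() { return loc.fetch_and(~SLEEP_BIT); }  // returns the old value
    bool   is_sleeping() const             { return (loc.load() & SLEEP_BIT) != 0u; }
    bool   is_sleeping_val(flag_t v) const { return (v & SLEEP_BIT) != 0u; }
    bool   done_check_val(flag_t v) const  { return (v & ~SLEEP_BIT) == checker; }
};

struct demo_waiter {           // plays the role of th_suspend_mx / th_suspend_cv
    std::mutex mx;
    std::condition_variable cv;
};

// Same control flow as __kmp_suspend_template: publish the sleep bit, re-check
// the condition to avoid a lost wakeup, then block until the bit is cleared.
template <class C>
void demo_suspend(demo_waiter &w, C *flag) {
    std::unique_lock<std::mutex> lk(w.mx);
    typename C::flag_t old_spin = flag->set_sleeping();
    if (flag->done_check_val(old_spin)) {              // false alarm: already done
        flag->unset_sleeping();
        return;
    }
    w.cv.wait(lk, [&] { return !flag->is_sleeping(); });  // tolerates spurious wakeups
}

// Same control flow as __kmp_resume_template: clear the sleep bit under the
// lock and signal only if the target really was sleeping.
template <class C>
void demo_resume(demo_waiter &w, C *flag) {
    std::lock_guard<std::mutex> lk(w.mx);
    typename C::flag_t old_spin = flag->unset_sleeping();
    if (!flag->is_sleeping_val(old_spin))
        return;                                        // already awake, nothing to signal
    w.cv.notify_one();
}

int main() {
    demo_waiter w;
    demo_flag32 flag(/*initial=*/0u, /*done=*/1u);
    std::thread waiter([&] { demo_suspend(w, &flag); std::puts("woken"); });
    flag.loc.fetch_or(1u);      // releaser publishes the "done" payload value...
    demo_resume(w, &flag);      // ...then wakes the sleeper if it went to sleep
    waiter.join();
    return 0;
}

As in the hunks above, the waiter publishes its intent to sleep by OR-ing a bit into the word it is spinning on, re-checks the exit condition before blocking to close the lost-wakeup window, and the releaser clears that bit under the same mutex and signals only when the old value actually had the sleep bit set.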