summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-08-23 18:34:48 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-08-23 18:34:48 +0000
commit0366168f3ac6c838fa41cddc02db84d48403f21e (patch)
tree98a44872ce06de19945aac46c572d193c7b7d9f9
parent9cb059fdcc03e2144fa9aac0bfbb49c0d5d7efe2 (diff)
downloadbcm5719-llvm-0366168f3ac6c838fa41cddc02db84d48403f21e.tar.gz
bcm5719-llvm-0366168f3ac6c838fa41cddc02db84d48403f21e.zip
[OPENMP][NVPTX]Use __syncwarp() to reconverge the threads.
Summary:
In Cuda 9.0 it is not guaranteed that threads in the warps are convergent. We
need to use __syncwarp() function to reconverge the threads and to guarantee
the memory ordering among threads in the warps. This is the first patch to fix
the problem with the test libomptarget/deviceRTLs/nvptx/src/sync.cu on Cuda9+.
This patch just replaces calls to __shfl_sync() function with the call of
__syncwarp() function where we need to reconverge the threads when we try to
modify the value of the parallel level counter.

Reviewers: grokos

Subscribers: guansong, jfb, jdoerfert, caomhin, kkwli0, openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D65013

llvm-svn: 369796
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h3
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h26
2 files changed, 19 insertions, 10 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index f28284ded6b..a5e4a71bdf3 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -55,11 +55,14 @@
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down_sync((mask), (var), (delta), (width))
#define __ACTIVEMASK() __activemask()
+#define __SYNCWARP(Mask) __syncwarp(Mask)
#else
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
+// In Cuda < 9.0 no need to sync threads in warps.
+#define __SYNCWARP(Mask)
#endif // CUDA_VERSION
#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index ceb395153f1..ceed7d3f7c8 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -202,25 +202,31 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
// Parallel level
INLINE void IncParallelLevel(bool ActiveParallel) {
- unsigned tnum = __ACTIVEMASK();
- int leader = __ffs(tnum) - 1;
- __SHFL_SYNC(tnum, leader, leader);
- if (GetLaneId() == leader) {
+ unsigned Active = __ACTIVEMASK();
+ __SYNCWARP(Active);
+ unsigned LaneMaskLt;
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+ unsigned Rank = __popc(Active & LaneMaskLt);
+ if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+ __threadfence();
}
- __SHFL_SYNC(tnum, leader, leader);
+ __SYNCWARP(Active);
}
INLINE void DecParallelLevel(bool ActiveParallel) {
- unsigned tnum = __ACTIVEMASK();
- int leader = __ffs(tnum) - 1;
- __SHFL_SYNC(tnum, leader, leader);
- if (GetLaneId() == leader) {
+ unsigned Active = __ACTIVEMASK();
+ __SYNCWARP(Active);
+ unsigned LaneMaskLt;
+ asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+ unsigned Rank = __popc(Active & LaneMaskLt);
+ if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+ __threadfence();
}
- __SHFL_SYNC(tnum, leader, leader);
+ __SYNCWARP(Active);
}
////////////////////////////////////////////////////////////////////////////////
OpenPOWER on IntegriCloud