diff options
| author | Alexey Bataev <a.bataev@hotmail.com> | 2019-08-23 18:34:48 +0000 |
|---|---|---|
| committer | Alexey Bataev <a.bataev@hotmail.com> | 2019-08-23 18:34:48 +0000 |
| commit | 0366168f3ac6c838fa41cddc02db84d48403f21e (patch) | |
| tree | 98a44872ce06de19945aac46c572d193c7b7d9f9 | |
| parent | 9cb059fdcc03e2144fa9aac0bfbb49c0d5d7efe2 (diff) | |
| download | bcm5719-llvm-0366168f3ac6c838fa41cddc02db84d48403f21e.tar.gz bcm5719-llvm-0366168f3ac6c838fa41cddc02db84d48403f21e.zip | |
[OPENMP][NVPTX]Use __syncwarp() to reconverge the threads.
Summary:
Starting with CUDA 9.0, the threads in a warp are no longer guaranteed to
be convergent. We need to use the __syncwarp() function to reconverge
the threads and to guarantee memory ordering among the threads in a
warp.
This is the first patch in a series to fix the problem with the test
libomptarget/deviceRTLs/nvptx/src/sync.cu on CUDA 9+.
This patch replaces calls to the __shfl_sync() function with calls to
the __syncwarp() function at the points where the threads must be
reconverged before modifying the value of the parallel level counter.
Reviewers: grokos
Subscribers: guansong, jfb, jdoerfert, caomhin, kkwli0, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D65013
llvm-svn: 369796
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 3 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h | 26 |
2 files changed, 19 insertions, 10 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index f28284ded6b..a5e4a71bdf3 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -55,11 +55,14 @@
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down_sync((mask), (var), (delta), (width))
 #define __ACTIVEMASK() __activemask()
+#define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
 #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
 #define __SHFL_DOWN_SYNC(mask, var, delta, width)                              \
   __shfl_down((var), (delta), (width))
 #define __ACTIVEMASK() __ballot(1)
+// In Cuda < 9.0 no need to sync threads in warps.
+#define __SYNCWARP(Mask)
 #endif // CUDA_VERSION

 #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index ceb395153f1..ceed7d3f7c8 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -202,25 +202,31 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }

 // Parallel level

 INLINE void IncParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] +=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
 }

 INLINE void DecParallelLevel(bool ActiveParallel) {
-  unsigned tnum = __ACTIVEMASK();
-  int leader = __ffs(tnum) - 1;
-  __SHFL_SYNC(tnum, leader, leader);
-  if (GetLaneId() == leader) {
+  unsigned Active = __ACTIVEMASK();
+  __SYNCWARP(Active);
+  unsigned LaneMaskLt;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
+  unsigned Rank = __popc(Active & LaneMaskLt);
+  if (Rank == 0) {
     parallelLevel[GetWarpId()] -=
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
+    __threadfence();
   }
-  __SHFL_SYNC(tnum, leader, leader);
+  __SYNCWARP(Active);
 }

 ////////////////////////////////////////////////////////////////////////////////

