[OPENMP, NVPTX] Fixes for NVPTX RTL

Summary: This patch fixes several problems in the implementation of the NVPTX RTL: 1. Detection of the last iteration for loops with static scheduling and no chunks. 2. Reductions in serialized parallel constructs. 3. Handling of barriers. Reviewers: grokos Reviewed By: grokos Subscribers: Hahnfeld, guansong, openmp-commits Differential Revision: https://reviews.llvm.org/D48480 llvm-svn: 335469
author: Alexey Bataev <a.bataev@hotmail.com> 2018-06-25 13:43:35 +0000
committer: Alexey Bataev <a.bataev@hotmail.com> 2018-06-25 13:43:35 +0000
commit: 0ac29350b5c081fa0dd469ca618a5857d659f3b0 (patch)
tree: 10e49f5d41de20e52609014487d692ae5411819a
parent: 750ea090eba50c004e49378e098f895fddeff0f1 (diff)
download: bcm5719-llvm-0ac29350b5c081fa0dd469ca618a5857d659f3b0.tar.gz
bcm5719-llvm-0ac29350b5c081fa0dd469ca618a5857d659f3b0.zip
3 files changed, 36 insertions, 32 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 80fb61ff484..0e808df048c 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -86,7 +86,7 @@ public:
 
     T inputUb = ub;
     ub = lb + chunk - 1; // Clang uses i <= ub
-    last = ub == inputUb;
+    last = lb <= inputUb && inputUb <= ub;
     stride = loopSize; // make sure we only do 1 chunk per warp
   }
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index afa8e81eb83..aedb6359f9a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -161,6 +161,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                      kmp_InterWarpCopyFctPtr cpyFct,
                                      bool isSPMDExecutionMode,
                                      bool isRuntimeUninitialized = false) {
+  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+  uint32_t NumThreads = GetNumberOfOmpThreads(
+      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+  if (NumThreads == 1)
+    return 1;
   /*
    * This reduce function handles reduction within a team. It handles
    * parallel regions in both L1 and L2 parallelism levels. It also
@@ -173,9 +178,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
    */
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
   uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
   uint32_t WarpId = BlockThreadId / WARPSIZE;
 
@@ -219,10 +221,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                     // early.
     return gpu_irregular_simd_reduce(reduce_data, shflFct);
 
-  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
-  uint32_t NumThreads = GetNumberOfOmpThreads(
-      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
-
   // When we have more than [warpsize] number of threads
   // a block reduction is performed here.
   //
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
index a577d7a6c53..68f08a16ac4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -35,40 +35,46 @@ EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
 
 EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
   PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
-  __syncthreads();
+  __kmpc_barrier(loc_ref, tid);
   PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
   return 0;
 }
 
 EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
-  tid = GetLogicalThreadIdInBlock();
-  omptarget_nvptx_TaskDescr *currTaskDescr =
-      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
-  if (!currTaskDescr->InL2OrHigherParallelRegion()) {
-    int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+  if (isSPMDMode()) {
+    __kmpc_barrier_simple_spmd(loc_ref, tid);
+  } else if (isRuntimeUninitialized()) {
+    __kmpc_barrier_simple_generic(loc_ref, tid);
+  } else {
+    tid = GetLogicalThreadIdInBlock();
+    omptarget_nvptx_TaskDescr *currTaskDescr =
+        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
+    if (!currTaskDescr->InL2OrHigherParallelRegion()) {
+      int numberOfActiveOMPThreads =
+          GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    // On Volta and newer architectures we require that all lanes in
-    // a warp (at least, all present for the kernel launch) participate in the
-    // barrier.  This is enforced when launching the parallel region.  An
-    // exception is when there are < WARPSIZE workers.  In this case only 1
-    // worker is started, so we don't need a barrier.
-    if (numberOfActiveOMPThreads > 1) {
+      // On Volta and newer architectures we require that all lanes in
+      // a warp (at least, all present for the kernel launch) participate in the
+      // barrier.  This is enforced when launching the parallel region.  An
+      // exception is when there are < WARPSIZE workers.  In this case only 1
+      // worker is started, so we don't need a barrier.
+      if (numberOfActiveOMPThreads > 1) {
 #endif
-      // The #threads parameter must be rounded up to the WARPSIZE.
-      int threads =
-          WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
-      PRINT(LD_SYNC,
-            "call kmpc_barrier with %d omp threads, sync parameter %d\n",
-            numberOfActiveOMPThreads, threads);
-      // Barrier #1 is for synchronization among active threads.
-      named_sync(L1_BARRIER, threads);
+        // The #threads parameter must be rounded up to the WARPSIZE.
+        int threads =
+            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+        PRINT(LD_SYNC,
+              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+              numberOfActiveOMPThreads, threads);
+        // Barrier #1 is for synchronization among active threads.
+        named_sync(L1_BARRIER, threads);
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-    } // numberOfActiveOMPThreads > 1
+      } // numberOfActiveOMPThreads > 1
 #endif
+    }
+    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
   }
-  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
 }
 
 // Emit a simple barrier call in SPMD mode.  Assumes the caller is in an L0
author	Alexey Bataev <a.bataev@hotmail.com>	2018-06-25 13:43:35 +0000
committer	Alexey Bataev <a.bataev@hotmail.com>	2018-06-25 13:43:35 +0000
commit	0ac29350b5c081fa0dd469ca618a5857d659f3b0 (patch)
tree	10e49f5d41de20e52609014487d692ae5411819a
parent	750ea090eba50c004e49378e098f895fddeff0f1 (diff)
download	bcm5719-llvm-0ac29350b5c081fa0dd469ca618a5857d659f3b0.tar.gz bcm5719-llvm-0ac29350b5c081fa0dd469ca618a5857d659f3b0.zip