summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-09-03 18:11:50 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-09-03 18:11:50 +0000
commit481294177677023d312271f736d56475195ecd40 (patch)
treecbf372de582f18ad302eac7ef76e9e4c68c26ef7
parentb187eef616438deb0fd9318eb4d0e11d08463940 (diff)
downloadbcm5719-llvm-481294177677023d312271f736d56475195ecd40.tar.gz
bcm5719-llvm-481294177677023d312271f736d56475195ecd40.zip
[OPENMP][NVPTX]Fix parallel level counter in non-SPMD mode.
Summary: In non-SPMD mode we may end up with the divergent threads when trying to increment/decrement parallel level counter. It may lead to incorrect calculations of the parallel level and wrong results when threads are divergent. We need to reconverge the threads before trying to modify the parallel level counter. Reviewers: grokos, jdoerfert Subscribers: guansong, openmp-commits, caomhin, kkwli0 Tags: #openmp Differential Revision: https://reviews.llvm.org/D66802 llvm-svn: 370803
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu26
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/support.h5
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h18
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c12
4 files changed, 45 insertions, 16 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index d8049d9aaf4..5db443c3b33 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -311,7 +311,16 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
(int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
- IncParallelLevel(threadsInTeam != 1);
+ // Reconverge the threads at the end of the parallel region to correctly
+ // handle parallel levels.
+ // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
+ // warp. If only 1 thread is active, not need to reconverge the threads.
+ // If we have the whole warp, reconverge all the threads in the warp before
+ // actually trying to change the parallel level. Otherwise, parallel level
+ // can be changed incorrectly because of threads divergence.
+ bool IsActiveParallelRegion = threadsInTeam != 1;
+ IncParallelLevel(IsActiveParallelRegion,
+ IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
}
return isActive;
@@ -329,7 +338,16 @@ EXTERN void __kmpc_kernel_end_parallel() {
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
- DecParallelLevel(threadsInTeam != 1);
+ // Reconverge the threads at the end of the parallel region to correctly
+ // handle parallel levels.
+ // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
+ // warp. If only 1 thread is active, not need to reconverge the threads.
+ // If we have the whole warp, reconverge all the threads in the warp before
+ // actually trying to change the parallel level. Otherwise, parallel level can
+ // be changed incorrectly because of threads divergence.
+ bool IsActiveParallelRegion = threadsInTeam != 1;
+ DecParallelLevel(IsActiveParallelRegion,
+ IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
}
////////////////////////////////////////////////////////////////////////////////
@@ -339,7 +357,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
- IncParallelLevel(/*ActiveParallel=*/false);
+ IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
@@ -378,7 +396,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
- DecParallelLevel(/*ActiveParallel=*/false);
+ DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
index 4df75edfbaa..e10f2a19d32 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
@@ -65,8 +66,8 @@ INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
INLINE int IsTeamMaster(int ompThreadId);
// Parallel level
-INLINE void IncParallelLevel(bool ActiveParallel);
-INLINE void DecParallelLevel(bool ActiveParallel);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
////////////////////////////////////////////////////////////////////////////////
// Memory
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 1a0f1a9ecd1..d4da6ad73fa 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -203,30 +203,28 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
////////////////////////////////////////////////////////////////////////////////
// Parallel level
-INLINE void IncParallelLevel(bool ActiveParallel) {
- __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
- __kmpc_impl_syncwarp(Active);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+ __kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
- unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
+ unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
- __kmpc_impl_syncwarp(Active);
+ __kmpc_impl_syncwarp(Mask);
}
-INLINE void DecParallelLevel(bool ActiveParallel) {
- __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
- __kmpc_impl_syncwarp(Active);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+ __kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
- unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
+ unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
- __kmpc_impl_syncwarp(Active);
+ __kmpc_impl_syncwarp(Mask);
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
index edb00e09e60..33e23246456 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
+++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
@@ -135,5 +135,17 @@ int main(int argc, char *argv[]) {
}
}
+ // Check for paraller level in non-SPMD kernels.
+ level = 0;
+ #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
+ for (int i=0; i<5032; i+=32) {
+ int ub = (i+32 > 5032) ? 5032 : i+32;
+ #pragma omp parallel for schedule(dynamic)
+ for (int j=i ; j < ub; j++) ;
+ level += omp_get_level();
+ }
+ // CHECK: Integral level = 0.
+ printf("Integral level = %d.\n", level);
+
return 0;
}
OpenPOWER on IntegriCloud