summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu26
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/support.h5
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h18
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c12
4 files changed, 45 insertions, 16 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index d8049d9aaf4..5db443c3b33 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -311,7 +311,16 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
(int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
- IncParallelLevel(threadsInTeam != 1);
+ // Reconverge the threads at the end of the parallel region to correctly
+ // handle parallel levels.
+ // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
+ // warp. If only 1 thread is active, not need to reconverge the threads.
+ // If we have the whole warp, reconverge all the threads in the warp before
+ // actually trying to change the parallel level. Otherwise, parallel level
+ // can be changed incorrectly because of threads divergence.
+ bool IsActiveParallelRegion = threadsInTeam != 1;
+ IncParallelLevel(IsActiveParallelRegion,
+ IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
}
return isActive;
@@ -329,7 +338,16 @@ EXTERN void __kmpc_kernel_end_parallel() {
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
- DecParallelLevel(threadsInTeam != 1);
+ // Reconverge the threads at the end of the parallel region to correctly
+ // handle parallel levels.
+ // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole
+ // warp. If only 1 thread is active, not need to reconverge the threads.
+ // If we have the whole warp, reconverge all the threads in the warp before
+ // actually trying to change the parallel level. Otherwise, parallel level can
+ // be changed incorrectly because of threads divergence.
+ bool IsActiveParallelRegion = threadsInTeam != 1;
+ DecParallelLevel(IsActiveParallelRegion,
+ IsActiveParallelRegion ? 0xFFFFFFFF : 1u);
}
////////////////////////////////////////////////////////////////////////////////
@@ -339,7 +357,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
- IncParallelLevel(/*ActiveParallel=*/false);
+ IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
@@ -378,7 +396,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
uint32_t global_tid) {
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
- DecParallelLevel(/*ActiveParallel=*/false);
+ DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());
if (checkRuntimeUninitialized(loc)) {
ASSERT0(LT_FUSSY, checkSPMDMode(loc),
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
index 4df75edfbaa..e10f2a19d32 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
@@ -65,8 +66,8 @@ INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
INLINE int IsTeamMaster(int ompThreadId);
// Parallel level
-INLINE void IncParallelLevel(bool ActiveParallel);
-INLINE void DecParallelLevel(bool ActiveParallel);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
////////////////////////////////////////////////////////////////////////////////
// Memory
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 1a0f1a9ecd1..d4da6ad73fa 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -203,30 +203,28 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
////////////////////////////////////////////////////////////////////////////////
// Parallel level
-INLINE void IncParallelLevel(bool ActiveParallel) {
- __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
- __kmpc_impl_syncwarp(Active);
+INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+ __kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
- unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
+ unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
- __kmpc_impl_syncwarp(Active);
+ __kmpc_impl_syncwarp(Mask);
}
-INLINE void DecParallelLevel(bool ActiveParallel) {
- __kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
- __kmpc_impl_syncwarp(Active);
+INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
+ __kmpc_impl_syncwarp(Mask);
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
- unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
+ unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
- __kmpc_impl_syncwarp(Active);
+ __kmpc_impl_syncwarp(Mask);
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
index edb00e09e60..33e23246456 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
+++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c
@@ -135,5 +135,17 @@ int main(int argc, char *argv[]) {
}
}
+ // Check for paraller level in non-SPMD kernels.
+ level = 0;
+ #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
+ for (int i=0; i<5032; i+=32) {
+ int ub = (i+32 > 5032) ? 5032 : i+32;
+ #pragma omp parallel for schedule(dynamic)
+ for (int j=i ; j < ub; j++) ;
+ level += omp_get_level();
+ }
+ // CHECK: Integral level = 0.
+ printf("Integral level = %d.\n", level);
+
return 0;
}
OpenPOWER on IntegriCloud