summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu      |  2
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu | 12
-rw-r--r-- openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu      | 54
3 files changed, 36 insertions(+), 32 deletions(-)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
index 80fb61ff484..0e808df048c 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -86,7 +86,7 @@ public:
T inputUb = ub;
ub = lb + chunk - 1; // Clang uses i <= ub
- last = ub == inputUb;
+ last = lb <= inputUb && inputUb <= ub;
stride = loopSize; // make sure we only do 1 chunk per warp
}
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index afa8e81eb83..aedb6359f9a 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -161,6 +161,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode,
bool isRuntimeUninitialized = false) {
+ uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+ uint32_t NumThreads = GetNumberOfOmpThreads(
+ BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
+ if (NumThreads == 1)
+ return 1;
/*
* This reduce function handles reduction within a team. It handles
* parallel regions in both L1 and L2 parallelism levels. It also
@@ -173,9 +178,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
*/
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
- uint32_t NumThreads = GetNumberOfOmpThreads(
- BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
uint32_t WarpId = BlockThreadId / WARPSIZE;
@@ -219,10 +221,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
// early.
return gpu_irregular_simd_reduce(reduce_data, shflFct);
- uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
- uint32_t NumThreads = GetNumberOfOmpThreads(
- BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
-
// When we have more than [warpsize] number of threads
// a block reduction is performed here.
//
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
index a577d7a6c53..68f08a16ac4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -35,40 +35,46 @@ EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
- __syncthreads();
+ __kmpc_barrier(loc_ref, tid);
PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
return 0;
}
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
- tid = GetLogicalThreadIdInBlock();
- omptarget_nvptx_TaskDescr *currTaskDescr =
- omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
- if (!currTaskDescr->InL2OrHigherParallelRegion()) {
- int numberOfActiveOMPThreads =
- GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+ if (isSPMDMode()) {
+ __kmpc_barrier_simple_spmd(loc_ref, tid);
+ } else if (isRuntimeUninitialized()) {
+ __kmpc_barrier_simple_generic(loc_ref, tid);
+ } else {
+ tid = GetLogicalThreadIdInBlock();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
+ if (!currTaskDescr->InL2OrHigherParallelRegion()) {
+ int numberOfActiveOMPThreads =
+ GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- // On Volta and newer architectures we require that all lanes in
- // a warp (at least, all present for the kernel launch) participate in the
- // barrier. This is enforced when launching the parallel region. An
- // exception is when there are < WARPSIZE workers. In this case only 1
- // worker is started, so we don't need a barrier.
- if (numberOfActiveOMPThreads > 1) {
+ // On Volta and newer architectures we require that all lanes in
+ // a warp (at least, all present for the kernel launch) participate in the
+ // barrier. This is enforced when launching the parallel region. An
+ // exception is when there are < WARPSIZE workers. In this case only 1
+ // worker is started, so we don't need a barrier.
+ if (numberOfActiveOMPThreads > 1) {
#endif
- // The #threads parameter must be rounded up to the WARPSIZE.
- int threads =
- WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
-
- PRINT(LD_SYNC,
- "call kmpc_barrier with %d omp threads, sync parameter %d\n",
- numberOfActiveOMPThreads, threads);
- // Barrier #1 is for synchronization among active threads.
- named_sync(L1_BARRIER, threads);
+ // The #threads parameter must be rounded up to the WARPSIZE.
+ int threads =
+ WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);
+
+ PRINT(LD_SYNC,
+ "call kmpc_barrier with %d omp threads, sync parameter %d\n",
+ numberOfActiveOMPThreads, threads);
+ // Barrier #1 is for synchronization among active threads.
+ named_sync(L1_BARRIER, threads);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
- } // numberOfActiveOMPThreads > 1
+ } // numberOfActiveOMPThreads > 1
#endif
+ }
+ PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
- PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
OpenPOWER on IntegriCloud