summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-05-22 19:50:32 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-05-22 19:50:32 +0000
commit9d9e406684de7ae2da5c3ee2a45eca80eb0e3534 (patch)
tree7e13ef39b99f69183828d7d240c9666c2439be67
parentbb7357750e7f6035313ccc300b6d9e1b0ffd6b19 (diff)
downloadbcm5719-llvm-9d9e406684de7ae2da5c3ee2a45eca80eb0e3534.tar.gz
bcm5719-llvm-9d9e406684de7ae2da5c3ee2a45eca80eb0e3534.zip
[OPENMP][NVPTX]Fix barriers and parallel level counters, NFC.
Summary: Parallel level counter should be volatile to prevent some dangerous optimiations by the ptxas. Otherwise, ptxas optimizations lead to undefined behaviour in some cases. Also, use __threadfence() for #pragma omp flush and if the barrier should not be used (we have only one thread in the team), still perform flush operation since the standard requires implicit flush when executing barriers. Reviewers: gtbercea, kkwli0, grokos Subscribers: guansong, jfb, jdoerfert, openmp-commits, caomhin Tags: #openmp Differential Revision: https://reviews.llvm.org/D62199 llvm-svn: 361421
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu3
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h2
-rw-r--r--openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu4
3 files changed, 6 insertions, 3 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
index d369da1cb7e..0cd9b57fd7c 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -31,7 +31,8 @@ __device__ omptarget_nvptx_SimpleMemoryManager
__device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;
-__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+__device__ __shared__ volatile uint8_t
+ parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
__device__ __shared__ uint16_t threadLimit;
__device__ __shared__ uint16_t threadsInTeam;
__device__ __shared__ uint16_t nThreads;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index cd51538ad79..b85d0a750f2 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -398,7 +398,7 @@ extern __device__ omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
-extern __device__ __shared__ uint8_t
+extern __device__ __shared__ volatile uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__ uint16_t threadsInTeam;
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
index d81aa8f0f3a..191b046c9f4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -62,6 +62,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
// Barrier #1 is for synchronization among active threads.
named_sync(L1_BARRIER, threads);
}
+ } else {
+ __kmpc_flush(loc_ref);
} // numberOfActiveOMPThreads > 1
PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
@@ -130,7 +132,7 @@ EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
EXTERN void __kmpc_flush(kmp_Ident *loc) {
PRINT0(LD_IO, "call kmpc_flush\n");
- __threadfence_system();
+ __threadfence();
}
////////////////////////////////////////////////////////////////////////////////
OpenPOWER on IntegriCloud