diff options
-rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 3 | ||||
-rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h | 10 | ||||
-rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h | 8 |
3 files changed, 13 insertions, 8 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 0a0c6cc4334..5519f647655 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -52,11 +52,8 @@
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #elif CUDA_VERSION >= 9000
 #define __ACTIVEMASK() __activemask()
-#define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
 #define __ACTIVEMASK() __ballot(1)
-// In Cuda < 9.0 no need to sync threads in warps.
-#define __SYNCWARP(Mask)
 #endif // CUDA_VERSION
 
 #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index ceed7d3f7c8..c1a84679649 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -14,6 +14,8 @@
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
+#include "target_impl.h"
+
 INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
@@ -203,7 +205,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 
 INLINE void IncParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
   unsigned Rank = __popc(Active & LaneMaskLt);
@@ -212,12 +214,12 @@ INLINE void IncParallelLevel(bool ActiveParallel) {
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
     __threadfence();
   }
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
 }
 
 INLINE void DecParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
   unsigned Rank = __popc(Active & LaneMaskLt);
@@ -226,7 +228,7 @@ INLINE void DecParallelLevel(bool ActiveParallel) {
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
     __threadfence();
   }
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 144f7ab1d79..91883eaea54 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -63,6 +63,12 @@ INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
 #endif // CUDA_VERSION
 }
 
-INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
+INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
+#if CUDA_VERSION >= 9000
+  __syncwarp(Mask);
+#else
+  // In Cuda < 9.0 no need to sync threads in warps.
+#endif // CUDA_VERSION
+}
 
 #endif