diff options
| author | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-08-28 02:02:53 +0000 |
|---|---|---|
| committer | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-08-28 02:02:53 +0000 |
| commit | be3d4873130096ee722ccad2eff392bc6955b333 (patch) | |
| tree | dc1c7b44f4944c91d0dd040904b53b1d49a07514 | |
| parent | e73e3013a646b9c5d3e11fac10afcfff464cb895 (diff) | |
| download | bcm5719-llvm-be3d4873130096ee722ccad2eff392bc6955b333.tar.gz bcm5719-llvm-be3d4873130096ee722ccad2eff392bc6955b333.zip | |
[libomptarget] Refactor syncwarp macro to inline function
Summary:
[libomptarget] Refactor syncwarp macro to inline function
See also abandoned D66846, split into this diff and others.
Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers
Subscribers: openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D66857
llvm-svn: 370149
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 3 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h | 10 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h | 8 |
3 files changed, 13 insertions, 8 deletions
```diff
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 0a0c6cc4334..5519f647655 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -52,11 +52,8 @@
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #elif CUDA_VERSION >= 9000
 #define __ACTIVEMASK() __activemask()
-#define __SYNCWARP(Mask) __syncwarp(Mask)
 #else
 #define __ACTIVEMASK() __ballot(1)
-// In Cuda < 9.0 no need to sync threads in warps.
-#define __SYNCWARP(Mask)
 #endif // CUDA_VERSION
 
 #define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
index ceed7d3f7c8..c1a84679649 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -14,6 +14,8 @@
 // Execution Parameters
 ////////////////////////////////////////////////////////////////////////////////
 
+#include "target_impl.h"
+
 INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
   execution_param = EMode;
   execution_param |= RMode;
@@ -203,7 +205,7 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
 
 INLINE void IncParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
   unsigned Rank = __popc(Active & LaneMaskLt);
@@ -212,12 +214,12 @@ INLINE void IncParallelLevel(bool ActiveParallel) {
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
     __threadfence();
   }
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
 }
 
 INLINE void DecParallelLevel(bool ActiveParallel) {
   unsigned Active = __ACTIVEMASK();
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
   unsigned LaneMaskLt;
   asm("mov.u32 %0, %%lanemask_lt;" : "=r"(LaneMaskLt));
   unsigned Rank = __popc(Active & LaneMaskLt);
@@ -226,7 +228,7 @@ INLINE void DecParallelLevel(bool ActiveParallel) {
         (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
     __threadfence();
   }
-  __SYNCWARP(Active);
+  __kmpc_impl_syncwarp(Active);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 144f7ab1d79..91883eaea54 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -63,6 +63,12 @@ INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
 #endif // CUDA_VERSION
 }
 
-INLINE void __kmpc_impl_syncwarp(int32_t Mask) { __SYNCWARP(Mask); }
+INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
+#if CUDA_VERSION >= 9000
+  __syncwarp(Mask);
+#else
+  // In Cuda < 9.0 no need to sync threads in warps.
+#endif // CUDA_VERSION
+}
 
 #endif
```

