diff options
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h | 4 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h | 18 |
2 files changed, 11 insertions, 11 deletions
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index e0d6bb1953c..4e8232974e7 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -101,8 +101,6 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { return __lanemask_gt(); } -EXTERN bool __kmpc_impl_is_first_active_thread(); - INLINE uint32_t __kmpc_impl_smid() { return __smid(); } @@ -126,6 +124,8 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { return __ballot64(1); } +EXTERN bool __kmpc_impl_is_first_active_thread(); + EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, int32_t SrcLane); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 6f6c38956a9..8461a93913a 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -94,15 +94,6 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { return res; } -// Return true if this is the first active thread in the warp. -INLINE bool __kmpc_impl_is_first_active_thread() { - unsigned long long Mask = __kmpc_impl_activemask(); - unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); - unsigned long long Sh = Mask << ShNum; - // Truncate Sh to the 32 lower bits - return (unsigned)Sh == 0; -} - INLINE uint32_t __kmpc_impl_smid() { uint32_t id; asm("mov.u32 %0, %%smid;" : "=r"(id)); @@ -142,6 +133,15 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { #endif } +// Return true if this is the first active thread in the warp. +INLINE bool __kmpc_impl_is_first_active_thread() { + unsigned long long Mask = __kmpc_impl_activemask(); + unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); + unsigned long long Sh = Mask << ShNum; + // Truncate Sh to the 32 lower bits + return (unsigned)Sh == 0; +} + // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, |

