diff options
author | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-12-17 01:30:04 +0000 |
---|---|---|
committer | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-12-17 01:30:04 +0000 |
commit | 53bcd1e1413c878d2d988df80142a430a9abf24a (patch) | |
tree | e9d898437f380733c880212edacba2be66368def /openmp | |
parent | 7a31678b71465da7e6a69bb3d1987823f41589d6 (diff) | |
download | bcm5719-llvm-53bcd1e1413c878d2d988df80142a430a9abf24a.tar.gz bcm5719-llvm-53bcd1e1413c878d2d988df80142a430a9abf24a.zip |
[libomptarget][nfc] Wrap cuda min() in target_impl
Summary:
[libomptarget][nfc] Wrap cuda min() in target_impl
nvptx forwards to CUDA's min(); amdgcn implements it directly.
This is sufficient to build parallel.cu for amdgcn, so it is added to the amdgcn CMakeLists.txt.
All call sites are homogeneous except one that passes a uint32_t and an
int32_t. This could be smoothed over by taking two type parameters
and taking some care over the return type, but overall I think the inline
uint32_t(...) cast, which calls attention to what was an implicit sign
conversion, is cleaner.
Reviewers: ABataev, jdoerfert
Reviewed By: jdoerfert
Subscribers: jvesely, mgorny, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D71580
Diffstat (limited to 'openmp')
5 files changed, 14 insertions, 5 deletions
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt index 671508aac41..802ab0b42cc 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -59,6 +59,7 @@ set(cuda_sources ${devicertl_base_directory}/common/src/critical.cu ${devicertl_base_directory}/common/src/loop.cu ${devicertl_base_directory}/common/src/omptarget.cu + ${devicertl_base_directory}/common/src/parallel.cu ${devicertl_base_directory}/common/src/sync.cu ${devicertl_base_directory}/common/src/task.cu) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 5082d469d05..858a023eb8d 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -109,6 +109,10 @@ INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } +template <typename T> INLINE T __kmpc_impl_min(T x, T y) { + return x < y ? x : y; +} + INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { return __ballot64(1); } diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu index 4934621de58..2a02c69e7e8 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -72,7 +72,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, // We cannot have more than the # of convergent threads. 
if (SimdLimitSource > 0) - *NumLanes = min(ConvergentSize, SimdLimitSource); + *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource); else *NumLanes = ConvergentSize; ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", @@ -149,7 +149,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, // We cannot have more than the # of convergent threads. uint16_t NumThreads; if (NumThreadsSource > 0) - NumThreads = min(ConvergentSize, NumThreadsSource); + NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource); else NumThreads = ConvergentSize; ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu index cfccf78c377..fa9c130c0fc 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -480,14 +480,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // by returning 1 in the thread holding the reduction result. // Check if this is the very last team. - unsigned NumRecs = min(NumTeams, num_of_records); + unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); if (ChunkTeamCount == NumTeams - Bound - 1) { // // Last team processing. // if (ThreadId >= NumRecs) return 0; - NumThreads = roundToWarpsize(min(NumThreads, NumRecs)); + NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); if (ThreadId >= NumThreads) return 0; @@ -502,7 +502,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( // When we have more than [warpsize] number of threads // a block reduction is performed here. 
- uint32_t ActiveThreads = min(NumRecs, NumThreads); + uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); if (ActiveThreads > WARPSIZE) { uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; // Gather all the reduced values from each warp diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 161cd6cac11..4bb66776a2a 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -104,6 +104,10 @@ INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } +template <typename T> INLINE T __kmpc_impl_min(T x, T y) { + return min(x, y); +} + #ifndef CUDA_VERSION #error CUDA_VERSION macro is undefined, something wrong with cuda. #endif |