diff options
| author | JonChesterfield <jonathanchesterfield@gmail.com> | 2019-12-18 19:39:34 +0000 |
|---|---|---|
| committer | Jon Chesterfield <jonathanchesterfield@gmail.com> | 2019-12-18 19:39:35 +0000 |
| commit | 8adae6027c0813df935e4f96067bab0051974910 (patch) | |
| tree | 0fa29c3af757a8e5431505bfe14985dda52764a4 /openmp | |
| parent | 9d38fd8d0be3074af036e9e3e36489f5b854faf4 (diff) | |
| download | bcm5719-llvm-8adae6027c0813df935e4f96067bab0051974910.tar.gz bcm5719-llvm-8adae6027c0813df935e4f96067bab0051974910.zip | |
[libomptarget][nfc] Extract function from data_sharing, move to common
Summary:
[libomptarget][nfc] Extract function from data_sharing, move to common
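Finding the first active thread in the warp is different on nvptx and amdgcn,
mostly due to warp size and the desire for efficiency. The static helper
IsWarpMasterActiveThread is therefore replaced by a per-target
__kmpc_impl_is_first_active_thread declared in each target_impl.h, which allows
data_sharing.cu to move from nvptx/src to common/src.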
Reviewers: ABataev, jdoerfert, grokos
Reviewed By: jdoerfert
Subscribers: jvesely, mgorny, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D71643
Diffstat (limited to 'openmp')
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt | 5 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h | 2 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu (renamed from openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu) | 18 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 4 | ||||
| -rw-r--r-- | openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h | 9 |
5 files changed, 21 insertions, 17 deletions
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt index 9eadbdb1e24..ebea0a049b6 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -57,12 +57,13 @@ get_filename_component(devicertl_base_directory set(cuda_sources ${devicertl_base_directory}/common/src/cancel.cu ${devicertl_base_directory}/common/src/critical.cu - ${devicertl_base_directory}/common/src/loop.cu + ${devicertl_base_directory}/common/src/data_sharing.cu ${devicertl_base_directory}/common/src/libcall.cu - ${devicertl_base_directory}/common/src/reduction.cu + ${devicertl_base_directory}/common/src/loop.cu ${devicertl_base_directory}/common/src/omp_data.cu ${devicertl_base_directory}/common/src/omptarget.cu ${devicertl_base_directory}/common/src/parallel.cu + ${devicertl_base_directory}/common/src/reduction.cu ${devicertl_base_directory}/common/src/sync.cu ${devicertl_base_directory}/common/src/task.cu) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 713a880d9a5..40bbf943aef 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -101,6 +101,8 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { return __lanemask_gt(); } +EXTERN bool __kmpc_impl_is_first_active_thread(); + INLINE uint32_t __kmpc_impl_smid() { return __smid(); } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu index 6549d76def7..c259c770789 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -1,4 +1,4 @@ -//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===// +//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA 
-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,21 +6,13 @@ // //===----------------------------------------------------------------------===// // -// This file contains the implementation of data sharing environments/ +// This file contains the implementation of data sharing environments // //===----------------------------------------------------------------------===// #include "common/omptarget.h" #include "target_impl.h" #include <stdio.h> -// Return true if this is the first active thread in the warp. -INLINE static bool IsWarpMasterActiveThread() { - unsigned long long Mask = __kmpc_impl_activemask(); - unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); - unsigned long long Sh = Mask << ShNum; - // Truncate Sh to the 32 lower bits - return (unsigned)Sh == 0; -} // Return true if this is the master thread. INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); @@ -128,7 +120,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin( DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); // Only the warp active master needs to grow the stack. - if (IsWarpMasterActiveThread()) { + if (__kmpc_impl_is_first_active_thread()) { // Save the current active threads. ActiveT = CurActiveThreads; @@ -229,7 +221,7 @@ EXTERN void __kmpc_data_sharing_environment_end( unsigned WID = GetWarpId(); if (IsEntryPoint) { - if (IsWarpMasterActiveThread()) { + if (__kmpc_impl_is_first_active_thread()) { DSPRINT0(DSFLAG, "Doing clean up\n"); // The master thread cleans the saved slot, because this is an environment @@ -255,7 +247,7 @@ EXTERN void __kmpc_data_sharing_environment_end( // warp diverged and returns in different places). This only works if we // assume that threads will converge right after the call site that started // the environment. 
- if (IsWarpMasterActiveThread()) { + if (__kmpc_impl_is_first_active_thread()) { __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Before restoring the stack\n"); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index d38d766a780..84b52f55b73 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -53,9 +53,8 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) set(cuda_src_files ${devicertl_common_directory}/src/cancel.cu ${devicertl_common_directory}/src/critical.cu - src/data_sharing.cu + ${devicertl_common_directory}/src/data_sharing.cu ${devicertl_common_directory}/src/libcall.cu - src/target_impl.cu ${devicertl_common_directory}/src/loop.cu ${devicertl_common_directory}/src/omptarget.cu ${devicertl_common_directory}/src/parallel.cu @@ -63,6 +62,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) ${devicertl_common_directory}/src/support.cu ${devicertl_common_directory}/src/sync.cu ${devicertl_common_directory}/src/task.cu + src/target_impl.cu ) set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 350d2cf5f2e..6f6c38956a9 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -94,6 +94,15 @@ INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { return res; } +// Return true if this is the first active thread in the warp. 
+INLINE bool __kmpc_impl_is_first_active_thread() { + unsigned long long Mask = __kmpc_impl_activemask(); + unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); + unsigned long long Sh = Mask << ShNum; + // Truncate Sh to the 32 lower bits + return (unsigned)Sh == 0; +} + INLINE uint32_t __kmpc_impl_smid() { uint32_t id; asm("mov.u32 %0, %%smid;" : "=r"(id)); |

