diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2019-05-03 20:00:38 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2019-05-03 20:00:38 +0000 |
commit | a857e310115cd21f187c028777c582e3089cf722 (patch) | |
tree | 37d3e0e57a560c48e0fb442f20a99011617529ea | |
parent | e5cbe78259c9b3f181f267ac82bde04dd8a57df8 (diff) | |
download | bcm5719-llvm-a857e310115cd21f187c028777c582e3089cf722.tar.gz bcm5719-llvm-a857e310115cd21f187c028777c582e3089cf722.zip |
[OPENMP][NVPTX]Improve thread limit counter, NFC.
Summary:
Patch improves performance of the full runtime mode by moving
thread-limit counter to the shared memory. It also allows to save
global memory.
Reviewers: grokos, gtbercea, kkwli0
Subscribers: guansong, jdoerfert, caomhin, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D61526
llvm-svn: 359922
5 files changed, 6 insertions, 12 deletions
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu index 681b8194afe..ae6f83f160e 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -70,10 +70,7 @@ EXTERN int omp_get_max_threads(void) { EXTERN int omp_get_thread_limit(void) { if (isSPMDMode()) return GetNumberOfThreadsInBlock(); - // per contention group.. meaning threads in current team - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int rc = currTaskDescr->ThreadLimit(); + int rc = threadLimit; PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); return rc; } diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu index 1c2da4608b5..5d6a1a3557c 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu @@ -32,7 +32,7 @@ __device__ __shared__ uint32_t usedMemIdx; __device__ __shared__ uint32_t usedSlotIdx; __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; - +__device__ __shared__ uint16_t threadLimit; // Pointer to this team's OpenMP state object __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index dc41a800c93..3c9141c65cc 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -74,7 +74,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); - currTaskDescr->ThreadLimit() = ThreadLimit; + threadLimit = ThreadLimit; } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -139,7 +139,6 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); - newTaskDescr->ThreadLimit() = ThreadLimit; // install new top descriptor omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 8ee69de2763..f55caabd153 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -165,7 +165,6 @@ public: INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } // methods for other fields INLINE uint16_t &NThreads() { return items.nthreads; } - INLINE uint16_t &ThreadLimit() { return items.threadlimit; } INLINE uint16_t &ThreadId() { return items.threadId; } INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } @@ -213,7 +212,6 @@ private: uint8_t flags; // 6 bit used (see flag above) uint8_t unused; uint16_t nthreads; // thread num for subsequent parallel regions - uint16_t threadlimit; // thread limit ICV uint16_t threadId; // thread id uint16_t threadsInTeam; // threads in current team uint64_t runtimeChunkSize; // runtime chunk size @@ -408,6 +406,7 @@ extern __device__ __shared__ uint32_t usedMemIdx; extern __device__ __shared__ uint32_t usedSlotIdx; extern __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern __device__ __shared__ uint16_t threadLimit; extern __device__ __shared__ omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 45a6758120d..6bb61b17a69 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -249,9 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, uint16_t &NumThreadsClause = omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, currTaskDescr->NThreads(), - currTaskDescr->ThreadLimit()); + uint16_t NumThreads = determineNumberOfThreads( + NumThreadsClause, currTaskDescr->NThreads(), threadLimit); if (NumThreadsClause != 0) { // Reset request to avoid propagating to successive #parallel |