diff options
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 73 |
1 files changed, 43 insertions, 30 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 97b8f79a9f9..21911c96f36 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -56,12 +56,12 @@ enum OpenMPRTLFunctionNVPTX { /// Call to int64_t __kmpc_shuffle_int64(int64_t element, /// int16_t lane_offset, int16_t warp_size); OMPRTL_NVPTX__kmpc_shuffle_int64, - /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 + /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32 /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t /// lane_offset, int16_t shortCircuit), /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); - OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2, /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32 /// global_tid, kmp_critical_name *lck) OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple, @@ -91,10 +91,11 @@ enum OpenMPRTLFunctionNVPTX { OMPRTL_NVPTX__kmpc_parallel_level, /// Call to int8_t __kmpc_is_spmd_exec_mode(); OMPRTL_NVPTX__kmpc_is_spmd_exec_mode, - /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size, - /// int16_t is_shared, const void **res); + /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + /// const void *buf, size_t size, int16_t is_shared, const void **res); OMPRTL_NVPTX__kmpc_get_team_static_memory, - /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared); + /// Call to void __kmpc_restore_team_static_memory(int16_t + /// isSPMDExecutionMode, int16_t is_shared); OMPRTL_NVPTX__kmpc_restore_team_static_memory, /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); OMPRTL__kmpc_barrier, @@ -1646,12 +1647,12 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); break; } - case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: { - // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid, - // kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t Algorithm Version), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); + case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: { + // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, + // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void* + // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t + // lane_id, int16_t lane_offset, int16_t Algorithm Version), void + // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, CGM.Int16Ty, CGM.Int16Ty}; auto *ShuffleReduceFnTy = @@ -1661,7 +1662,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { auto *InterWarpCopyFnTy = llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, /*isVarArg=*/false); - llvm::Type *TypeParams[] = {CGM.Int32Ty, + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), + CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy, CGM.VoidPtrTy, @@ -1670,7 +1672,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { auto *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); + FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2"); break; } case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { @@ -1779,19 +1781,21 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { break; } case OMPRTL_NVPTX__kmpc_get_team_static_memory: { - // Build void __kmpc_get_team_static_memory(const void *buf, size_t size, - // int16_t is_shared, const void **res); - llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty, - CGM.VoidPtrPtrTy}; + // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + // const void *buf, size_t size, int16_t is_shared, const void **res); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy, + CGM.Int16Ty, CGM.VoidPtrPtrTy}; auto *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); break; } case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { - // Build void __kmpc_restore_team_static_memory(int16_t is_shared); + // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + // int16_t is_shared); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory"); break; @@ -2211,8 +2215,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGM.getContext().getSizeType(), Loc); llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld, - IsInSharedMemory, ResAddr}; + llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get( + CGM.Int16Ty, + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0), + StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_get_team_static_memory), GlobalRecordSizeArg); @@ -2400,10 +2407,15 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, Address(GlobalizedRecords.back().UseSharedMemory, CGM.getContext().getTypeAlignInChars(Int16Ty)), /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); + llvm::Value *Args[] = { + llvm::ConstantInt::get( + CGM.Int16Ty, + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0), + IsInSharedMemory}; CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_restore_team_static_memory), - IsInSharedMemory); + Args); } } else { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( @@ -3608,7 +3620,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// 3. Call the OpenMP runtime on the GPU to reduce within a team /// and store the result on the team master: /// -/// __kmpc_nvptx_parallel_reduce_nowait(..., +/// __kmpc_nvptx_parallel_reduce_nowait_v2(..., /// reduceData, shuffleReduceFn, interWarpCpyFn) /// /// where: @@ -3779,7 +3791,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// Intra-Team Reduction /// /// This function, as implemented in the runtime call -/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP +/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP /// threads in a team. It first reduces within a warp using the /// aforementioned algorithms. We then proceed to gather all such /// reduced values at the first warp. @@ -3802,7 +3814,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., /// the k'th worker reduces every k'th element. /// -/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to +/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to /// reduce across workers and compute a globally reduced value. /// void CGOpenMPRuntimeNVPTX::emitReduction( @@ -3832,6 +3844,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction( // RedList, shuffle_reduce_func, interwarp_copy_func); // or // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); llvm::Value *ThreadId = getThreadID(CGF, Loc); llvm::Value *Res; @@ -3886,19 +3899,19 @@ void CGOpenMPRuntimeNVPTX::emitReduction( llvm::Value *InterWarpCopyFn = emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - llvm::Value *Args[] = {ThreadId, + llvm::Value *Args[] = {RTLoc, + ThreadId, CGF.Builder.getInt32(RHSExprs.size()), ReductionArrayTySize, RL, ShuffleAndReduceFn, InterWarpCopyFn}; - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), - Args); + Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2), + Args); } else { assert(TeamsReduction && "expected teams reduction."); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); std::string Name = getName({"reduction"}); llvm::Value *Lock = getCriticalRegionLock(Name); llvm::Value *Args[] = {RTLoc, ThreadId, Lock}; |