diff options
Diffstat (limited to 'clang/lib')
| -rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 265 | ||||
| -rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 18 | ||||
| -rw-r--r-- | clang/lib/Driver/ToolChains/Clang.cpp | 2 | ||||
| -rw-r--r-- | clang/lib/Frontend/CompilerInvocation.cpp | 8 |
4 files changed, 285 insertions, 8 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index e93b73bbccd..8daaf4cff5a 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -17,6 +17,7 @@ #include "clang/AST/DeclOpenMP.h" #include "clang/AST/StmtOpenMP.h" #include "clang/AST/StmtVisitor.h" +#include "clang/Basic/Cuda.h" #include "llvm/ADT/SmallPtrSet.h" using namespace clang; @@ -100,6 +101,11 @@ enum OpenMPRTLFunctionNVPTX { OMPRTL_NVPTX__kmpc_parallel_level, /// Call to int8_t __kmpc_is_spmd_exec_mode(); OMPRTL_NVPTX__kmpc_is_spmd_exec_mode, + /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size, + /// int16_t is_shared, const void **res); + OMPRTL_NVPTX__kmpc_get_team_static_memory, + /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared); + OMPRTL_NVPTX__kmpc_restore_team_static_memory, }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -1120,19 +1126,39 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) : EST(EST), WST(WST) {} void Enter(CodeGenFunction &CGF) override { - auto &RT = static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); + auto &RT = + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); RT.emitNonSPMDEntryHeader(CGF, EST, WST); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { - auto &RT = static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); + auto &RT = + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()); RT.clearLocThreadIdInsertPt(CGF); RT.emitNonSPMDEntryFooter(CGF, EST); } } Action(EST, WST); CodeGen.setAction(Action); IsInTTDRegion = true; + // Reserve place for the globalized memory. + GlobalizedRecords.emplace_back(); + if (!StaticGlobalized) { + StaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, nullptr, + "_openmp_static_glob_rd$ptr"); + StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + } + if (!KernelStaticGlobalized) { + KernelStaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, + llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(CGM.VoidPtrTy), + "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, + llvm::GlobalValue::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); + } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1249,6 +1275,24 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D, } Action(*this, EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; + // Reserve place for the globalized memory. + GlobalizedRecords.emplace_back(); + if (!StaticGlobalized) { + StaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, nullptr, + "_openmp_static_glob_rd$ptr"); + StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + } + if (!KernelStaticGlobalized) { + KernelStaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, + llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(CGM.VoidPtrTy), + "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, + llvm::GlobalValue::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); + } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1758,6 +1802,24 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode"); break; } + case OMPRTL_NVPTX__kmpc_get_team_static_memory: { + // Build void __kmpc_get_team_static_memory(const void *buf, size_t size, + // int16_t is_shared, const void **res); + llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty, + CGM.VoidPtrPtrTy}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); + break; + } + case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { + // Build void __kmpc_restore_team_static_memory(int16_t is_shared); + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false); + RTLFn = + CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory"); + break; + } } return RTLFn; } @@ -1995,8 +2057,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); llvm::Value *GlobalRecCastAddr; llvm::Value *IsTTD = nullptr; - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + if (!IsInTTDRegion && + (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); @@ -2056,6 +2119,63 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, GlobalRecCastAddr = Phi; I->getSecond().GlobalRecordAddr = Phi; I->getSecond().IsInSPMDModeFlag = IsSPMD; + } else if (IsInTTDRegion) { + assert(GlobalizedRecords.back().Records.size() < 2 && + "Expected less than 2 globalized records: one for target and one " + "for teams."); + unsigned Offset = 0; + for (const RecordDecl *RD : GlobalizedRecords.back().Records) { + QualType RDTy = CGM.getContext().getRecordType(RD); + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); + unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); + Offset = + llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); + } + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); + Offset = llvm::alignTo(Offset, Alignment); + GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); + ++GlobalizedRecords.back().RegionCounter; + if (GlobalizedRecords.back().Records.size() == 1) { + assert(StaticGlobalized && + "Static pointer must be initialized already."); + Address Buffer = CGF.EmitLoadOfPointer( + Address(StaticGlobalized, CGM.getPointerAlign()), + CGM.getContext() + .getPointerType(CGM.getContext().VoidPtrTy) + .castAs<PointerType>()); + auto *RecSize = new llvm::GlobalVariable( + CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, nullptr, + "_openmp_static_kernel$size"); + RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + llvm::Value *Ld = CGF.EmitLoadOfScalar( + Address(RecSize, CGM.getPointerAlign()), /*Volatile=*/false, + CGM.getContext().getSizeType(), Loc); + llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + KernelStaticGlobalized, CGM.VoidPtrPtrTy); + llvm::Value *GlobalRecordSizeArg[] = { + Buffer.getPointer(), Ld, + llvm::ConstantInt::getNullValue(CGM.Int16Ty), ResAddr}; + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_get_team_static_memory), + GlobalRecordSizeArg); + GlobalizedRecords.back().RecSize = RecSize; + } + assert(KernelStaticGlobalized && "Global address must be set already."); + Address FrameAddr = CGF.EmitLoadOfPointer( + Address(KernelStaticGlobalized, CGM.getPointerAlign()), + CGM.getContext() + .getPointerType(CGM.getContext().VoidPtrTy) + .castAs<PointerType>()); + llvm::Value *GlobalRecValue = + Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One()) + .getPointer(); + I->getSecond().GlobalRecordAddr = GlobalRecValue; + I->getSecond().IsInSPMDModeFlag = nullptr; + GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); } else { // TODO: allow the usage of shared memory to be controlled by // the user, for now, default to global. @@ -2112,8 +2232,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, AlignmentSource::Decl); } Rec.second.PrivateAddr = VarAddr.getAddress(); - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + if (!IsInTTDRegion && + (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { assert(I->getSecond().IsInSPMDModeFlag && "Expected unknown execution mode or required SPMD check."); if (IsTTD) { @@ -2193,8 +2314,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, Addr); } if (I->getSecond().GlobalRecordAddr) { - if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + if (!IsInTTDRegion && + (WithSPMDCheck || + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { CGBuilderTy &Bld = CGF.Builder; llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); @@ -2207,6 +2329,17 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); CGF.EmitBlock(ExitBB); + } else if (IsInTTDRegion) { + assert(GlobalizedRecords.back().RegionCounter > 0 && + "region counter must be > 0."); + --GlobalizedRecords.back().RegionCounter; + // Emit the restore function only in the target region. + if (GlobalizedRecords.back().RegionCounter == 0) { + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_restore_team_static_memory), + llvm::ConstantInt::getNullValue(CGM.Int16Ty)); + } } else { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), @@ -4308,3 +4441,119 @@ void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas( } } +/// Get number of SMs and number of blocks per SM. +static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) { + std::pair<unsigned, unsigned> Data; + if (CGM.getLangOpts().OpenMPCUDANumSMs) + Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; + if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) + Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; + if (Data.first && Data.second) + return Data; + if (CGM.getTarget().hasFeature("ptx")) { + llvm::StringMap<bool> Features; + CGM.getTarget().initFeatureMap(Features, CGM.getDiags(), + CGM.getTarget().getTargetOpts().CPU, + CGM.getTarget().getTargetOpts().Features); + for (const auto &Feature : Features) { + if (Feature.getValue()) { + switch (StringToCudaArch(Feature.getKey())) { + case CudaArch::SM_20: + case CudaArch::SM_21: + case CudaArch::SM_30: + case CudaArch::SM_32: + case CudaArch::SM_35: + case CudaArch::SM_37: + case CudaArch::SM_50: + case CudaArch::SM_52: + case CudaArch::SM_53: + return {16, 16}; + case CudaArch::SM_60: + case CudaArch::SM_61: + case CudaArch::SM_62: + return {56, 32}; + case CudaArch::SM_70: + case CudaArch::SM_72: + case CudaArch::SM_75: + return {84, 32}; + case CudaArch::GFX600: + case CudaArch::GFX601: + case CudaArch::GFX700: + case CudaArch::GFX701: + case CudaArch::GFX702: + case CudaArch::GFX703: + case CudaArch::GFX704: + case CudaArch::GFX801: + case CudaArch::GFX802: + case CudaArch::GFX803: + case CudaArch::GFX810: + case CudaArch::GFX900: + case CudaArch::GFX902: + case CudaArch::GFX904: + case CudaArch::GFX906: + case CudaArch::GFX909: + case CudaArch::UNKNOWN: + break; + case CudaArch::LAST: + llvm_unreachable("Unexpected Cuda arch."); + } + } + } + } + llvm_unreachable("Unexpected NVPTX target without ptx feature."); +} + +void CGOpenMPRuntimeNVPTX::clear() { + if (!GlobalizedRecords.empty()) { + ASTContext &C = CGM.getContext(); + RecordDecl *StaticRD = C.buildImplicitRecord( + "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); + StaticRD->startDefinition(); + for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { + if (Records.Records.empty()) + continue; + unsigned Size = 0; + unsigned RecAlignment = 0; + for (const RecordDecl *RD : Records.Records) { + QualType RDTy = CGM.getContext().getRecordType(RD); + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); + RecAlignment = std::max(RecAlignment, Alignment); + unsigned RecSize = + CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); + Size = + llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); + } + Size = llvm::alignTo(Size, RecAlignment); + llvm::APInt ArySize(/*numBits=*/64, Size); + QualType SubTy = C.getConstantArrayType( + C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0); + auto *Field = FieldDecl::Create( + C, StaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, + C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + StaticRD->addDecl(Field); + Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); + } + StaticRD->completeDefinition(); + QualType StaticTy = C.getRecordType(StaticRD); + std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); + llvm::APInt Size1(32, SMsBlockPerSM.second); + QualType Arr1Ty = C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal, + /*IndexTypeQuals=*/0); + llvm::APInt Size2(32, SMsBlockPerSM.first); + QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal, + /*IndexTypeQuals=*/0); + llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), LLVMArr2Ty, + /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_"); + StaticGlobalized->setInitializer( + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, + CGM.VoidPtrTy)); + } + CGOpenMPRuntime::clear(); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index b0b66d86320..dabc6b4f85d 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -182,6 +182,7 @@ protected: public: explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM); + void clear() override; /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. @@ -413,6 +414,23 @@ private: /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls; + /// List of records for the globalized variables in target/teams/distribute + /// contexts. Inner records are going to be joined into the single record, + /// while those resulting records are going to be joined into the single + /// union. This resulting union (one per CU) is the entry point for the static + /// memory management runtime functions. + struct GlobalPtrSizeRecsTy { + llvm::GlobalVariable *RecSize = nullptr; + llvm::SmallVector<const RecordDecl *, 2> Records; + unsigned RegionCounter = 0; + }; + llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords; + /// Global variable used for staticlly allocated global memoryused for + /// globalization in target/teams/distribute regions. + llvm::GlobalVariable *StaticGlobalized = nullptr; + /// Shared pointer for the global memory in the global memory buffer used for + /// the given kernel. + llvm::GlobalVariable *KernelStaticGlobalized = nullptr; }; } // CodeGen namespace. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a8ddd8adc32..41681e7314e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4298,6 +4298,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fopenmp_simd, options::OPT_fno_openmp_simd); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ); + Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ); + Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); // When in OpenMP offloading mode with NVPTX target, forward // cuda-mode flag diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 912afba340b..d78c1332dc7 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2686,6 +2686,14 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, Opts.Exceptions = 0; Opts.CXXExceptions = 0; } + if (Opts.OpenMPIsDevice && T.isNVPTX()) { + Opts.OpenMPCUDANumSMs = + getLastArgIntValue(Args, options::OPT_fopenmp_cuda_number_of_sm_EQ, + Opts.OpenMPCUDANumSMs, Diags); + Opts.OpenMPCUDABlocksPerSM = + getLastArgIntValue(Args, options::OPT_fopenmp_cuda_blocks_per_sm_EQ, + Opts.OpenMPCUDABlocksPerSM, Diags); + } // Get the OpenMP target triples if any. if (Arg *A = Args.getLastArg(options::OPT_fopenmp_targets_EQ)) { |

