diff options
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/CodeGen/CGDecl.cpp | 29 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntime.cpp | 5 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntime.h | 10 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 604 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 33 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGStmtOpenMP.cpp | 1 | ||||
-rw-r--r-- | clang/lib/CodeGen/CodeGenFunction.cpp | 5 |
7 files changed, 658 insertions, 29 deletions
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 52294c2034a..78acf8d749f 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1068,9 +1068,17 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) { } // A normal fixed sized variable becomes an alloca in the entry block, - // unless it's an NRVO variable. - - if (NRVO) { + // unless: + // - it's an NRVO variable. + // - we are compiling OpenMP and it's an OpenMP local variable. + + Address OpenMPLocalAddr = + getLangOpts().OpenMP + ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D) + : Address::invalid(); + if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) { + address = OpenMPLocalAddr; + } else if (NRVO) { // The named return value optimization: allocate this variable in the // return slot, so that we can elide the copy when returning this // variable (C++0x [class.copy]p34). @@ -1896,9 +1904,18 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, } } } else { - // Otherwise, create a temporary to hold the value. - DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D), - D.getName() + ".addr"); + // Check if the parameter address is controlled by OpenMP runtime. + Address OpenMPLocalAddr = + getLangOpts().OpenMP + ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D) + : Address::invalid(); + if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) { + DeclPtr = OpenMPLocalAddr; + } else { + // Otherwise, create a temporary to hold the value. 
+ DeclPtr = CreateMemTemp(Ty, getContext().getDeclAlign(&D), + D.getName() + ".addr"); + } DoStore = true; } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 708f0abd08d..8c25d206f49 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -8100,6 +8100,11 @@ Address CGOpenMPRuntime::getParameterAddress(CodeGenFunction &CGF, return CGF.GetAddrOfLocalVar(NativeParam); } +Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF, + const VarDecl *VD) { + return Address::invalid(); +} + llvm::Value *CGOpenMPSIMDRuntime::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 179d9b43945..0f18d13296f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -676,7 +676,7 @@ public: /// \brief Cleans up references to the objects in finished function. /// - void functionFinished(CodeGenFunction &CGF); + virtual void functionFinished(CodeGenFunction &CGF); /// \brief Emits code for parallel or serial call of the \a OutlinedFn with /// variables captured in a record which address is stored in \a @@ -1362,6 +1362,14 @@ public: emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> Args = llvm::None) const; + + /// Emits OpenMP-specific function prolog. + /// Required for device constructs. + virtual void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) {} + + /// Gets the OpenMP-specific address of the local variable. + virtual Address getAddressOfLocalVariable(CodeGenFunction &CGF, + const VarDecl *VD); }; /// Class supports emission of SIMD-only code. 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index c0bf6e13b2a..b71965cfb53 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -13,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "CGOpenMPRuntimeNVPTX.h" -#include "clang/AST/DeclOpenMP.h" #include "CodeGenFunction.h" +#include "clang/AST/DeclOpenMP.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/AST/StmtVisitor.h" +#include "llvm/ADT/SmallPtrSet.h" using namespace clang; using namespace CodeGen; @@ -70,7 +72,21 @@ enum OpenMPRTLFunctionNVPTX { /// index, int32_t width, int32_t reduce)) OMPRTL_NVPTX__kmpc_teams_reduce_nowait, /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); - OMPRTL_NVPTX__kmpc_end_reduce_nowait + OMPRTL_NVPTX__kmpc_end_reduce_nowait, + /// \brief Call to void __kmpc_data_sharing_init_stack(); + OMPRTL_NVPTX__kmpc_data_sharing_init_stack, + /// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size, + /// int16_t UseSharedMemory); + OMPRTL_NVPTX__kmpc_data_sharing_push_stack, + /// \brief Call to void __kmpc_data_sharing_pop_stack(void *a); + OMPRTL_NVPTX__kmpc_data_sharing_pop_stack, + /// \brief Call to void __kmpc_begin_sharing_variables(void ***args, + /// size_t n_args); + OMPRTL_NVPTX__kmpc_begin_sharing_variables, + /// \brief Call to void __kmpc_end_sharing_variables(); + OMPRTL_NVPTX__kmpc_end_sharing_variables, + /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs) + OMPRTL_NVPTX__kmpc_get_shared_variables, }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -149,6 +165,245 @@ enum NamedBarrier : unsigned { /// barrier. NB_Parallel = 1, }; + +/// Get the list of variables that can escape their declaration context. 
+class CheckVarsEscapingDeclContext final + : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { + CodeGenFunction &CGF; + llvm::SetVector<const ValueDecl *> EscapedDecls; + llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls; + bool AllEscaped = false; + RecordDecl *GlobalizedRD = nullptr; + llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; + + void markAsEscaped(const ValueDecl *VD) { + if (IgnoredDecls.count(VD) || + (CGF.CapturedStmtInfo && + CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD)))) + return; + EscapedDecls.insert(VD); + } + + void VisitValueDecl(const ValueDecl *VD) { + if (VD->getType()->isLValueReferenceType()) { + markAsEscaped(VD); + if (const auto *VarD = dyn_cast<VarDecl>(VD)) { + if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { + const bool SavedAllEscaped = AllEscaped; + AllEscaped = true; + Visit(VarD->getInit()); + AllEscaped = SavedAllEscaped; + } + } + } + } + void VisitOpenMPCapturedStmt(const CapturedStmt *S) { + if (!S) + return; + for (const auto &C : S->captures()) { + if (C.capturesVariable() && !C.capturesVariableByCopy()) { + const ValueDecl *VD = C.getCapturedVar(); + markAsEscaped(VD); + if (isa<OMPCapturedExprDecl>(VD)) + VisitValueDecl(VD); + } + } + } + + typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy; + static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) { + return P1.first > P2.first; + } + + void buildRecordForGlobalizedVars() { + assert(!GlobalizedRD && + "Record for globalized variables is built already."); + if (EscapedDecls.empty()) + return; + ASTContext &C = CGF.getContext(); + SmallVector<VarsDataTy, 4> GlobalizedVars; + for (const auto *D : EscapedDecls) + GlobalizedVars.emplace_back(C.getDeclAlign(D), D); + std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(), + stable_sort_comparator); + // Build struct _globalized_locals_ty { + // /* globalized vars */ + // }; + GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); 
+ GlobalizedRD->startDefinition(); + for (const auto &Pair : GlobalizedVars) { + const ValueDecl *VD = Pair.second; + QualType Type = VD->getType(); + if (Type->isLValueReferenceType()) + Type = C.getPointerType(Type.getNonReferenceType()); + else + Type = Type.getNonReferenceType(); + SourceLocation Loc = VD->getLocation(); + auto *Field = FieldDecl::Create( + C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, + C.getTrivialTypeSourceInfo(Type, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + GlobalizedRD->addDecl(Field); + if (VD->hasAttrs()) { + for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), + E(VD->getAttrs().end()); + I != E; ++I) + Field->addAttr(*I); + } + MappedDeclsFields.try_emplace(VD, Field); + } + GlobalizedRD->completeDefinition(); + } + +public: + CheckVarsEscapingDeclContext(CodeGenFunction &CGF, + ArrayRef<const ValueDecl *> IgnoredDecls) + : CGF(CGF), IgnoredDecls(IgnoredDecls.begin(), IgnoredDecls.end()) {} + virtual ~CheckVarsEscapingDeclContext() = default; + void VisitDeclStmt(const DeclStmt *S) { + if (!S) + return; + for (const auto *D : S->decls()) + if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) + VisitValueDecl(VD); + } + void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { + if (!D) + return; + if (D->hasAssociatedStmt()) { + if (const auto *S = + dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) + VisitOpenMPCapturedStmt(S); + } + } + void VisitCapturedStmt(const CapturedStmt *S) { + if (!S) + return; + for (const auto &C : S->captures()) { + if (C.capturesVariable() && !C.capturesVariableByCopy()) { + const ValueDecl *VD = C.getCapturedVar(); + markAsEscaped(VD); + if (isa<OMPCapturedExprDecl>(VD)) + VisitValueDecl(VD); + } + } + } + void VisitLambdaExpr(const LambdaExpr *E) { + if (!E) + return; + for (const auto &C : E->captures()) { + if (C.capturesVariable()) { + if (C.getCaptureKind() == LCK_ByRef) { + const 
ValueDecl *VD = C.getCapturedVar(); + markAsEscaped(VD); + if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) + VisitValueDecl(VD); + } + } + } + } + void VisitBlockExpr(const BlockExpr *E) { + if (!E) + return; + for (const auto &C : E->getBlockDecl()->captures()) { + if (C.isByRef()) { + const VarDecl *VD = C.getVariable(); + markAsEscaped(VD); + if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) + VisitValueDecl(VD); + } + } + } + void VisitCallExpr(const CallExpr *E) { + if (!E) + return; + for (const Expr *Arg : E->arguments()) { + if (!Arg) + continue; + if (Arg->isLValue()) { + const bool SavedAllEscaped = AllEscaped; + AllEscaped = true; + Visit(Arg); + AllEscaped = SavedAllEscaped; + } else + Visit(Arg); + } + Visit(E->getCallee()); + } + void VisitDeclRefExpr(const DeclRefExpr *E) { + if (!E) + return; + const ValueDecl *VD = E->getDecl(); + if (AllEscaped) + markAsEscaped(VD); + if (isa<OMPCapturedExprDecl>(VD)) + VisitValueDecl(VD); + else if (const auto *VarD = dyn_cast<VarDecl>(VD)) + if (VarD->isInitCapture()) + VisitValueDecl(VD); + } + void VisitUnaryOperator(const UnaryOperator *E) { + if (!E) + return; + if (E->getOpcode() == UO_AddrOf) { + const bool SavedAllEscaped = AllEscaped; + AllEscaped = true; + Visit(E->getSubExpr()); + AllEscaped = SavedAllEscaped; + } else + Visit(E->getSubExpr()); + } + void VisitImplicitCastExpr(const ImplicitCastExpr *E) { + if (!E) + return; + if (E->getCastKind() == CK_ArrayToPointerDecay) { + const bool SavedAllEscaped = AllEscaped; + AllEscaped = true; + Visit(E->getSubExpr()); + AllEscaped = SavedAllEscaped; + } else + Visit(E->getSubExpr()); + } + void VisitExpr(const Expr *E) { + if (!E) + return; + bool SavedAllEscaped = AllEscaped; + if (!E->isLValue()) + AllEscaped = false; + for (const auto *Child : E->children()) + if (Child) + Visit(Child); + AllEscaped = SavedAllEscaped; + } + void VisitStmt(const Stmt *S) { + if (!S) + return; + for (const auto *Child : S->children()) + if (Child) + 
Visit(Child); + } + + const RecordDecl *getGlobalizedRecord() { + if (!GlobalizedRD) + buildRecordForGlobalizedVars(); + return GlobalizedRD; + } + + const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { + assert(GlobalizedRD && + "Record for globalized variables must be generated already."); + auto I = MappedDeclsFields.find(VD); + if (I == MappedDeclsFields.end()) + return nullptr; + return I->getSecond(); + } + + ArrayRef<const ValueDecl *> getEscapedDecls() const { + return EscapedDecls.getArrayRef(); + } +}; } // anonymous namespace /// Get the GPU warp size. @@ -288,6 +543,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, EntryFunctionState EST; WorkerFunctionState WST(CGM, D.getLocStart()); Work.clear(); + WrapperFunctionsMap.clear(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { @@ -344,6 +600,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); CGF.EmitBlock(MasterBB); + // SEQUENTIAL (MASTER) REGION START // First action in sequential region: // Initialize the state of the OpenMP runtime library on the GPU. // TODO: Optimize runtime initialization and pass in correct value. @@ -351,10 +608,65 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, Bld.getInt16(/*RequiresOMPRuntime=*/1)}; CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); + + // For data sharing, we need to initialize the stack. + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); + + const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); + if (I == FunctionGlobalizedDecls.end()) + return; + const RecordDecl *GlobalizedVarsRecord = I->getSecond().first; + QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); + + // Recover pointer to this function's global record. 
The runtime will + // handle the specifics of the allocation of the memory. + // Use actual memory size of the record including the padding + // for alignment purposes. + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RecTy).getQuantity(); + unsigned GlobalRecordSize = + CGM.getContext().getTypeSizeInChars(RecTy).getQuantity(); + GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + // TODO: allow the usage of shared memory to be controlled by + // the user, for now, default to global. + llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), + CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; + llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack), + GlobalRecordSizeArg); + llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + FunctionToGlobalRecPtr.try_emplace(CGF.CurFn, GlobalRecValue); + + // Emit the "global alloca" which is a GEP from the global declaration record + // using the pointer returned by the runtime. 
+ LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy); + auto &Res = I->getSecond().second; + for (auto &Rec : Res) { + const FieldDecl *FD = Rec.second.first; + LValue VarAddr = CGF.EmitLValueForField(Base, FD); + Rec.second.second = VarAddr.getAddress(); + } } void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { + const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); + if (I != FunctionGlobalizedDecls.end()) { + if (!CGF.HaveInsertPoint()) + return; + auto I = FunctionToGlobalRecPtr.find(CGF.CurFn); + if (I != FunctionToGlobalRecPtr.end()) { + llvm::Value *Args[] = {I->getSecond()}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), + Args); + } + } + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -543,14 +855,13 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, // Execute this outlined function. CGF.EmitBlock(ExecuteFNBB); - // Insert call to work function. - // FIXME: Pass arguments to outlined function from master thread. - auto *Fn = cast<llvm::Function>(W); - Address ZeroAddr = - CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); - CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0)); - llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()}; - emitCall(CGF, WST.Loc, Fn, FnArgs); + // Insert call to work function via shared wrapper. The shared + // wrapper takes two arguments: + // - the parallelism level; + // - the master thread ID; + emitOutlinedFunctionCall(CGF, WST.Loc, W, + {Bld.getInt16(/*ParallelLevel=*/0), + getMasterThreadID(CGF)}); // Go to end of parallel region. 
CGF.EmitBranch(TerminateBB); @@ -619,8 +930,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { /// Build void __kmpc_kernel_prepare_parallel( /// void *outlined_function, int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int8PtrTy, - CGM.Int16Ty}; + llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); @@ -758,6 +1068,56 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); break; } + case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { + /// Build void __kmpc_data_sharing_init_stack(); + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); + break; + } + case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: { + // Build void *__kmpc_data_sharing_push_stack(size_t size, + // int16_t UseSharedMemory); + llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_data_sharing_push_stack"); + break; + } + case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: { + // Build void __kmpc_data_sharing_pop_stack(void *a); + llvm::Type *TypeParams[] = {CGM.VoidPtrTy}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, + /*Name=*/"__kmpc_data_sharing_pop_stack"); + break; + } + case OMPRTL_NVPTX__kmpc_begin_sharing_variables: { + /// Build void __kmpc_begin_sharing_variables(void ***args, + /// size_t n_args); + llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy}; + 
llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables"); + break; + } + case OMPRTL_NVPTX__kmpc_end_sharing_variables: { + /// Build void __kmpc_end_sharing_variables(); + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables"); + break; + } + case OMPRTL_NVPTX__kmpc_get_shared_variables: { + /// Build void __kmpc_get_shared_variables(void ***GlobalArgs); + llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); + break; + } } return RTLFn; } @@ -847,8 +1207,16 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { - return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar, - InnermostKind, CodeGen); + auto *OutlinedFun = + cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( + D, ThreadIDVar, InnermostKind, CodeGen)); + if (!isInSpmdExecutionMode()) { + llvm::Function *WrapperFun = + createParallelDataSharingWrapper(OutlinedFun, D); + WrapperFunctionsMap[OutlinedFun] = WrapperFun; + } + + return OutlinedFun; } llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( @@ -900,16 +1268,58 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { llvm::Function *Fn = cast<llvm::Function>(OutlinedFn); + llvm::Function *WFn = WrapperFunctionsMap[Fn]; + + assert(WFn && "Wrapper function does not exist!"); 
+ + // Force inline this outlined function at its call site. + Fn->setLinkage(llvm::GlobalValue::InternalLinkage); - auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF, + PrePostActionTy &) { CGBuilderTy &Bld = CGF.Builder; - // TODO: Optimize runtime initialization. - llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy), - /*RequiresOMPRuntime=*/Bld.getInt16(1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); + llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); + + // Prepare for parallel region. Indicate the outlined function. + llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)}; + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), + Args); + + // Create a private scope that will globalize the arguments + // passed from the outside of the target region. + CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); + + // There's something to share. + if (!CapturedVars.empty()) { + // Prepare for parallel region. Indicate the outlined function. + Address SharedArgs = + CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs"); + llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); + + llvm::Value *DataSharingArgs[] = { + SharedArgsPtr, + llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_begin_sharing_variables), + DataSharingArgs); + + // Store variable address in a list of references to pass to workers. 
+ unsigned Idx = 0; + ASTContext &Ctx = CGF.getContext(); + Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs, + Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy)) + .castAs<PointerType>()); + for (llvm::Value *V : CapturedVars) { + Address Dst = Bld.CreateConstInBoundsGEP( + SharedArgListAddress, Idx, CGF.getPointerSize()); + llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy); + CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, + Ctx.getPointerType(Ctx.VoidPtrTy)); + Idx++; + } + } // Activate workers. This barrier is used by the master to signal // work for the workers. @@ -923,8 +1333,12 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( // The master waits at this barrier until all workers are done. syncCTAThreads(CGF); + if (!CapturedVars.empty()) + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables)); + // Remember for post-processing in worker loop. - Work.emplace_back(Fn); + Work.emplace_back(WFn); }; auto *RTLoc = emitUpdateLocation(CGF, Loc); @@ -2343,3 +2757,149 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall( } CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); } + +/// Emit function which wraps the outlined parallel region +/// and controls the arguments which are passed to this function. +/// The wrapper ensures that the outlined function is called +/// with the correct arguments when data is shared. +llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( + llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { + ASTContext &Ctx = CGM.getContext(); + const auto &CS = *D.getCapturedStmt(OMPD_parallel); + + // Create a function that takes as argument the source thread. 
+ FunctionArgList WrapperArgs; + QualType Int16QTy = + Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); + QualType Int32QTy = + Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); + ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(), + /*Id=*/nullptr, Int16QTy, + ImplicitParamDecl::Other); + ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(), + /*Id=*/nullptr, Int32QTy, + ImplicitParamDecl::Other); + WrapperArgs.emplace_back(&ParallelLevelArg); + WrapperArgs.emplace_back(&WrapperArg); + + auto &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); + + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/GlobalDecl(), Fn, CGFI); + Fn->setLinkage(llvm::GlobalValue::InternalLinkage); + + CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, + D.getLocStart(), D.getLocStart()); + + const auto *RD = CS.getCapturedRecordDecl(); + auto CurField = RD->field_begin(); + + // Get the array of arguments. + SmallVector<llvm::Value *, 8> Args; + + // TODO: support SIMD and pass actual values + Args.emplace_back( + llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); + Args.emplace_back( + llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); + + CGBuilderTy &Bld = CGF.Builder; + auto CI = CS.capture_begin(); + + // Use global memory for data sharing. + // Handle passing of global args to workers. 
+ Address GlobalArgs = + CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); + llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); + llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables), + DataSharingArgs); + + // Retrieve the shared variables from the list of references returned + // by the runtime. Pass the variables to the outlined function. + if (CS.capture_size() > 0) { + ASTContext &CGFContext = CGF.getContext(); + Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs, + CGFContext + .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy)) + .castAs<PointerType>()); + for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { + QualType ElemTy = CurField->getType(); + Address Src = Bld.CreateConstInBoundsGEP( + SharedArgListAddress, I, CGF.getPointerSize()); + Address TypedAddress = Bld.CreateBitCast( + Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy))); + llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, + /*Volatile=*/false, + CGFContext.getPointerType(ElemTy), + CI->getLocation()); + Args.emplace_back(Arg); + } + } + + emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args); + CGF.FinishFunction(); + return Fn; +} + +void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, + const Decl *D) { + assert(D && "Expected function or captured|block decl."); + assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && + "Function is registered already."); + SmallVector<const ValueDecl *, 4> IgnoredDecls; + const Stmt *Body = nullptr; + if (const auto *FD = dyn_cast<FunctionDecl>(D)) { + Body = FD->getBody(); + } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { + Body = BD->getBody(); + } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { + Body = CD->getBody(); + if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) { + if (const auto *CS = dyn_cast<CapturedStmt>(Body)) { + 
IgnoredDecls.reserve(CS->capture_size()); + for (const auto &Capture : CS->captures()) + if (Capture.capturesVariable()) + IgnoredDecls.emplace_back(Capture.getCapturedVar()); + } + } + } + if (!Body) + return; + CheckVarsEscapingDeclContext VarChecker(CGF, IgnoredDecls); + VarChecker.Visit(Body); + const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord(); + if (!GlobalizedVarsRecord) + return; + auto &Res = + FunctionGlobalizedDecls + .try_emplace(CGF.CurFn, GlobalizedVarsRecord, DeclToAddrMapTy()) + .first->getSecond() + .second; + for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { + const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); + Res.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid()))); + } +} + +Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, + const VarDecl *VD) { + auto I = FunctionGlobalizedDecls.find(CGF.CurFn); + if (I == FunctionGlobalizedDecls.end()) + return Address::invalid(); + auto VDI = I->getSecond().second.find(VD); + if (VDI == I->getSecond().second.end()) + return Address::invalid(); + return VDI->second.second; +} + +void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) { + FunctionToGlobalRecPtr.erase(CGF.CurFn); + FunctionGlobalizedDecls.erase(CGF.CurFn); + CGOpenMPRuntime::functionFinished(CGF); +} diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index 6723c3ac4bb..ee1d727a573 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -289,6 +289,14 @@ public: CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> Args = llvm::None) const override; + /// Emits OpenMP-specific function prolog. + /// Required for device constructs. + void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override; + + /// Gets the OpenMP-specific address of the local variable. 
+ Address getAddressOfLocalVariable(CodeGenFunction &CGF, + const VarDecl *VD) override; + /// Target codegen is specialized based on two programming models: the /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd' /// model for constructs like 'target parallel' that support it. @@ -300,12 +308,37 @@ public: Unknown, }; + /// Cleans up references to the objects in finished function. + /// + void functionFinished(CodeGenFunction &CGF) override; + private: // Track the execution mode when codegening directives within a target // region. The appropriate mode (generic/spmd) is set on entry to the // target region and used by containing directives such as 'parallel' // to emit optimized code. ExecutionMode CurrentExecutionMode; + + /// Map between an outlined function and its wrapper. + llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap; + + /// Emit function which wraps the outlined parallel region + /// and controls the parameters which are passed to this function. + /// The wrapper ensures that the outlined function is called + /// with the correct arguments when data is shared. + llvm::Function *createParallelDataSharingWrapper( + llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D); + + /// The map of local variables to their addresses in the global memory. + using DeclToAddrMapTy = llvm::MapVector<const Decl *, + std::pair<const FieldDecl *, Address>>; + /// Maps the function to the list of the globalized variables with their + /// addresses. + llvm::DenseMap<llvm::Function *, + std::pair<const RecordDecl *, DeclToAddrMapTy>> + FunctionGlobalizedDecls; + /// Map from function to global record pointer. + llvm::DenseMap<llvm::Function *, llvm::Value *> FunctionToGlobalRecPtr; }; } // CodeGen namespace. 
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 957bcc43db0..7d2093bdf60 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -585,6 +585,7 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) { /*RegisterCastedArgsOnly=*/true, CapturedStmtInfo->getHelperName()); CodeGenFunction WrapperCGF(CGM, /*suppressNewContext=*/true); + WrapperCGF.CapturedStmtInfo = CapturedStmtInfo; Args.clear(); LocalAddrs.clear(); VLASizes.clear(); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 2f814a7165a..c5de16bc6eb 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1067,6 +1067,11 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, EmitStartEHSpec(CurCodeDecl); PrologueCleanupDepth = EHStack.stable_begin(); + + // Emit OpenMP specific initialization of the device functions. + if (getLangOpts().OpenMP && CurCodeDecl) + CGM.getOpenMPRuntime().emitFunctionProlog(*this, CurCodeDecl); + EmitFunctionProlog(*CurFnInfo, CurFn, Args); if (D && isa<CXXMethodDecl>(D) && cast<CXXMethodDecl>(D)->isInstance()) { |