diff options
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 219 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 21 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGStmtOpenMP.cpp | 51 | ||||
-rw-r--r-- | clang/test/OpenMP/declare_target_codegen_globalization.cpp | 44 | ||||
-rw-r--r-- | clang/test/OpenMP/nvptx_teams_codegen.cpp | 24 |
5 files changed, 269 insertions, 90 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index b71965cfb53..637a86b4553 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -171,16 +171,23 @@ class CheckVarsEscapingDeclContext final : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { CodeGenFunction &CGF; llvm::SetVector<const ValueDecl *> EscapedDecls; + llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls; bool AllEscaped = false; RecordDecl *GlobalizedRD = nullptr; llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; void markAsEscaped(const ValueDecl *VD) { - if (IgnoredDecls.count(VD) || - (CGF.CapturedStmtInfo && - CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD)))) + if (IgnoredDecls.count(VD)) return; + // Variables captured by value must be globalized. + if (auto *CSI = CGF.CapturedStmtInfo) { + if (const FieldDecl *FD = CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD))) { + if (FD->getType()->isReferenceType()) + return; + EscapedParameters.insert(VD); + } + } EscapedDecls.insert(VD); } @@ -385,12 +392,15 @@ public: Visit(Child); } + /// Returns the record that handles all the escaped local variables and used + /// instead of their original storage. const RecordDecl *getGlobalizedRecord() { if (!GlobalizedRD) buildRecordForGlobalizedVars(); return GlobalizedRD; } + /// Returns the field in the globalized record for the escaped variable. const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { assert(GlobalizedRD && "Record for globalized variables must be generated already."); @@ -400,9 +410,16 @@ public: return I->getSecond(); } + /// Returns the list of the escaped local variables/parameters. ArrayRef<const ValueDecl *> getEscapedDecls() const { return EscapedDecls.getArrayRef(); } + + /// Checks if the escaped local variable is actually a parameter passed by + /// value. + const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { + return EscapedParameters; + } }; } // anonymous namespace @@ -614,58 +631,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); - const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); - if (I == FunctionGlobalizedDecls.end()) - return; - const RecordDecl *GlobalizedVarsRecord = I->getSecond().first; - QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); - - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(RecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack), - GlobalRecordSizeArg); - llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); - FunctionToGlobalRecPtr.try_emplace(CGF.CurFn, GlobalRecValue); - - // Emit the "global alloca" which is a GEP from the global declaration record - // using the pointer returned by the runtime. - LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy); - auto &Res = I->getSecond().second; - for (auto &Rec : Res) { - const FieldDecl *FD = Rec.second.first; - LValue VarAddr = CGF.EmitLValueForField(Base, FD); - Rec.second.second = VarAddr.getAddress(); - } + emitGenericVarsProlog(CGF, WST.Loc); } void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { - const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); - if (I != FunctionGlobalizedDecls.end()) { - if (!CGF.HaveInsertPoint()) - return; - auto I = FunctionToGlobalRecPtr.find(CGF.CurFn); - if (I != FunctionToGlobalRecPtr.end()) { - llvm::Value *Args[] = {I->getSecond()}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - Args); - } - } + emitGenericVarsEpilog(CGF); + if (!CGF.HaveInsertPoint()) + return; if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -1207,6 +1180,24 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + SourceLocation Loc = D.getLocStart(); + + // Emit target region as a standalone region. + class NVPTXPrePostActionTy : public PrePostActionTy { + SourceLocation &Loc; + + public: + NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {} + void Enter(CodeGenFunction &CGF) override { + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) + .emitGenericVarsProlog(CGF, Loc); + } + void Exit(CodeGenFunction &CGF) override { + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) + .emitGenericVarsEpilog(CGF); + } + } Action(Loc); + CodeGen.setAction(Action); auto *OutlinedFun = cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen)); @@ -1222,7 +1213,24 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { + SourceLocation Loc = D.getLocStart(); + // Emit target region as a standalone region. + class NVPTXPrePostActionTy : public PrePostActionTy { + SourceLocation &Loc; + + public: + NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {} + void Enter(CodeGenFunction &CGF) override { + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) + .emitGenericVarsProlog(CGF, Loc); + } + void Exit(CodeGenFunction &CGF) override { + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) + .emitGenericVarsEpilog(CGF); + } + } Action(Loc); + CodeGen.setAction(Action); llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen); llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal); @@ -1233,6 +1241,73 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( return OutlinedFun; } +void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, + SourceLocation Loc) { + CGBuilderTy &Bld = CGF.Builder; + + const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); + if (I == FunctionGlobalizedDecls.end()) + return; + const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord; + QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); + + // Recover pointer to this function's global record. The runtime will + // handle the specifics of the allocation of the memory. + // Use actual memory size of the record including the padding + // for alignment purposes. + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(RecTy).getQuantity(); + unsigned GlobalRecordSize = + CGM.getContext().getTypeSizeInChars(RecTy).getQuantity(); + GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + // TODO: allow the usage of shared memory to be controlled by + // the user, for now, default to global. + llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), + CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; + llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack), + GlobalRecordSizeArg); + llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy); + I->getSecond().GlobalRecordAddr = GlobalRecValue; + + // Emit the "global alloca" which is a GEP from the global declaration record + // using the pointer returned by the runtime. + for (auto &Rec : I->getSecond().LocalVarData) { + bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); + llvm::Value *ParValue; + if (EscapedParam) { + const auto *VD = cast<VarDecl>(Rec.first); + LValue ParLVal = + CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); + ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); + } + const FieldDecl *FD = Rec.second.first; + LValue VarAddr = CGF.EmitLValueForField(Base, FD); + Rec.second.second = VarAddr.getAddress(); + if (EscapedParam) { + const auto *VD = cast<VarDecl>(Rec.first); + CGF.EmitStoreOfScalar(ParValue, VarAddr); + I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress()); + } + } + I->getSecond().MappedParams->apply(CGF); +} + +void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) { + const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); + if (I != FunctionGlobalizedDecls.end() && I->getSecond().GlobalRecordAddr) { + I->getSecond().MappedParams->restore(CGF); + if (!CGF.HaveInsertPoint()) + return; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), + I->getSecond().GlobalRecordAddr); + } +} + void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, @@ -1317,7 +1392,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy); CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, Ctx.getPointerType(Ctx.VoidPtrTy)); - Idx++; + ++Idx; } } @@ -2750,8 +2825,8 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall( continue; } llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo( - /*AddrSpace=*/0)); + NativeArg, + NativeArg->getType()->getPointerElementType()->getPointerTo()); TargetArgs.emplace_back( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType)); } @@ -2788,7 +2863,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( auto *Fn = llvm::Function::Create( CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(/*D=*/GlobalDecl(), Fn, CGFI); + CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); Fn->setLinkage(llvm::GlobalValue::InternalLinkage); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); @@ -2854,6 +2929,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, "Function is registered already."); SmallVector<const ValueDecl *, 4> IgnoredDecls; const Stmt *Body = nullptr; + bool NeedToDelayGlobalization = false; if (const auto *FD = dyn_cast<FunctionDecl>(D)) { Body = FD->getBody(); } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { @@ -2861,6 +2937,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { Body = CD->getBody(); if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) { + NeedToDelayGlobalization = true; if (const auto *CS = dyn_cast<CapturedStmt>(Body)) { IgnoredDecls.reserve(CS->capture_size()); for (const auto &Capture : CS->captures()) @@ -2876,14 +2953,29 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord(); if (!GlobalizedVarsRecord) return; - auto &Res = - FunctionGlobalizedDecls - .try_emplace(CGF.CurFn, GlobalizedVarsRecord, DeclToAddrMapTy()) - .first->getSecond() - .second; + auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; + I->getSecond().MappedParams = + llvm::make_unique<CodeGenFunction::OMPMapVars>(); + I->getSecond().GlobalRecord = GlobalizedVarsRecord; + I->getSecond().EscapedParameters.insert( + VarChecker.getEscapedParameters().begin(), + VarChecker.getEscapedParameters().end()); + DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Res.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid()))); + Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid()))); + } + if (!NeedToDelayGlobalization) { + emitGenericVarsProlog(CGF, D->getLocStart()); + struct GlobalizationScope final : EHScopeStack::Cleanup { + GlobalizationScope() = default; + + void Emit(CodeGenFunction &CGF, Flags flags) override { + static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime()) + .emitGenericVarsEpilog(CGF); + } + }; + CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); } } @@ -2892,14 +2984,13 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF, auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) return Address::invalid(); - auto VDI = I->getSecond().second.find(VD); - if (VDI == I->getSecond().second.end()) + auto VDI = I->getSecond().LocalVarData.find(VD); + if (VDI == I->getSecond().LocalVarData.end()) return Address::invalid(); return VDI->second.second; } void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) { - FunctionToGlobalRecPtr.erase(CGF.CurFn); FunctionGlobalizedDecls.erase(CGF.CurFn); CGOpenMPRuntime::functionFinished(CGF); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index ee1d727a573..7b1944dca56 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -61,6 +61,12 @@ private: /// function. void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + /// Helper for generic variables globalization prolog. + void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc); + + /// Helper for generic variables globalization epilog. + void emitGenericVarsEpilog(CodeGenFunction &CGF); + /// \brief Helper for Spmd mode target directive's entry function. void emitSpmdEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, const OMPExecutableDirective &D); @@ -332,13 +338,18 @@ private: /// The map of local variables to their addresses in the global memory. using DeclToAddrMapTy = llvm::MapVector<const Decl *, std::pair<const FieldDecl *, Address>>; + /// Set of the parameters passed by value escaping OpenMP context. + using EscapedParamsTy = llvm::SmallPtrSet<const Decl *, 4>; + struct FunctionData { + DeclToAddrMapTy LocalVarData; + const RecordDecl *GlobalRecord = nullptr; + llvm::Value *GlobalRecordAddr = nullptr; + EscapedParamsTy EscapedParameters; + std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams; + }; /// Maps the function to the list of the globalized variables with their /// addresses. - llvm::DenseMap<llvm::Function *, - std::pair<const RecordDecl *, DeclToAddrMapTy>> - FunctionGlobalizedDecls; - /// Map from function to global record pointer. - llvm::DenseMap<llvm::Function *, llvm::Value *> FunctionToGlobalRecPtr; + llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls; }; } // CodeGen namespace. diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 9d077a2be16..02bb8ae88a1 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1262,7 +1262,7 @@ static void emitEmptyBoundParameters(CodeGenFunction &, void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { // Emit parallel region as a standalone region. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { OMPPrivateScope PrivateScope(CGF); bool Copyins = CGF.EmitOMPCopyinClause(S); (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); @@ -1277,6 +1277,7 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) { CGF.EmitOMPPrivateClause(S, PrivateScope); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.EmitStmt(S.getCapturedStmt(OMPD_parallel)->getCapturedStmt()); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); }; @@ -2100,7 +2101,8 @@ emitInnerParallelForWhenCombined(CodeGenFunction &CGF, const OMPLoopDirective &S, CodeGenFunction::JumpDest LoopExit) { auto &&CGInlinedWorksharingLoop = [&S](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { + Action.Enter(CGF); bool HasCancel = false; if (!isOpenMPSimdDirective(S.getDirectiveKind())) { if (const auto *D = dyn_cast<OMPTeamsDistributeParallelForDirective>(&S)) @@ -2682,7 +2684,8 @@ void CodeGenFunction::EmitOMPParallelForDirective( const OMPParallelForDirective &S) { // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'for' directive. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); OMPCancelStackRAII CancelRegion(CGF, OMPD_parallel_for, S.hasCancel()); CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds, emitDispatchForLoopBounds); @@ -2695,7 +2698,8 @@ void CodeGenFunction::EmitOMPParallelForSimdDirective( const OMPParallelForSimdDirective &S) { // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'for' directive. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds, emitDispatchForLoopBounds); }; @@ -2707,7 +2711,8 @@ void CodeGenFunction::EmitOMPParallelSectionsDirective( const OMPParallelSectionsDirective &S) { // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'sections' directive. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); CGF.EmitSections(S); }; emitCommonOMPParallelDirective(*this, S, OMPD_sections, CodeGen, @@ -4033,12 +4038,13 @@ static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF, void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &S) { // Emit teams region as a standalone region. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { OMPPrivateScope PrivateScope(CGF); (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); CGF.EmitOMPPrivateClause(S, PrivateScope); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.EmitStmt(S.getCapturedStmt(OMPD_teams)->getCapturedStmt()); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); }; @@ -4099,10 +4105,11 @@ emitTargetTeamsDistributeRegion(CodeGenFunction &CGF, PrePostActionTy &Action, // Emit teams region as a standalone region. auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute, CodeGenDistribute); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4144,10 +4151,11 @@ static void emitTargetTeamsDistributeSimdRegion( // Emit teams region as a standalone region. auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute, CodeGenDistribute); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4188,10 +4196,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeDirective( // Emit teams region as a standalone region. auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute, CodeGenDistribute); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4209,10 +4218,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeSimdDirective( // Emit teams region as a standalone region. auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_simd, CodeGenDistribute); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4231,10 +4241,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeParallelForDirective( // Emit teams region as a standalone region. auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute, CodeGenDistribute); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4253,10 +4264,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeParallelForSimdDirective( // Emit teams region as a standalone region. auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective( CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4277,10 +4289,11 @@ static void emitTargetTeamsDistributeParallelForRegion( // Emit teams region as a standalone region. auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective( CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4328,10 +4341,11 @@ static void emitTargetTeamsDistributeParallelForSimdRegion( // Emit teams region as a standalone region. auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF, - PrePostActionTy &) { + PrePostActionTy &Action) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); CGF.CGM.getOpenMPRuntime().emitInlinedDirective( CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); @@ -4599,12 +4613,13 @@ static void emitTargetParallelRegion(CodeGenFunction &CGF, // Get the captured statement associated with the 'parallel' region. auto *CS = S.getCapturedStmt(OMPD_parallel); Action.Enter(CGF); - auto &&CodeGen = [&S, CS](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S, CS](CodeGenFunction &CGF, PrePostActionTy &Action) { CodeGenFunction::OMPPrivateScope PrivateScope(CGF); (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); CGF.EmitOMPPrivateClause(S, PrivateScope); CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); + Action.Enter(CGF); // TODO: Add support for clauses. CGF.EmitStmt(CS->getCapturedStmt()); CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); @@ -4643,7 +4658,8 @@ static void emitTargetParallelForRegion(CodeGenFunction &CGF, Action.Enter(CGF); // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'for' directive. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); CodeGenFunction::OMPCancelStackRAII CancelRegion( CGF, OMPD_target_parallel_for, S.hasCancel()); CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds, @@ -4683,7 +4699,8 @@ emitTargetParallelForSimdRegion(CodeGenFunction &CGF, Action.Enter(CGF); // Emit directive as a combined directive that consists of two implicit // directives: 'parallel' with 'for' directive. - auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds, emitDispatchForLoopBounds); }; diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp new file mode 100644 index 00000000000..db0df9bd930 --- /dev/null +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// expected-no-diagnostics + +int foo(int &a) { return a; } + +int bar() { + int a; + return foo(a); +} + +// CHECK: define void @__omp_offloading_{{.*}}maini1{{.*}}_l[[@LINE+5]](i32* dereferenceable{{.*}}) +// CHECK-NOT: @__kmpc_data_sharing_push_stack + +int maini1() { + int a; +#pragma omp target parallel map(from:a) + { + int b; + a = foo(b) + bar(); + } + return a; +} + +// parallel region +// CHECK: define {{.*}}void @{{.*}}(i32* noalias {{.*}}, i32* noalias {{.*}}, i32* dereferenceable{{.*}}) +// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0) +// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%struct[.].*]]* +// CHECK: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: call {{.*}}[[FOO:@.*foo.*]](i32* dereferenceable{{.*}} [[B_ADDR]]) +// CHECK: call {{.*}}[[BAR:@.*bar.*]]() +// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]]) +// CHECK: ret void + +// CHECK: define {{.*}}[[FOO]](i32* dereferenceable{{.*}}) +// CHECK-NOT: @__kmpc_data_sharing_push_stack + +// CHECK: define {{.*}}[[BAR]]() +// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0) +// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%struct[.].*]]* +// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: call {{.*}}[[FOO]](i32* dereferenceable{{.*}} [[A_ADDR]]) +// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]]) +// CHECK: ret i32 diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp index b26d47c706a..c530d19fd7b 100644 --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -36,8 +36,12 @@ int main (int argc, char **argv) { // CK1: store {{.+}} 0, {{.+}}, // CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], // CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* -// CK1-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]], -// CK1-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0) +// CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] +// CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] +// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], +// CK1: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], // CK1: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]], // CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams( @@ -49,6 +53,10 @@ int main (int argc, char **argv) { // CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***, // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] +// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0) +// CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] +// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], // CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]], // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]], // CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]], @@ -104,8 +112,12 @@ int main (int argc, char **argv) { // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* -// CK2-64: store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]], -// CK2-32: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], +// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0) +// CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] +// CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] +// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], +// CK2: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]], // CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]], // CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]], // CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams( @@ -121,6 +133,10 @@ int main (int argc, char **argv) { // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], +// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0) +// CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] +// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], // CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]], // CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]], // CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]], |