diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2018-10-12 16:04:20 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2018-10-12 16:04:20 +0000 |
commit | 9bfe91da3d2c2eb3f0cba9dc587a51ddbbf4d8ac (patch) | |
tree | 6a8da57be99cd61b82cbe3160d91fc0fcd578744 /clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | |
parent | c046b6856ec91e65b1112e66af0fa6735c20bc7d (diff) | |
download | bcm5719-llvm-9bfe91da3d2c2eb3f0cba9dc587a51ddbbf4d8ac.tar.gz bcm5719-llvm-9bfe91da3d2c2eb3f0cba9dc587a51ddbbf4d8ac.zip |
[OPENMP][NVPTX]Reduce memory usage in orphaned functions.
if the function has globalized variables and called in context of
target/teams/distribute regions, it does not need to globalize 32
copies of the same variables for memory coalescing, it is enough to
have just one copy, because there is parallel region.
Patch does this by adding call for `__kmpc_parallel_level` function and
checking its return value. If the code sees that the parallel level is
0, then only one variable is allocated, not 32.
llvm-svn: 344356
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 79 |
1 files changed, 71 insertions, 8 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 944130b1abf..7ae83773117 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -1972,6 +1972,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, return; if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); + QualType SecGlobalRecTy; // Recover pointer to this function's global record. The runtime will // handle the specifics of the allocation of the memory. @@ -1986,11 +1987,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, llvm::PointerType *GlobalRecPtrTy = CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); llvm::Value *GlobalRecCastAddr; + llvm::Value *IsTTD = nullptr; if (WithSPMDCheck || getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); + if (I->getSecond().SecondaryGlobalRecord.hasValue()) { + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); + llvm::Value *ThreadID = getThreadID(CGF, Loc); + llvm::Value *PL = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level), + {RTLoc, ThreadID}); + IsTTD = Bld.CreateIsNull(PL); + } llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode))); Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); @@ -2003,11 +2013,28 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, // There is no need to emit line number for unconditional branch. (void)ApplyDebugLocation::CreateEmpty(CGF); CGF.EmitBlock(NonSPMDBB); + llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); + if (const RecordDecl *SecGlobalizedVarsRecord = + I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { + SecGlobalRecTy = + CGM.getContext().getRecordType(SecGlobalizedVarsRecord); + + // Recover pointer to this function's global record. The runtime will + // handle the specifics of the allocation of the memory. + // Use actual memory size of the record including the padding + // for alignment purposes. + unsigned Alignment = + CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); + unsigned GlobalRecordSize = + CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); + GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + Size = Bld.CreateSelect( + IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); + } // TODO: allow the usage of shared memory to be controlled by // the user, for now, default to global. llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; + Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_data_sharing_push_stack), @@ -2042,6 +2069,17 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, // Emit the "global alloca" which is a GEP from the global declaration // record using the pointer returned by the runtime. + LValue SecBase; + decltype(I->getSecond().LocalVarData)::const_iterator SecIt; + if (IsTTD) { + SecIt = I->getSecond().SecondaryLocalVarData->begin(); + llvm::PointerType *SecGlobalRecPtrTy = + CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); + SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( + Bld.CreatePointerBitCastOrAddrSpaceCast( + I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), + SecGlobalRecTy); + } for (auto &Rec : I->getSecond().LocalVarData) { bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); llvm::Value *ParValue; @@ -2055,23 +2093,32 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, // Emit VarAddr basing on lane-id if required. QualType VarTy; if (Rec.second.IsOnePerTeam) { - Rec.second.PrivateAddr = VarAddr.getAddress(); VarTy = Rec.second.FD->getType(); } else { llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( VarAddr.getAddress().getPointer(), {Bld.getInt32(0), getNVPTXLaneID(CGF)}); - Rec.second.PrivateAddr = - Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)); VarTy = Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); - VarAddr = CGF.MakeAddrLValue(Rec.second.PrivateAddr, VarTy, - AlignmentSource::Decl); + VarAddr = CGF.MakeAddrLValue( + Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, + AlignmentSource::Decl); } + Rec.second.PrivateAddr = VarAddr.getAddress(); if (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { assert(I->getSecond().IsInSPMDModeFlag && "Expected unknown execution mode or required SPMD check."); + if (IsTTD) { + assert(SecIt->second.IsOnePerTeam && + "Secondary glob data must be one per team."); + LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); + VarAddr.setAddress( + Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(), + VarAddr.getPointer()), + VarAddr.getAlignment())); + Rec.second.PrivateAddr = VarAddr.getAddress(); + } Address GlobalPtr = Rec.second.PrivateAddr; Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); Rec.second.PrivateAddr = Address( @@ -2084,6 +2131,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGF.EmitStoreOfScalar(ParValue, VarAddr); I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress()); } + ++SecIt; } } for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { @@ -4115,6 +4163,21 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, Data.insert( std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion))); } + if (!IsInTargetMasterThreadRegion && !NeedToDelayGlobalization && + !IsInParallelRegion) { + CheckVarsEscapingDeclContext VarChecker(CGF); + VarChecker.Visit(Body); + I->getSecond().SecondaryGlobalRecord = + VarChecker.getGlobalizedRecord(/*IsInTargetMasterThreadRegion=*/true); + I->getSecond().SecondaryLocalVarData.emplace(); + DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); + for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { + assert(VD->isCanonicalDecl() && "Expected canonical declaration"); + const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); + Data.insert(std::make_pair( + VD, MappedVarData(FD, /*IsInTargetMasterThreadRegion=*/true))); + } + } if (!NeedToDelayGlobalization) { emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); struct GlobalizationScope final : EHScopeStack::Cleanup { |