summaryrefslogtreecommitdiffstats
path: root/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2018-10-12 16:04:20 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2018-10-12 16:04:20 +0000
commit9bfe91da3d2c2eb3f0cba9dc587a51ddbbf4d8ac (patch)
tree6a8da57be99cd61b82cbe3160d91fc0fcd578744 /clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
parentc046b6856ec91e65b1112e66af0fa6735c20bc7d (diff)
downloadbcm5719-llvm-9bfe91da3d2c2eb3f0cba9dc587a51ddbbf4d8ac.tar.gz
bcm5719-llvm-9bfe91da3d2c2eb3f0cba9dc587a51ddbbf4d8ac.zip
[OPENMP][NVPTX]Reduce memory usage in orphaned functions.
If a function has globalized variables and is called in the context of target/teams/distribute regions, it does not need to globalize 32 copies of the same variables for memory coalescing; a single copy is enough, because there is no active parallel region. The patch does this by adding a call to the `__kmpc_parallel_level` function and checking its return value. If the code sees that the parallel level is 0, then only one variable is allocated, not 32. llvm-svn: 344356
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp79
1 files changed, 71 insertions, 8 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 944130b1abf..7ae83773117 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -1972,6 +1972,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
return;
if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
+ QualType SecGlobalRecTy;
// Recover pointer to this function's global record. The runtime will
// handle the specifics of the allocation of the memory.
@@ -1986,11 +1987,20 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
llvm::PointerType *GlobalRecPtrTy =
CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
llvm::Value *GlobalRecCastAddr;
+ llvm::Value *IsTTD = nullptr;
if (WithSPMDCheck ||
getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
+ if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
+ llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+ llvm::Value *ThreadID = getThreadID(CGF, Loc);
+ llvm::Value *PL = CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
+ {RTLoc, ThreadID});
+ IsTTD = Bld.CreateIsNull(PL);
+ }
llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
@@ -2003,11 +2013,28 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
// There is no need to emit line number for unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(NonSPMDBB);
+ llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
+ if (const RecordDecl *SecGlobalizedVarsRecord =
+ I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
+ SecGlobalRecTy =
+ CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
+
+ // Recover pointer to this function's global record. The runtime will
+ // handle the specifics of the allocation of the memory.
+ // Use actual memory size of the record including the padding
+ // for alignment purposes.
+ unsigned Alignment =
+ CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
+ unsigned GlobalRecordSize =
+ CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
+ GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
+ Size = Bld.CreateSelect(
+ IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
+ }
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
llvm::Value *GlobalRecordSizeArg[] = {
- llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
- CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
+ Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
llvm::Value *GlobalRecValue =
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
@@ -2042,6 +2069,17 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
// Emit the "global alloca" which is a GEP from the global declaration
// record using the pointer returned by the runtime.
+ LValue SecBase;
+ decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
+ if (IsTTD) {
+ SecIt = I->getSecond().SecondaryLocalVarData->begin();
+ llvm::PointerType *SecGlobalRecPtrTy =
+ CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
+ SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
+ Bld.CreatePointerBitCastOrAddrSpaceCast(
+ I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
+ SecGlobalRecTy);
+ }
for (auto &Rec : I->getSecond().LocalVarData) {
bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
llvm::Value *ParValue;
@@ -2055,23 +2093,32 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
// Emit VarAddr basing on lane-id if required.
QualType VarTy;
if (Rec.second.IsOnePerTeam) {
- Rec.second.PrivateAddr = VarAddr.getAddress();
VarTy = Rec.second.FD->getType();
} else {
llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
VarAddr.getAddress().getPointer(),
{Bld.getInt32(0), getNVPTXLaneID(CGF)});
- Rec.second.PrivateAddr =
- Address(Ptr, CGM.getContext().getDeclAlign(Rec.first));
VarTy =
Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
- VarAddr = CGF.MakeAddrLValue(Rec.second.PrivateAddr, VarTy,
- AlignmentSource::Decl);
+ VarAddr = CGF.MakeAddrLValue(
+ Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
+ AlignmentSource::Decl);
}
+ Rec.second.PrivateAddr = VarAddr.getAddress();
if (WithSPMDCheck ||
- getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
+ getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
assert(I->getSecond().IsInSPMDModeFlag &&
"Expected unknown execution mode or required SPMD check.");
+ if (IsTTD) {
+ assert(SecIt->second.IsOnePerTeam &&
+ "Secondary glob data must be one per team.");
+ LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
+ VarAddr.setAddress(
+ Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
+ VarAddr.getPointer()),
+ VarAddr.getAlignment()));
+ Rec.second.PrivateAddr = VarAddr.getAddress();
+ }
Address GlobalPtr = Rec.second.PrivateAddr;
Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
Rec.second.PrivateAddr = Address(
@@ -2084,6 +2131,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
CGF.EmitStoreOfScalar(ParValue, VarAddr);
I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
}
+ ++SecIt;
}
}
for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
@@ -4115,6 +4163,21 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
Data.insert(
std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion)));
}
+ if (!IsInTargetMasterThreadRegion && !NeedToDelayGlobalization &&
+ !IsInParallelRegion) {
+ CheckVarsEscapingDeclContext VarChecker(CGF);
+ VarChecker.Visit(Body);
+ I->getSecond().SecondaryGlobalRecord =
+ VarChecker.getGlobalizedRecord(/*IsInTargetMasterThreadRegion=*/true);
+ I->getSecond().SecondaryLocalVarData.emplace();
+ DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
+ for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
+ assert(VD->isCanonicalDecl() && "Expected canonical declaration");
+ const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
+ Data.insert(std::make_pair(
+ VD, MappedVarData(FD, /*IsInTargetMasterThreadRegion=*/true)));
+ }
+ }
if (!NeedToDelayGlobalization) {
emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
struct GlobalizationScope final : EHScopeStack::Cleanup {
OpenPOWER on IntegriCloud