summaryrefslogtreecommitdiffstats
path: root/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp')
-rw-r--r--clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp150
1 files changed, 97 insertions, 53 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 86acf57d721..ab9631c1fee 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -176,6 +176,9 @@ enum MachineConfiguration : unsigned {
/// Global memory alignment for performance.
GlobalMemoryAlignment = 128,
+
+ /// Maximal size of the shared memory buffer.
+ SharedMemorySize = 128,
};
enum NamedBarrier : unsigned {
@@ -1143,13 +1146,6 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = true;
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
- if (!StaticGlobalized) {
- StaticGlobalized = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage, nullptr,
- "_openmp_static_glob_rd$ptr");
- StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
- }
if (!KernelStaticGlobalized) {
KernelStaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
@@ -1277,13 +1273,6 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = true;
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
- if (!StaticGlobalized) {
- StaticGlobalized = new llvm::GlobalVariable(
- CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
- llvm::GlobalValue::WeakAnyLinkage, nullptr,
- "_openmp_static_glob_rd$ptr");
- StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
- }
if (!KernelStaticGlobalized) {
KernelStaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
@@ -2138,30 +2127,41 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
++GlobalizedRecords.back().RegionCounter;
if (GlobalizedRecords.back().Records.size() == 1) {
- assert(StaticGlobalized &&
- "Static pointer must be initialized already.");
- Address Buffer = CGF.EmitLoadOfPointer(
- Address(StaticGlobalized, CGM.getPointerAlign()),
- CGM.getContext()
- .getPointerType(CGM.getContext().VoidPtrTy)
- .castAs<PointerType>());
+ assert(KernelStaticGlobalized &&
+ "Kernel static pointer must be initialized already.");
+ auto *UseSharedMemory = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
+ llvm::GlobalValue::InternalLinkage, nullptr,
+ "_openmp_static_kernel$is_shared");
+ UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
+ QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
+ /*DestWidth=*/16, /*Signed=*/0);
+ llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
+ Address(UseSharedMemory,
+ CGM.getContext().getTypeAlignInChars(Int16Ty)),
+ /*Volatile=*/false, Int16Ty, Loc);
+ auto *StaticGlobalized = new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
+ llvm::GlobalValue::WeakAnyLinkage, nullptr);
auto *RecSize = new llvm::GlobalVariable(
CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_static_kernel$size");
RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
llvm::Value *Ld = CGF.EmitLoadOfScalar(
- Address(RecSize, CGM.getPointerAlign()), /*Volatile=*/false,
+ Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
CGM.getContext().getSizeType(), Loc);
llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
KernelStaticGlobalized, CGM.VoidPtrPtrTy);
- llvm::Value *GlobalRecordSizeArg[] = {
- Buffer.getPointer(), Ld,
- llvm::ConstantInt::getNullValue(CGM.Int16Ty), ResAddr};
+ llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld,
+ IsInSharedMemory, ResAddr};
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_get_team_static_memory),
GlobalRecordSizeArg);
+ GlobalizedRecords.back().Buffer = StaticGlobalized;
GlobalizedRecords.back().RecSize = RecSize;
+ GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
+ GlobalizedRecords.back().Loc = Loc;
}
assert(KernelStaticGlobalized && "Global address must be set already.");
Address FrameAddr = CGF.EmitLoadOfPointer(
@@ -2336,10 +2336,16 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
--GlobalizedRecords.back().RegionCounter;
// Emit the restore function only in the target region.
if (GlobalizedRecords.back().RegionCounter == 0) {
+ QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
+ /*DestWidth=*/16, /*Signed=*/0);
+ llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
+ Address(GlobalizedRecords.back().UseSharedMemory,
+ CGM.getContext().getTypeAlignInChars(Int16Ty)),
+ /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_restore_team_static_memory),
- llvm::ConstantInt::getNullValue(CGM.Int16Ty));
+ IsInSharedMemory);
}
} else {
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
@@ -4507,21 +4513,24 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
void CGOpenMPRuntimeNVPTX::clear() {
if (!GlobalizedRecords.empty()) {
ASTContext &C = CGM.getContext();
+ llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
+ llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
+ RecordDecl *SharedStaticRD = C.buildImplicitRecord(
+ "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
+ SharedStaticRD->startDefinition();
for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
if (Records.Records.empty())
continue;
unsigned Size = 0;
unsigned RecAlignment = 0;
for (const RecordDecl *RD : Records.Records) {
- QualType RDTy = CGM.getContext().getRecordType(RD);
- unsigned Alignment =
- CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
+ QualType RDTy = C.getRecordType(RD);
+ unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
RecAlignment = std::max(RecAlignment, Alignment);
- unsigned RecSize =
- CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
+ unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
Size =
llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
}
@@ -4529,32 +4538,67 @@ void CGOpenMPRuntimeNVPTX::clear() {
llvm::APInt ArySize(/*numBits=*/64, Size);
QualType SubTy = C.getConstantArrayType(
C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
- auto *Field = FieldDecl::Create(
- C, StaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
- C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
- /*BW=*/nullptr, /*Mutable=*/false,
- /*InitStyle=*/ICIS_NoInit);
+ const bool UseSharedMemory = Size <= SharedMemorySize;
+ auto *Field =
+ FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
+ SourceLocation(), SourceLocation(), nullptr, SubTy,
+ C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
+ /*BW=*/nullptr, /*Mutable=*/false,
+ /*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
- StaticRD->addDecl(Field);
+ if (UseSharedMemory) {
+ SharedStaticRD->addDecl(Field);
+ SharedRecs.push_back(&Records);
+ } else {
+ StaticRD->addDecl(Field);
+ GlobalRecs.push_back(&Records);
+ }
Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
+ Records.UseSharedMemory->setInitializer(
+ llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
+ }
+ SharedStaticRD->completeDefinition();
+ if (!SharedStaticRD->field_empty()) {
+ QualType StaticTy = C.getRecordType(SharedStaticRD);
+ llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
+ auto *GV = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMStaticTy,
+ /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::Constant::getNullValue(LLVMStaticTy),
+ "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
+ llvm::GlobalValue::NotThreadLocal,
+ C.getTargetAddressSpace(LangAS::cuda_shared));
+ auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ GV, CGM.VoidPtrTy);
+ for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
+ Rec->Buffer->replaceAllUsesWith(Replacement);
+ Rec->Buffer->eraseFromParent();
+ }
}
StaticRD->completeDefinition();
- QualType StaticTy = C.getRecordType(StaticRD);
- std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
- llvm::APInt Size1(32, SMsBlockPerSM.second);
- QualType Arr1Ty = C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
- /*IndexTypeQuals=*/0);
- llvm::APInt Size2(32, SMsBlockPerSM.first);
- QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
- /*IndexTypeQuals=*/0);
- llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
- auto *GV = new llvm::GlobalVariable(
- CGM.getModule(), LLVMArr2Ty,
- /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
- llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_");
- StaticGlobalized->setInitializer(
- llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
- CGM.VoidPtrTy));
+ if (!StaticRD->field_empty()) {
+ QualType StaticTy = C.getRecordType(StaticRD);
+ std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
+ llvm::APInt Size1(32, SMsBlockPerSM.second);
+ QualType Arr1Ty =
+ C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
+ /*IndexTypeQuals=*/0);
+ llvm::APInt Size2(32, SMsBlockPerSM.first);
+ QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
+ /*IndexTypeQuals=*/0);
+ llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
+ auto *GV = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMArr2Ty,
+ /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ llvm::Constant::getNullValue(LLVMArr2Ty),
+ "_openmp_static_glob_rd_$_");
+ auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ GV, CGM.VoidPtrTy);
+ for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
+ Rec->Buffer->replaceAllUsesWith(Replacement);
+ Rec->Buffer->eraseFromParent();
+ }
+ }
}
CGOpenMPRuntime::clear();
}
OpenPOWER on IntegriCloud