diff options
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 150 | ||||
-rw-r--r-- | clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 6 |
2 files changed, 100 insertions, 56 deletions
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 86acf57d721..ab9631c1fee 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -176,6 +176,9 @@ enum MachineConfiguration : unsigned { /// Global memory alignment for performance. GlobalMemoryAlignment = 128, + + /// Maximal size of the shared memory buffer. + SharedMemorySize = 128, }; enum NamedBarrier : unsigned { @@ -1143,13 +1146,6 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, IsInTTDRegion = true; // Reserve place for the globalized memory. GlobalizedRecords.emplace_back(); - if (!StaticGlobalized) { - StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, nullptr, - "_openmp_static_glob_rd$ptr"); - StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - } if (!KernelStaticGlobalized) { KernelStaticGlobalized = new llvm::GlobalVariable( CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, @@ -1277,13 +1273,6 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D, IsInTTDRegion = true; // Reserve place for the globalized memory. GlobalizedRecords.emplace_back(); - if (!StaticGlobalized) { - StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, nullptr, - "_openmp_static_glob_rd$ptr"); - StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - } if (!KernelStaticGlobalized) { KernelStaticGlobalized = new llvm::GlobalVariable( CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, @@ -2138,30 +2127,41 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); ++GlobalizedRecords.back().RegionCounter; if (GlobalizedRecords.back().Records.size() == 1) { - assert(StaticGlobalized && - "Static pointer must be initialized already."); - Address Buffer = CGF.EmitLoadOfPointer( - Address(StaticGlobalized, CGM.getPointerAlign()), - CGM.getContext() - .getPointerType(CGM.getContext().VoidPtrTy) - .castAs<PointerType>()); + assert(KernelStaticGlobalized && + "Kernel static pointer must be initialized already."); + auto *UseSharedMemory = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, nullptr, + "_openmp_static_kernel$is_shared"); + UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( + /*DestWidth=*/16, /*Signed=*/0); + llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( + Address(UseSharedMemory, + CGM.getContext().getTypeAlignInChars(Int16Ty)), + /*Volatile=*/false, Int16Ty, Loc); + auto *StaticGlobalized = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, + llvm::GlobalValue::WeakAnyLinkage, nullptr); auto *RecSize = new llvm::GlobalVariable( CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage, nullptr, "_openmp_static_kernel$size"); RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); llvm::Value *Ld = CGF.EmitLoadOfScalar( - Address(RecSize, CGM.getPointerAlign()), /*Volatile=*/false, + Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, CGM.getContext().getSizeType(), Loc); llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = { - Buffer.getPointer(), Ld, - llvm::ConstantInt::getNullValue(CGM.Int16Ty), ResAddr}; + llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld, + IsInSharedMemory, ResAddr}; CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_get_team_static_memory), GlobalRecordSizeArg); + GlobalizedRecords.back().Buffer = StaticGlobalized; GlobalizedRecords.back().RecSize = RecSize; + GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; + GlobalizedRecords.back().Loc = Loc; } assert(KernelStaticGlobalized && "Global address must be set already."); Address FrameAddr = CGF.EmitLoadOfPointer( @@ -2336,10 +2336,16 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, --GlobalizedRecords.back().RegionCounter; // Emit the restore function only in the target region. if (GlobalizedRecords.back().RegionCounter == 0) { + QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( + /*DestWidth=*/16, /*Signed=*/0); + llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( + Address(GlobalizedRecords.back().UseSharedMemory, + CGM.getContext().getTypeAlignInChars(Int16Ty)), + /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_restore_team_static_memory), - llvm::ConstantInt::getNullValue(CGM.Int16Ty)); + IsInSharedMemory); } } else { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( @@ -4507,21 +4513,24 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) { void CGOpenMPRuntimeNVPTX::clear() { if (!GlobalizedRecords.empty()) { ASTContext &C = CGM.getContext(); + llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs; + llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs; RecordDecl *StaticRD = C.buildImplicitRecord( "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); StaticRD->startDefinition(); + RecordDecl *SharedStaticRD = C.buildImplicitRecord( + "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); + SharedStaticRD->startDefinition(); for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { if (Records.Records.empty()) continue; unsigned Size = 0; unsigned RecAlignment = 0; for (const RecordDecl *RD : Records.Records) { - QualType RDTy = CGM.getContext().getRecordType(RD); - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); + QualType RDTy = C.getRecordType(RD); + unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity(); RecAlignment = std::max(RecAlignment, Alignment); - unsigned RecSize = - CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); + unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity(); Size = llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); } @@ -4529,32 +4538,67 @@ void CGOpenMPRuntimeNVPTX::clear() { llvm::APInt ArySize(/*numBits=*/64, Size); QualType SubTy = C.getConstantArrayType( C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0); - auto *Field = FieldDecl::Create( - C, StaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); + const bool UseSharedMemory = Size <= SharedMemorySize; + auto *Field = + FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD, + SourceLocation(), SourceLocation(), nullptr, SubTy, + C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); Field->setAccess(AS_public); - StaticRD->addDecl(Field); + if (UseSharedMemory) { + SharedStaticRD->addDecl(Field); + SharedRecs.push_back(&Records); + } else { + StaticRD->addDecl(Field); + GlobalRecs.push_back(&Records); + } Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); + Records.UseSharedMemory->setInitializer( + llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); + } + SharedStaticRD->completeDefinition(); + if (!SharedStaticRD->field_empty()) { + QualType StaticTy = C.getRecordType(SharedStaticRD); + llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), LLVMStaticTy, + /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + llvm::Constant::getNullValue(LLVMStaticTy), + "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, + llvm::GlobalValue::NotThreadLocal, + C.getTargetAddressSpace(LangAS::cuda_shared)); + auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + GV, CGM.VoidPtrTy); + for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { + Rec->Buffer->replaceAllUsesWith(Replacement); + Rec->Buffer->eraseFromParent(); + } } StaticRD->completeDefinition(); - QualType StaticTy = C.getRecordType(StaticRD); - std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); - llvm::APInt Size1(32, SMsBlockPerSM.second); - QualType Arr1Ty = C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::APInt Size2(32, SMsBlockPerSM.first); - QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, - llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_"); - StaticGlobalized->setInitializer( - llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, - CGM.VoidPtrTy)); + if (!StaticRD->field_empty()) { + QualType StaticTy = C.getRecordType(StaticRD); + std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM); + llvm::APInt Size1(32, SMsBlockPerSM.second); + QualType Arr1Ty = + C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal, + /*IndexTypeQuals=*/0); + llvm::APInt Size2(32, SMsBlockPerSM.first); + QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal, + /*IndexTypeQuals=*/0); + llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), LLVMArr2Ty, + /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, + llvm::Constant::getNullValue(LLVMArr2Ty), + "_openmp_static_glob_rd_$_"); + auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( + GV, CGM.VoidPtrTy); + for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) { + Rec->Buffer->replaceAllUsesWith(Replacement); + Rec->Buffer->eraseFromParent(); + } + } } CGOpenMPRuntime::clear(); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index dabc6b4f85d..a157e421e06 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -420,14 +420,14 @@ private: /// union. This resulting union (one per CU) is the entry point for the static /// memory management runtime functions. struct GlobalPtrSizeRecsTy { + llvm::GlobalVariable *UseSharedMemory = nullptr; llvm::GlobalVariable *RecSize = nullptr; + llvm::GlobalVariable *Buffer = nullptr; + SourceLocation Loc; llvm::SmallVector<const RecordDecl *, 2> Records; unsigned RegionCounter = 0; }; llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords; - /// Global variable used for staticlly allocated global memoryused for - /// globalization in target/teams/distribute regions. - llvm::GlobalVariable *StaticGlobalized = nullptr; /// Shared pointer for the global memory in the global memory buffer used for /// the given kernel. llvm::GlobalVariable *KernelStaticGlobalized = nullptr; |