diff options
Diffstat (limited to 'clang/lib')
-rw-r--r-- | clang/lib/CodeGen/CGCUDANV.cpp | 122 |
1 files changed, 105 insertions, 17 deletions
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index b541b1046f5..5fcc9e011bc 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -309,12 +309,24 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { } /// Creates a global constructor function for the module: +/// +/// For CUDA: /// \code /// void __cuda_module_ctor(void*) { /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); /// __cuda_register_globals(Handle); /// } /// \endcode +/// +/// For HIP: +/// \code +/// void __hip_module_ctor(void*) { +/// if (__hip_gpubin_handle == 0) { +/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); +/// __hip_register_globals(__hip_gpubin_handle); +/// } +/// } +/// \endcode llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { bool IsHIP = CGM.getLangOpts().HIP; // No need to generate ctors/dtors if there is no GPU binary. @@ -427,22 +439,68 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { /*constant*/ true); FatbinWrapper->setSection(FatbinSectionName); - // Register binary with CUDA/HIP runtime. This is substantially different in - // default mode vs. separate compilation! - if (!RelocatableDeviceCode) { - // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper); + // There is only one HIP fat binary per linked module, however there are + // multiple constructor functions. Make sure the fat binary is registered + // only once. The constructor functions are executed by the dynamic loader + // before the program gains control. The dynamic loader cannot execute the + // constructor functions concurrently since doing that would not guarantee + // thread safety of the loaded program. Therefore we can assume sequential + // execution of constructor functions here. + if (IsHIP) { + llvm::BasicBlock *IfBlock = + llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); + llvm::BasicBlock *ExitBlock = + llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); + // The name, size, and initialization pattern of this variable is part + // of HIP ABI. + GpuBinaryHandle = new llvm::GlobalVariable( + TheModule, VoidPtrPtrTy, /*isConstant=*/false, + llvm::GlobalValue::LinkOnceAnyLinkage, + /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), + "__hip_gpubin_handle"); + GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); + Address GpuBinaryAddr( + GpuBinaryHandle, + CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); + { + auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); + llvm::Constant *Zero = + llvm::Constant::getNullValue(HandleValue->getType()); + llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); + CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); + } + { + CtorBuilder.SetInsertPoint(IfBlock); + // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); + llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( + RegisterFatbinFunc, + CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); + CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); + CtorBuilder.CreateBr(ExitBlock); + } + { + CtorBuilder.SetInsertPoint(ExitBlock); + // Call __hip_register_globals(GpuBinaryHandle); + if (RegisterGlobalsFunc) { + auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); + CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); + } + } + } else if (!RelocatableDeviceCode) { + // Register binary with CUDA runtime. This is substantially different in + // default mode vs. separate compilation! + // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); GpuBinaryHandle = new llvm::GlobalVariable( TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(VoidPtrPtrTy), - addUnderscoredPrefixToName("_gpubin_handle")); - + llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); + GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity()); CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, CGM.getPointerAlign()); - // Call __{cuda|hip}_register_globals(GpuBinaryHandle); + // Call __cuda_register_globals(GpuBinaryHandle); if (RegisterGlobalsFunc) CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); } else { @@ -453,15 +511,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { llvm::Constant *ModuleIDConstant = makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); - // Create an alias for the FatbinWrapper that nvcc or hip backend will - // look for. + // Create an alias for the FatbinWrapper that nvcc will look for. llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, Twine("__fatbinwrap") + ModuleID, FatbinWrapper); - // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *, + // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, // void *, void (*)(void **)) - SmallString<128> RegisterLinkedBinaryName( - addUnderscoredPrefixToName("RegisterLinkedBinary")); + SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); RegisterLinkedBinaryName += ModuleID; llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); @@ -493,11 +549,23 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { /// Creates a global destructor function that unregisters the GPU code blob /// registered by constructor. +/// +/// For CUDA: /// \code /// void __cuda_module_dtor(void*) { /// __cudaUnregisterFatBinary(Handle); /// } /// \endcode +/// +/// For HIP: +/// \code +/// void __hip_module_dtor(void*) { +/// if (__hip_gpubin_handle) { +/// __hipUnregisterFatBinary(__hip_gpubin_handle); +/// __hip_gpubin_handle = 0; +/// } +/// } +/// \endcode llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { // No need for destructor if we don't have a handle to unregister. if (!GpuBinaryHandle) @@ -518,10 +586,30 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { CGBuilderTy DtorBuilder(CGM, Context); DtorBuilder.SetInsertPoint(DtorEntryBB); - auto HandleValue = - DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); - DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); - + Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity( + GpuBinaryHandle->getAlignment())); + auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); + // There is only one HIP fat binary per linked module, however there are + // multiple destructor functions. Make sure the fat binary is unregistered + // only once. + if (CGM.getLangOpts().HIP) { + llvm::BasicBlock *IfBlock = + llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); + llvm::BasicBlock *ExitBlock = + llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); + llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); + llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); + DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); + + DtorBuilder.SetInsertPoint(IfBlock); + DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); + DtorBuilder.CreateStore(Zero, GpuBinaryAddr); + DtorBuilder.CreateBr(ExitBlock); + + DtorBuilder.SetInsertPoint(ExitBlock); + } else { + DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); + } DtorBuilder.CreateRetVoid(); return ModuleDtorFunc; } |