summary refs log tree commit diff stats
path: root/clang/lib
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib')
-rw-r--r-- clang/lib/CodeGen/CGCUDANV.cpp 122
1 files changed, 105 insertions, 17 deletions
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index b541b1046f5..5fcc9e011bc 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -309,12 +309,24 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
}
/// Creates a global constructor function for the module:
+///
+/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
/// Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
/// __cuda_register_globals(Handle);
/// }
/// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_ctor(void*) {
+/// if (__hip_gpubin_handle == 0) {
+/// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
+/// }
+/// __hip_register_globals(__hip_gpubin_handle);
+/// }
+/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
bool IsHIP = CGM.getLangOpts().HIP;
// No need to generate ctors/dtors if there is no GPU binary.
@@ -427,22 +439,68 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
/*constant*/ true);
FatbinWrapper->setSection(FatbinSectionName);
- // Register binary with CUDA/HIP runtime. This is substantially different in
- // default mode vs. separate compilation!
- if (!RelocatableDeviceCode) {
- // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
+ // There is only one HIP fat binary per linked module; however, there are
+ // multiple constructor functions. Make sure the fat binary is registered
+ // only once. The constructor functions are executed by the dynamic loader
+ // before the program gains control. The dynamic loader cannot execute the
+ // constructor functions concurrently since doing that would not guarantee
+ // thread safety of the loaded program. Therefore we can assume sequential
+ // execution of constructor functions here.
+ if (IsHIP) {
+ llvm::BasicBlock *IfBlock =
+ llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
+ llvm::BasicBlock *ExitBlock =
+ llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
+ // The name, size, and initialization pattern of this variable are part
+ // of the HIP ABI.
+ GpuBinaryHandle = new llvm::GlobalVariable(
+ TheModule, VoidPtrPtrTy, /*isConstant=*/false,
+ llvm::GlobalValue::LinkOnceAnyLinkage,
+ /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
+ "__hip_gpubin_handle");
+ GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
+ Address GpuBinaryAddr(
+ GpuBinaryHandle,
+ CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
+ {
+ auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+ llvm::Constant *Zero =
+ llvm::Constant::getNullValue(HandleValue->getType());
+ llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
+ CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
+ }
+ {
+ CtorBuilder.SetInsertPoint(IfBlock);
+ // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
+ llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+ RegisterFatbinFunc,
+ CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+ CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
+ CtorBuilder.CreateBr(ExitBlock);
+ }
+ {
+ CtorBuilder.SetInsertPoint(ExitBlock);
+ // Call __hip_register_globals(GpuBinaryHandle);
+ if (RegisterGlobalsFunc) {
+ auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+ CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
+ }
+ }
+ } else if (!RelocatableDeviceCode) {
+ // Register binary with CUDA runtime. This is substantially different in
+ // default mode vs. separate compilation!
+ // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
RegisterFatbinFunc,
CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
GpuBinaryHandle = new llvm::GlobalVariable(
TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
- llvm::ConstantPointerNull::get(VoidPtrPtrTy),
- addUnderscoredPrefixToName("_gpubin_handle"));
-
+ llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+ GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());
- // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
+ // Call __cuda_register_globals(GpuBinaryHandle);
if (RegisterGlobalsFunc)
CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
} else {
@@ -453,15 +511,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
llvm::Constant *ModuleIDConstant =
makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
- // Create an alias for the FatbinWrapper that nvcc or hip backend will
- // look for.
+ // Create an alias for the FatbinWrapper that nvcc will look for.
llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
- // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
+ // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
// void *, void (*)(void **))
- SmallString<128> RegisterLinkedBinaryName(
- addUnderscoredPrefixToName("RegisterLinkedBinary"));
+ SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
RegisterLinkedBinaryName += ModuleID;
llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
@@ -493,11 +549,23 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
+///
+/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
/// __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_dtor(void*) {
+/// if (__hip_gpubin_handle) {
+/// __hipUnregisterFatBinary(__hip_gpubin_handle);
+/// __hip_gpubin_handle = 0;
+/// }
+/// }
+/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
// No need for destructor if we don't have a handle to unregister.
if (!GpuBinaryHandle)
@@ -518,10 +586,30 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
CGBuilderTy DtorBuilder(CGM, Context);
DtorBuilder.SetInsertPoint(DtorEntryBB);
- auto HandleValue =
- DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
- DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
-
+ Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
+ GpuBinaryHandle->getAlignment()));
+ auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
+ // There is only one HIP fat binary per linked module; however, there are
+ // multiple destructor functions. Make sure the fat binary is unregistered
+ // only once.
+ if (CGM.getLangOpts().HIP) {
+ llvm::BasicBlock *IfBlock =
+ llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
+ llvm::BasicBlock *ExitBlock =
+ llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
+ llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
+ llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
+ DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
+
+ DtorBuilder.SetInsertPoint(IfBlock);
+ DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+ DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
+ DtorBuilder.CreateBr(ExitBlock);
+
+ DtorBuilder.SetInsertPoint(ExitBlock);
+ } else {
+ DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+ }
DtorBuilder.CreateRetVoid();
return ModuleDtorFunc;
}
OpenPOWER on IntegriCloud