4 files changed, 212 insertions, 43 deletions
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index e9e5483e5ac..4c2e10b65cc 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -27,6 +27,8 @@ using namespace clang;
 using namespace CodeGen;
 
 namespace {
+constexpr unsigned CudaFatMagic = 0x466243b1;
+constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
 class CGNVCUDARuntime : public CGCUDARuntime {
 
@@ -310,19 +312,20 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
+  bool IsHIP = CGM.getLangOpts().HIP;
   // No need to generate ctors/dtors if there is no GPU binary.
-  std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
-  if (GpuBinaryFileName.empty())
+  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+  if (CudaGpuBinaryFileName.empty() && !IsHIP)
     return nullptr;
 
-  // void __cuda_register_globals(void* handle);
+  // void __{cuda|hip}_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
   // We always need a function to pass in as callback. Create a dummy
   // implementation if we don't need to register anything.
   if (RelocatableDeviceCode && !RegisterGlobalsFunc)
     RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
 
-  // void ** __cudaRegisterFatBinary(void *);
+  // void ** __{cuda|hip}RegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
       addUnderscoredPrefixToName("RegisterFatBinary"));
@@ -334,12 +337,16 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
   // global variable and save a reference in GpuBinaryHandle to be cleaned up
   // in destructor on exit. Then associate all known kernels with the GPU binary
   // handle so CUDA runtime can figure out what to call on the GPU side.
-  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
-      llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
-  if (std::error_code EC = GpuBinaryOrErr.getError()) {
-    CGM.getDiags().Report(diag::err_cannot_open_file)
-        << GpuBinaryFileName << EC.message();
-    return nullptr;
+  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary;
+  if (!IsHIP) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
+        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
+    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
+      CGM.getDiags().Report(diag::err_cannot_open_file)
+          << CudaGpuBinaryFileName << EC.message();
+      return nullptr;
+    }
+    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
   }
 
   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
@@ -353,28 +360,60 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
   CtorBuilder.SetInsertPoint(CtorEntryBB);
 
   const char *FatbinConstantName;
-  if (RelocatableDeviceCode)
+  const char *FatbinSectionName;
+  const char *ModuleIDSectionName;
+  StringRef ModuleIDPrefix;
+  llvm::Constant *FatBinStr;
+  unsigned FatMagic;
+  if (IsHIP) {
+    FatbinConstantName = ".hip_fatbin";
+    FatbinSectionName = ".hipFatBinSegment";
+
+    ModuleIDSectionName = "__hip_module_id";
+    ModuleIDPrefix = "__hip_";
+
+    // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin.
+    // The external symbol is supposed to contain the fat binary but will be
+    // populated somewhere else, e.g. by lld through a linker script.
+    FatBinStr = new llvm::GlobalVariable(
+        CGM.getModule(), CGM.Int8Ty,
+        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
+        "__hip_fatbin", nullptr,
+        llvm::GlobalVariable::NotThreadLocal);
+    cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
+
+    FatMagic = HIPFatMagic;
+  } else {
+    if (RelocatableDeviceCode)
+      // TODO: Figure out how this is called on mac OS!
+      FatbinConstantName = "__nv_relfatbin";
+    else
+      FatbinConstantName =
+          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+    // NVIDIA's cuobjdump looks for fatbins in this section.
+    FatbinSectionName =
+        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
     // TODO: Figure out how this is called on mac OS!
-    FatbinConstantName = "__nv_relfatbin";
-  else
-    FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
-  // NVIDIA's cuobjdump looks for fatbins in this section.
-  const char *FatbinSectionName =
-      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
-  // TODO: Figure out how this is called on mac OS!
-  const char *NVModuleIDSectionName = "__nv_module_id";
+    ModuleIDSectionName = "__nv_module_id";
+    ModuleIDPrefix = "__nv_";
+
+    // For CUDA, create a string literal containing the fat binary loaded from
+    // the given file.
+    FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "",
+                                   FatbinConstantName, 8);
+    FatMagic = CudaFatMagic;
+  }
 
   // Create initialized wrapper structure that points to the loaded GPU binary
   ConstantInitBuilder Builder(CGM);
   auto Values = Builder.beginStruct(FatbinWrapperTy);
   // Fatbin wrapper magic.
-  Values.addInt(IntTy, 0x466243b1);
+  Values.addInt(IntTy, FatMagic);
   // Fatbin version.
   Values.addInt(IntTy, 1);
   // Data.
-  Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
-                                FatbinConstantName, 8));
+  Values.add(FatBinStr);
   // Unused in fatbin v1.
   Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
   llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
@@ -382,10 +421,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
       /*constant*/ true);
   FatbinWrapper->setSection(FatbinSectionName);
 
-  // Register binary with CUDA runtime. This is substantially different in
+  // Register binary with CUDA/HIP runtime. This is substantially different in
   // default mode vs. separate compilation!
   if (!RelocatableDeviceCode) {
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+    // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
         RegisterFatbinFunc,
         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
@@ -397,34 +436,34 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                    CGM.getPointerAlign());
 
-    // Call __cuda_register_globals(GpuBinaryHandle);
+    // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
     if (RegisterGlobalsFunc)
       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
   } else {
     // Generate a unique module ID.
-    SmallString<64> NVModuleID;
-    llvm::raw_svector_ostream OS(NVModuleID);
-    OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
-    llvm::Constant *NVModuleIDConstant =
-        makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
-
-    // Create an alias for the FatbinWrapper that nvcc will look for.
+    SmallString<64> ModuleID;
+    llvm::raw_svector_ostream OS(ModuleID);
+    OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID());
+    llvm::Constant *ModuleIDConstant =
+        makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);
+
+    // Create an alias for the FatbinWrapper that nvcc or the HIP backend will
+    // look for.
     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
-                              Twine("__fatbinwrap") + NVModuleID,
-                              FatbinWrapper);
+                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
 
-    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
     // void *, void (*)(void **))
     SmallString<128> RegisterLinkedBinaryName(
         addUnderscoredPrefixToName("RegisterLinkedBinary"));
-    RegisterLinkedBinaryName += NVModuleID;
+    RegisterLinkedBinaryName += ModuleID;
     llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
 
     assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
     llvm::Value *Args[] = {RegisterGlobalsFunc,
                            CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
-                           NVModuleIDConstant,
+                           ModuleIDConstant,
                            makeDummyFunction(getCallbackFnTy())};
     CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
   }
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 9a3ab8d4b2f..b7e86cb0436 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -146,12 +146,14 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
   Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input);
 
   for (const auto &II : Inputs) {
-    // If the current tool chain refers to an OpenMP offloading host, we should
-    // ignore inputs that refer to OpenMP offloading devices - they will be
-    // embedded according to a proper linker script.
+    // If the current tool chain refers to an OpenMP or HIP offloading host, we
+    // should ignore inputs that refer to OpenMP or HIP offloading devices -
+    // they will be embedded according to a proper linker script.
     if (auto *IA = II.getAction())
-      if (JA.isHostOffloading(Action::OFK_OpenMP) &&
-          IA->isDeviceOffloading(Action::OFK_OpenMP))
+      if ((JA.isHostOffloading(Action::OFK_OpenMP) &&
+           IA->isDeviceOffloading(Action::OFK_OpenMP)) ||
+          (JA.isHostOffloading(Action::OFK_HIP) &&
+           IA->isDeviceOffloading(Action::OFK_HIP)))
         continue;
 
     if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType()))
@@ -1288,6 +1290,124 @@ void tools::AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
   Lksf << LksBuffer;
 }
 
+/// Add HIP linker script arguments at the end of the argument list so that
+/// the fat binary is built by embedding the bundled device images into the
+/// host binary. The script also defines the __hip_fatbin symbol referenced by
+/// the generated host code so that the images can be found at runtime. This
+/// should be used only in tool chains that support linker scripts.
+void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+                               const InputInfo &Output,
+                               const InputInfoList &Inputs, const ArgList &Args,
+                               ArgStringList &CmdArgs, const JobAction &JA,
+                               const Tool &T) {
+  // If this is not a HIP host toolchain, we don't need to do anything.
+  if (!JA.isHostOffloading(Action::OFK_HIP))
+    return;
+
+  // Name the linker script after the output file. With save-temps it is kept
+  // as <output>.lk; otherwise it is a temporary file removed by the driver.
+  const char *LKS;
+  SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
+  if (C.getDriver().isSaveTempsEnabled()) {
+    llvm::sys::path::replace_extension(Name, "lk");
+    LKS = C.getArgs().MakeArgString(Name.c_str());
+  } else {
+    llvm::sys::path::replace_extension(Name, "");
+    Name = C.getDriver().GetTemporaryPath(Name, "lk");
+    LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str()));
+  }
+
+  // Add linker script option to the command.
+  CmdArgs.push_back("-T");
+  CmdArgs.push_back(LKS);
+
+  // Create a buffer to write the contents of the linker script.
+  std::string LksBuffer;
+  llvm::raw_string_ostream LksStream(LksBuffer);
+
+  // Get the HIP offload tool chain.
+  auto *HIPTC = static_cast<const toolchains::CudaToolChain *>(
+      C.getSingleOffloadToolChain<Action::OFK_HIP>());
+  assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn &&
+         "Wrong platform");
+  (void)HIPTC; // Only used in the assert; avoid unused warning with NDEBUG.
+
+  // Construct clang-offload-bundler command to bundle object files for
+  // different GPU archs.
+  ArgStringList BundlerArgs;
+  BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+  // TODO: Remove the dummy host binary entry which is required by
+  // clang-offload-bundler.
+  std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+  std::string BundlerInputArg = "-inputs=/dev/null";
+
+  for (const auto &II : Inputs) {
+    const Action *A = II.getAction();
+    // Only HIP device link outputs are bundled; each adds a target and input.
+    if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+      BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+                         StringRef(A->getOffloadingArch()).str();
+      BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+    }
+  }
+  BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+  BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+  std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+  const char *BundleFile =
+      C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
+  auto BundlerOutputArg =
+      Args.MakeArgString(std::string("-outputs=").append(BundleFile));
+  BundlerArgs.push_back(BundlerOutputArg);
+
+  SmallString<128> BundlerPath(C.getDriver().Dir);
+  llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+  const char *Bundler = Args.MakeArgString(BundlerPath);
+  C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+
+  // Emit the linker script that embeds the bundle into the .hip_fatbin section
+  // and defines __hip_fatbin at its start. The section is 16-byte aligned.
+  // This is not mandatory, but increases the likelihood of the data being
+  // aligned with a cache block on common host machines.
+  LksStream << "/*\n";
+  LksStream << "       HIP Offload Linker Script\n";
+  LksStream << " *** Automatically generated by Clang ***\n";
+  LksStream << "*/\n";
+  LksStream << "TARGET(binary)\n";
+  LksStream << "INPUT(" << BundleFileName << ")\n";
+  LksStream << "SECTIONS\n";
+  LksStream << "{\n";
+  LksStream << "  .hip_fatbin :\n";
+  LksStream << "  ALIGN(0x10)\n";
+  LksStream << "  {\n";
+  LksStream << "    PROVIDE_HIDDEN(__hip_fatbin = .);\n";
+  LksStream << "    " << BundleFileName << "\n";
+  LksStream << "  }\n";
+  LksStream << "}\n";
+  LksStream << "INSERT BEFORE .data\n";
+  LksStream.flush();
+
+  // Dump the contents of the linker script if the user requested that. We
+  // support this option to enable testing of behavior with -###.
+  if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script))
+    llvm::errs() << LksBuffer;
+
+  // If this is a dry run, do not create the linker script file.
+  if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH))
+    return;
+
+  // Open script file and write the contents.
+  std::error_code EC;
+  llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None);
+
+  if (EC) {
+    C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message();
+    return;
+  }
+
+  Lksf << LksBuffer;
+}
+
 SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args,
                                          const InputInfo &Output,
                                          const InputInfo &Input,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h
index 00bb2e4ec47..e8ebe2225e1 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.h
+++ b/clang/lib/Driver/ToolChains/CommonArgs.h
@@ -52,6 +52,12 @@ void AddOpenMPLinkerScript(const ToolChain &TC, Compilation &C,
                            llvm::opt::ArgStringList &CmdArgs,
                            const JobAction &JA);
 
+void AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
+                        const InputInfo &Output, const InputInfoList &Inputs,
+                        const llvm::opt::ArgList &Args,
+                        llvm::opt::ArgStringList &CmdArgs, const JobAction &JA,
+                        const Tool &T);
+
 const char *SplitDebugName(const llvm::opt::ArgList &Args,
                            const InputInfo &Input);
 
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 56c2a97dc4c..cc925c70276 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -535,6 +535,10 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   // Add OpenMP offloading linker script args if required.
   AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA);
 
+  // Add HIP offloading linker script args if required.
+  AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA,
+                     *this);
+
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }