[HIP] Support early finalization of device code for -fno-gpu-rdc

This patch renames -f{no-}cuda-rdc to -f{no-}gpu-rdc and keeps the original options as aliases. When -fgpu-rdc is off, clang will assume the device code in each translation unit does not call external functions except those in the device library; therefore it is possible to compile the device code in each translation unit to self-contained kernels and embed them in the host object, so that the host object behaves like a usual host object which can be linked by lld. The benefits of this feature are: 1. it allows users to create static libraries which can be linked by the host linker; 2. it amortizes device code linking time. This patch modifies the HIP action builder to insert actions for linking device code and generating the HIP fatbin, and passes the HIP fatbin to the host backend action. It extracts the code for constructing the command for generating the HIP fatbin into a function so that it can be reused by early finalization. It also modifies codegen of HIP host constructor functions to embed the device fatbin when it is available. Differential Revision: https://reviews.llvm.org/D52377 llvm-svn: 343611
author: Yaxun Liu <Yaxun.Liu@amd.com> 2018-10-02 17:48:54 +0000
committer: Yaxun Liu <Yaxun.Liu@amd.com> 2018-10-02 17:48:54 +0000
commit: 9767089d003b52ec9d32b93c8533c815c9906902 (patch)
tree: e9788366c4929488c5cc4c31054fa4ebc749b0f9 /clang/lib/Driver/Driver.cpp
parent: 2b5259afb3ce81334fab0f4d7a741b85f465caf7 (diff)
download: bcm5719-llvm-9767089d003b52ec9d32b93c8533c815c9906902.tar.gz
bcm5719-llvm-9767089d003b52ec9d32b93c8533c815c9906902.zip
1 files changed, 64 insertions, 10 deletions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index c5aed55eb94..ebf2ffc3558 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final {
   class HIPActionBuilder final : public CudaActionBuilderBase {
     /// The linker inputs obtained for each device arch.
     SmallVector<ActionList, 8> DeviceLinkerInputs;
+    bool Relocatable;
 
   public:
     HIPActionBuilder(Compilation &C, DerivedArgList &Args,
                      const Driver::InputList &Inputs)
-        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+        : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+          Relocatable(false) {}
 
     bool canUseBundlerUnbundler() const override { return true; }
 
@@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final {
                          phases::ID CurPhase, phases::ID FinalPhase,
                          PhasesTy &Phases) override {
       // amdgcn does not support linking of object files, therefore we skip
-      // backend and assemble phases to output LLVM IR.
-      if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+      // backend and assemble phases to output LLVM IR. Except for generating
+      // non-relocatable device code, where we generate fat binary for device
+      // code and pass to host in Backend phase.
+      if (CudaDeviceActions.empty() ||
+          (CurPhase == phases::Backend && Relocatable) ||
           CurPhase == phases::Assemble)
         return ABRT_Success;
 
-      assert((CurPhase == phases::Link ||
+      assert(((CurPhase == phases::Link && Relocatable) ||
               CudaDeviceActions.size() == GpuArchList.size()) &&
              "Expecting one action per GPU architecture.");
       assert(!CompileHostOnly &&
              "Not expecting CUDA actions in host-only compilation.");
 
-      // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
-      // This happens to each device action originated from each input file.
-      // Later on, device actions in DeviceLinkerInputs are used to create
-      // device link actions in appendLinkDependences and the created device
-      // link actions are passed to the offload action as device dependence.
-      if (CurPhase == phases::Link) {
+      if (!Relocatable && CurPhase == phases::Backend) {
+        // If we are in backend phase, we attempt to generate the fat binary.
+        // We compile each arch to IR and use a link action to generate code
+        // object containing ISA. Then we use a special "link" action to create
+        // a fat binary containing all the code objects for different GPU's.
+        // The fat binary is then an input to the host action.
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Create a link action to link device IR with device library
+          // and generate ISA.
+          ActionList AL;
+          AL.push_back(CudaDeviceActions[I]);
+          CudaDeviceActions[I] =
+              C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+          // OffloadingActionBuilder propagates device arch until an offload
+          // action. Since the next action for creating fatbin does
+          // not have device arch, whereas the above link action and its input
+          // have device arch, an offload action is needed to stop the null
+          // device arch of the next action being propagated to the above link
+          // action.
+          OffloadAction::DeviceDependences DDep;
+          DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+                   CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+          CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+              DDep, CudaDeviceActions[I]->getType());
+        }
+        // Create HIP fat binary with a special "link" action.
+        CudaFatBinary =
+            C.MakeAction<LinkJobAction>(CudaDeviceActions,
+                types::TY_HIP_FATBIN);
+
+        DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+               AssociatedOffloadKind);
+        // Clear the fat binary, it is already a dependence to a host
+        // action.
+        CudaFatBinary = nullptr;
+
+        // Remove the CUDA actions as they are already connected to a host
+        // action or fat binary.
+        CudaDeviceActions.clear();
+
+        return ABRT_Success;
+      } else if (CurPhase == phases::Link) {
+        // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+        // This happens to each device action originated from each input file.
+        // Later on, device actions in DeviceLinkerInputs are used to create
+        // device link actions in appendLinkDependences and the created device
+        // link actions are passed to the offload action as device dependence.
         DeviceLinkerInputs.resize(CudaDeviceActions.size());
         auto LI = DeviceLinkerInputs.begin();
         for (auto *A : CudaDeviceActions) {
@@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final {
         ++I;
       }
     }
+
+    bool initialize() override {
+      Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+          options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+      return CudaActionBuilderBase::initialize();
+    }
   };
 
   /// OpenMP action builder. The host bitcode is passed to the device frontend
author	Yaxun Liu <Yaxun.Liu@amd.com>	2018-10-02 17:48:54 +0000
committer	Yaxun Liu <Yaxun.Liu@amd.com>	2018-10-02 17:48:54 +0000
commit	9767089d003b52ec9d32b93c8533c815c9906902 (patch)
tree	e9788366c4929488c5cc4c31054fa4ebc749b0f9 /clang/lib/Driver/Driver.cpp
parent	2b5259afb3ce81334fab0f4d7a741b85f465caf7 (diff)
download	bcm5719-llvm-9767089d003b52ec9d32b93c8533c815c9906902.tar.gz bcm5719-llvm-9767089d003b52ec9d32b93c8533c815c9906902.zip