diff options
Diffstat (limited to 'clang/lib/Driver/Driver.cpp')
-rw-r--r-- | clang/lib/Driver/Driver.cpp | 74 |
1 files changed, 64 insertions, 10 deletions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index c5aed55eb94..ebf2ffc3558 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final { class HIPActionBuilder final : public CudaActionBuilderBase { /// The linker inputs obtained for each device arch. SmallVector<ActionList, 8> DeviceLinkerInputs; + bool Relocatable; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) - : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {} + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP), + Relocatable(false) {} bool canUseBundlerUnbundler() const override { return true; } @@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final { phases::ID CurPhase, phases::ID FinalPhase, PhasesTy &Phases) override { // amdgcn does not support linking of object files, therefore we skip - // backend and assemble phases to output LLVM IR. - if (CudaDeviceActions.empty() || CurPhase == phases::Backend || + // backend and assemble phases to output LLVM IR. Except for generating + // non-relocatable device coee, where we generate fat binary for device + // code and pass to host in Backend phase. + if (CudaDeviceActions.empty() || + (CurPhase == phases::Backend && Relocatable) || CurPhase == phases::Assemble) return ABRT_Success; - assert((CurPhase == phases::Link || + assert(((CurPhase == phases::Link && Relocatable) || CudaDeviceActions.size() == GpuArchList.size()) && "Expecting one action per GPU architecture."); assert(!CompileHostOnly && "Not expecting CUDA actions in host-only compilation."); - // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. - // This happens to each device action originated from each input file. - // Later on, device actions in DeviceLinkerInputs are used to create - // device link actions in appendLinkDependences and the created device - // link actions are passed to the offload action as device dependence. - if (CurPhase == phases::Link) { + if (!Relocatable && CurPhase == phases::Backend) { + // If we are in backend phase, we attempt to generate the fat binary. + // We compile each arch to IR and use a link action to generate code + // object containing ISA. Then we use a special "link" action to create + // a fat binary containing all the code objects for different GPU's. + // The fat binary is then an input to the host action. + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + // Create a link action to link device IR with device library + // and generate ISA. + ActionList AL; + AL.push_back(CudaDeviceActions[I]); + CudaDeviceActions[I] = + C.MakeAction<LinkJobAction>(AL, types::TY_Image); + + // OffloadingActionBuilder propagates device arch until an offload + // action. Since the next action for creating fatbin does + // not have device arch, whereas the above link action and its input + // have device arch, an offload action is needed to stop the null + // device arch of the next action being propagated to the above link + // action. + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *ToolChains.front(), + CudaArchToString(GpuArchList[I]), AssociatedOffloadKind); + CudaDeviceActions[I] = C.MakeAction<OffloadAction>( + DDep, CudaDeviceActions[I]->getType()); + } + // Create HIP fat binary with a special "link" action. + CudaFatBinary = + C.MakeAction<LinkJobAction>(CudaDeviceActions, + types::TY_HIP_FATBIN); + + DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, + AssociatedOffloadKind); + // Clear the fat binary, it is already a dependence to an host + // action. + CudaFatBinary = nullptr; + + // Remove the CUDA actions as they are already connected to an host + // action or fat binary. + CudaDeviceActions.clear(); + + return ABRT_Success; + } else if (CurPhase == phases::Link) { + // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. + // This happens to each device action originated from each input file. + // Later on, device actions in DeviceLinkerInputs are used to create + // device link actions in appendLinkDependences and the created device + // link actions are passed to the offload action as device dependence. DeviceLinkerInputs.resize(CudaDeviceActions.size()); auto LI = DeviceLinkerInputs.begin(); for (auto *A : CudaDeviceActions) { @@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final { ++I; } } + + bool initialize() override { + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); + + return CudaActionBuilderBase::initialize(); + } }; /// OpenMP action builder. The host bitcode is passed to the device frontend |