diff options
Diffstat (limited to 'clang/lib/Driver')
-rw-r--r-- | clang/lib/Driver/Driver.cpp | 74 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains/Clang.cpp | 14 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains/CommonArgs.cpp | 48 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains/Cuda.cpp | 8 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains/HIP.cpp | 41 | ||||
-rw-r--r-- | clang/lib/Driver/ToolChains/HIP.h | 5 |
6 files changed, 136 insertions, 54 deletions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index c5aed55eb94..ebf2ffc3558 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final { class HIPActionBuilder final : public CudaActionBuilderBase { /// The linker inputs obtained for each device arch. SmallVector<ActionList, 8> DeviceLinkerInputs; + bool Relocatable; public: HIPActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) - : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {} + : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP), + Relocatable(false) {} bool canUseBundlerUnbundler() const override { return true; } @@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final { phases::ID CurPhase, phases::ID FinalPhase, PhasesTy &Phases) override { // amdgcn does not support linking of object files, therefore we skip - // backend and assemble phases to output LLVM IR. - if (CudaDeviceActions.empty() || CurPhase == phases::Backend || + // backend and assemble phases to output LLVM IR. Except for generating + // non-relocatable device coee, where we generate fat binary for device + // code and pass to host in Backend phase. + if (CudaDeviceActions.empty() || + (CurPhase == phases::Backend && Relocatable) || CurPhase == phases::Assemble) return ABRT_Success; - assert((CurPhase == phases::Link || + assert(((CurPhase == phases::Link && Relocatable) || CudaDeviceActions.size() == GpuArchList.size()) && "Expecting one action per GPU architecture."); assert(!CompileHostOnly && "Not expecting CUDA actions in host-only compilation."); - // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. - // This happens to each device action originated from each input file. - // Later on, device actions in DeviceLinkerInputs are used to create - // device link actions in appendLinkDependences and the created device - // link actions are passed to the offload action as device dependence. - if (CurPhase == phases::Link) { + if (!Relocatable && CurPhase == phases::Backend) { + // If we are in backend phase, we attempt to generate the fat binary. + // We compile each arch to IR and use a link action to generate code + // object containing ISA. Then we use a special "link" action to create + // a fat binary containing all the code objects for different GPU's. + // The fat binary is then an input to the host action. + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + // Create a link action to link device IR with device library + // and generate ISA. + ActionList AL; + AL.push_back(CudaDeviceActions[I]); + CudaDeviceActions[I] = + C.MakeAction<LinkJobAction>(AL, types::TY_Image); + + // OffloadingActionBuilder propagates device arch until an offload + // action. Since the next action for creating fatbin does + // not have device arch, whereas the above link action and its input + // have device arch, an offload action is needed to stop the null + // device arch of the next action being propagated to the above link + // action. + OffloadAction::DeviceDependences DDep; + DDep.add(*CudaDeviceActions[I], *ToolChains.front(), + CudaArchToString(GpuArchList[I]), AssociatedOffloadKind); + CudaDeviceActions[I] = C.MakeAction<OffloadAction>( + DDep, CudaDeviceActions[I]->getType()); + } + // Create HIP fat binary with a special "link" action. + CudaFatBinary = + C.MakeAction<LinkJobAction>(CudaDeviceActions, + types::TY_HIP_FATBIN); + + DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr, + AssociatedOffloadKind); + // Clear the fat binary, it is already a dependence to an host + // action. + CudaFatBinary = nullptr; + + // Remove the CUDA actions as they are already connected to an host + // action or fat binary. + CudaDeviceActions.clear(); + + return ABRT_Success; + } else if (CurPhase == phases::Link) { + // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch. + // This happens to each device action originated from each input file. + // Later on, device actions in DeviceLinkerInputs are used to create + // device link actions in appendLinkDependences and the created device + // link actions are passed to the offload action as device dependence. DeviceLinkerInputs.resize(CudaDeviceActions.size()); auto LI = DeviceLinkerInputs.begin(); for (auto *A : CudaDeviceActions) { @@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final { ++I; } } + + bool initialize() override { + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); + + return CudaActionBuilderBase::initialize(); + } }; /// OpenMP action builder. The host bitcode is passed to the device frontend diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 3ca31ab5cbd..070abd82508 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4920,16 +4920,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString(Flags)); } - if (IsCuda) { - // Host-side cuda compilation receives all device-side outputs in a single - // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary. - if (CudaDeviceInput) { + // Host-side cuda compilation receives all device-side outputs in a single + // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary. + if ((IsCuda || IsHIP) && CudaDeviceInput) { CmdArgs.push_back("-fcuda-include-gpubinary"); CmdArgs.push_back(CudaDeviceInput->getFilename()); - } + if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) + CmdArgs.push_back("-fgpu-rdc"); + } - if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false)) - CmdArgs.push_back("-fcuda-rdc"); + if (IsCuda) { if (Args.hasFlag(options::OPT_fcuda_short_ptr, options::OPT_fno_cuda_short_ptr, false)) CmdArgs.push_back("-fcuda-short-ptr"); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 9e6aeea9581..355e8d4c7ba 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -15,6 +15,7 @@ #include "Arch/SystemZ.h" #include "Arch/X86.h" #include "Hexagon.h" +#include "HIP.h" #include "InputInfo.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/LangOptions.h" @@ -1337,6 +1338,18 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C, if (!JA.isHostOffloading(Action::OFK_HIP)) return; + InputInfoList DeviceInputs; + for (const auto &II : Inputs) { + const Action *A = II.getAction(); + // Is this a device linking action? + if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) { + DeviceInputs.push_back(II); + } + } + + if (DeviceInputs.empty()) + return; + // Create temporary linker script. Keep it if save-temps is enabled. const char *LKS; SmallString<256> Name = llvm::sys::path::filename(Output.getFilename()); @@ -1364,39 +1377,12 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C, "Wrong platform"); (void)HIPTC; - // Construct clang-offload-bundler command to bundle object files for - // for different GPU archs. - ArgStringList BundlerArgs; - BundlerArgs.push_back(Args.MakeArgString("-type=o")); - - // ToDo: Remove the dummy host binary entry which is required by - // clang-offload-bundler. - std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux"; - std::string BundlerInputArg = "-inputs=/dev/null"; - - for (const auto &II : Inputs) { - const Action *A = II.getAction(); - // Is this a device linking action? - if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) { - BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" + - StringRef(A->getOffloadingArch()).str(); - BundlerInputArg = BundlerInputArg + "," + II.getFilename(); - } - } - BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg)); - BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg)); - - std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o"); + // The output file name needs to persist through the compilation, therefore + // it needs to be created through MakeArgString. + std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb"); const char *BundleFile = C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str())); - auto BundlerOutputArg = - Args.MakeArgString(std::string("-outputs=").append(BundleFile)); - BundlerArgs.push_back(BundlerOutputArg); - - SmallString<128> BundlerPath(C.getDriver().Dir); - llvm::sys::path::append(BundlerPath, "clang-offload-bundler"); - const char *Bundler = Args.MakeArgString(BundlerPath); - C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs)); + AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T); // Add commands to embed target binaries. We ensure that each section and // image is 16-byte aligned. This is not mandatory, but increases the diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 08e49fa6a56..ddcb00809ea 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -398,8 +398,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fnoopenmp_relocatable_target, /*Default=*/true); else if (JA.isOffloading(Action::OFK_Cuda)) - Relocatable = Args.hasFlag(options::OPT_fcuda_rdc, - options::OPT_fno_cuda_rdc, /*Default=*/false); + Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, + options::OPT_fno_gpu_rdc, /*Default=*/false); if (Relocatable) CmdArgs.push_back("-c"); @@ -609,9 +609,9 @@ void CudaToolChain::addClangTargetOptions( options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); - if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) - CC1Args.push_back("-fcuda-rdc"); + CC1Args.push_back("-fgpu-rdc"); } if (DriverArgs.hasArg(options::OPT_nocudalib)) diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 6efcfaee8fd..58e8e79420d 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -184,6 +184,40 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs)); } +// Construct a clang-offload-bundler command to bundle code objects for +// different GPU's into a HIP fat binary. +void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, + StringRef OutputFileName, const InputInfoList &Inputs, + const llvm::opt::ArgList &Args, const Tool& T) { + // Construct clang-offload-bundler command to bundle object files for + // for different GPU archs. + ArgStringList BundlerArgs; + BundlerArgs.push_back(Args.MakeArgString("-type=o")); + + // ToDo: Remove the dummy host binary entry which is required by + // clang-offload-bundler. + std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux"; + std::string BundlerInputArg = "-inputs=/dev/null"; + + for (const auto &II : Inputs) { + const auto* A = II.getAction(); + BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" + + StringRef(A->getOffloadingArch()).str(); + BundlerInputArg = BundlerInputArg + "," + II.getFilename(); + } + BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg)); + BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg)); + + auto BundlerOutputArg = + Args.MakeArgString(std::string("-outputs=").append(OutputFileName)); + BundlerArgs.push_back(BundlerOutputArg); + + SmallString<128> BundlerPath(C.getDriver().Dir); + llvm::sys::path::append(BundlerPath, "clang-offload-bundler"); + const char *Bundler = Args.MakeArgString(BundlerPath); + C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs)); +} + // For amdgcn the inputs of the linker job are device bitcode and output is // object file. It calls llvm-link, opt, llc, then lld steps. void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -192,6 +226,9 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, const ArgList &Args, const char *LinkingOutput) const { + if (JA.getType() == types::TY_HIP_FATBIN) + return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this); + assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn && "Unsupported target"); @@ -244,9 +281,9 @@ void HIPToolChain::addClangTargetOptions( options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); - if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, + if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false)) - CC1Args.push_back("-fcuda-rdc"); + CC1Args.push_back("-fgpu-rdc"); // Default to "hidden" visibility, as object level linking will not be // supported for the foreseeable future. diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h index 40c9128e2f5..3af19d44dae 100644 --- a/clang/lib/Driver/ToolChains/HIP.h +++ b/clang/lib/Driver/ToolChains/HIP.h @@ -19,6 +19,11 @@ namespace driver { namespace tools { namespace AMDGCN { + // Construct command for creating HIP fatbin. + void constructHIPFatbinCommand(Compilation &C, const JobAction &JA, + StringRef OutputFileName, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, const Tool& T); + // Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with // device library, then compiles it to ISA in a shared object. class LLVM_LIBRARY_VISIBILITY Linker : public Tool { |