summaryrefslogtreecommitdiffstats
path: root/clang/lib/Driver
diff options
context:
space:
mode:
Diffstat (limited to 'clang/lib/Driver')
-rw-r--r--clang/lib/Driver/Driver.cpp74
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp14
-rw-r--r--clang/lib/Driver/ToolChains/CommonArgs.cpp48
-rw-r--r--clang/lib/Driver/ToolChains/Cuda.cpp8
-rw-r--r--clang/lib/Driver/ToolChains/HIP.cpp41
-rw-r--r--clang/lib/Driver/ToolChains/HIP.h5
6 files changed, 136 insertions, 54 deletions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index c5aed55eb94..ebf2ffc3558 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final {
class HIPActionBuilder final : public CudaActionBuilderBase {
/// The linker inputs obtained for each device arch.
SmallVector<ActionList, 8> DeviceLinkerInputs;
+ bool Relocatable;
public:
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
- : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+ : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+ Relocatable(false) {}
bool canUseBundlerUnbundler() const override { return true; }
@@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final {
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
// amdgcn does not support linking of object files, therefore we skip
- // backend and assemble phases to output LLVM IR.
- if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+ // backend and assemble phases to output LLVM IR. Except for generating
+ // non-relocatable device coee, where we generate fat binary for device
+ // code and pass to host in Backend phase.
+ if (CudaDeviceActions.empty() ||
+ (CurPhase == phases::Backend && Relocatable) ||
CurPhase == phases::Assemble)
return ABRT_Success;
- assert((CurPhase == phases::Link ||
+ assert(((CurPhase == phases::Link && Relocatable) ||
CudaDeviceActions.size() == GpuArchList.size()) &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");
- // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
- // This happens to each device action originated from each input file.
- // Later on, device actions in DeviceLinkerInputs are used to create
- // device link actions in appendLinkDependences and the created device
- // link actions are passed to the offload action as device dependence.
- if (CurPhase == phases::Link) {
+ if (!Relocatable && CurPhase == phases::Backend) {
+ // If we are in backend phase, we attempt to generate the fat binary.
+ // We compile each arch to IR and use a link action to generate code
+ // object containing ISA. Then we use a special "link" action to create
+ // a fat binary containing all the code objects for different GPU's.
+ // The fat binary is then an input to the host action.
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ // Create a link action to link device IR with device library
+ // and generate ISA.
+ ActionList AL;
+ AL.push_back(CudaDeviceActions[I]);
+ CudaDeviceActions[I] =
+ C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+ // OffloadingActionBuilder propagates device arch until an offload
+ // action. Since the next action for creating fatbin does
+ // not have device arch, whereas the above link action and its input
+ // have device arch, an offload action is needed to stop the null
+ // device arch of the next action being propagated to the above link
+ // action.
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+ CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+ CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+ DDep, CudaDeviceActions[I]->getType());
+ }
+ // Create HIP fat binary with a special "link" action.
+ CudaFatBinary =
+ C.MakeAction<LinkJobAction>(CudaDeviceActions,
+ types::TY_HIP_FATBIN);
+
+ DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+ AssociatedOffloadKind);
+ // Clear the fat binary, it is already a dependence to an host
+ // action.
+ CudaFatBinary = nullptr;
+
+ // Remove the CUDA actions as they are already connected to an host
+ // action or fat binary.
+ CudaDeviceActions.clear();
+
+ return ABRT_Success;
+ } else if (CurPhase == phases::Link) {
+ // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+ // This happens to each device action originated from each input file.
+ // Later on, device actions in DeviceLinkerInputs are used to create
+ // device link actions in appendLinkDependences and the created device
+ // link actions are passed to the offload action as device dependence.
DeviceLinkerInputs.resize(CudaDeviceActions.size());
auto LI = DeviceLinkerInputs.begin();
for (auto *A : CudaDeviceActions) {
@@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final {
++I;
}
}
+
+ bool initialize() override {
+ Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+ options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+ return CudaActionBuilderBase::initialize();
+ }
};
/// OpenMP action builder. The host bitcode is passed to the device frontend
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 3ca31ab5cbd..070abd82508 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4920,16 +4920,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back(Args.MakeArgString(Flags));
}
- if (IsCuda) {
- // Host-side cuda compilation receives all device-side outputs in a single
- // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
- if (CudaDeviceInput) {
+ // Host-side cuda compilation receives all device-side outputs in a single
+ // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
+ if ((IsCuda || IsHIP) && CudaDeviceInput) {
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
- }
+ if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ CmdArgs.push_back("-fgpu-rdc");
+ }
- if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
- CmdArgs.push_back("-fcuda-rdc");
+ if (IsCuda) {
if (Args.hasFlag(options::OPT_fcuda_short_ptr,
options::OPT_fno_cuda_short_ptr, false))
CmdArgs.push_back("-fcuda-short-ptr");
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 9e6aeea9581..355e8d4c7ba 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -15,6 +15,7 @@
#include "Arch/SystemZ.h"
#include "Arch/X86.h"
#include "Hexagon.h"
+#include "HIP.h"
#include "InputInfo.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/LangOptions.h"
@@ -1337,6 +1338,18 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
if (!JA.isHostOffloading(Action::OFK_HIP))
return;
+ InputInfoList DeviceInputs;
+ for (const auto &II : Inputs) {
+ const Action *A = II.getAction();
+ // Is this a device linking action?
+ if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
+ DeviceInputs.push_back(II);
+ }
+ }
+
+ if (DeviceInputs.empty())
+ return;
+
// Create temporary linker script. Keep it if save-temps is enabled.
const char *LKS;
SmallString<256> Name = llvm::sys::path::filename(Output.getFilename());
@@ -1364,39 +1377,12 @@ void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C,
"Wrong platform");
(void)HIPTC;
- // Construct clang-offload-bundler command to bundle object files for
- // for different GPU archs.
- ArgStringList BundlerArgs;
- BundlerArgs.push_back(Args.MakeArgString("-type=o"));
-
- // ToDo: Remove the dummy host binary entry which is required by
- // clang-offload-bundler.
- std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
- std::string BundlerInputArg = "-inputs=/dev/null";
-
- for (const auto &II : Inputs) {
- const Action *A = II.getAction();
- // Is this a device linking action?
- if (A && isa<LinkJobAction>(A) && A->isDeviceOffloading(Action::OFK_HIP)) {
- BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
- StringRef(A->getOffloadingArch()).str();
- BundlerInputArg = BundlerInputArg + "," + II.getFilename();
- }
- }
- BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
- BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
-
- std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o");
+ // The output file name needs to persist through the compilation, therefore
+ // it needs to be created through MakeArgString.
+ std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "hipfb");
const char *BundleFile =
C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str()));
- auto BundlerOutputArg =
- Args.MakeArgString(std::string("-outputs=").append(BundleFile));
- BundlerArgs.push_back(BundlerOutputArg);
-
- SmallString<128> BundlerPath(C.getDriver().Dir);
- llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
- const char *Bundler = Args.MakeArgString(BundlerPath);
- C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+ AMDGCN::constructHIPFatbinCommand(C, JA, BundleFile, DeviceInputs, Args, T);
// Add commands to embed target binaries. We ensure that each section and
// image is 16-byte aligned. This is not mandatory, but increases the
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 08e49fa6a56..ddcb00809ea 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -398,8 +398,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fnoopenmp_relocatable_target,
/*Default=*/true);
else if (JA.isOffloading(Action::OFK_Cuda))
- Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
- options::OPT_fno_cuda_rdc, /*Default=*/false);
+ Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+ options::OPT_fno_gpu_rdc, /*Default=*/false);
if (Relocatable)
CmdArgs.push_back("-c");
@@ -609,9 +609,9 @@ void CudaToolChain::addClangTargetOptions(
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
- if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+ if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
- CC1Args.push_back("-fcuda-rdc");
+ CC1Args.push_back("-fgpu-rdc");
}
if (DriverArgs.hasArg(options::OPT_nocudalib))
diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp
index 6efcfaee8fd..58e8e79420d 100644
--- a/clang/lib/Driver/ToolChains/HIP.cpp
+++ b/clang/lib/Driver/ToolChains/HIP.cpp
@@ -184,6 +184,40 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
C.addCommand(llvm::make_unique<Command>(JA, *this, Lld, LldArgs, Inputs));
}
+// Construct a clang-offload-bundler command to bundle code objects for
+// different GPU's into a HIP fat binary.
+void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+ StringRef OutputFileName, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &Args, const Tool& T) {
+ // Construct clang-offload-bundler command to bundle object files for
+ // for different GPU archs.
+ ArgStringList BundlerArgs;
+ BundlerArgs.push_back(Args.MakeArgString("-type=o"));
+
+ // ToDo: Remove the dummy host binary entry which is required by
+ // clang-offload-bundler.
+ std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux";
+ std::string BundlerInputArg = "-inputs=/dev/null";
+
+ for (const auto &II : Inputs) {
+ const auto* A = II.getAction();
+ BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" +
+ StringRef(A->getOffloadingArch()).str();
+ BundlerInputArg = BundlerInputArg + "," + II.getFilename();
+ }
+ BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg));
+ BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg));
+
+ auto BundlerOutputArg =
+ Args.MakeArgString(std::string("-outputs=").append(OutputFileName));
+ BundlerArgs.push_back(BundlerOutputArg);
+
+ SmallString<128> BundlerPath(C.getDriver().Dir);
+ llvm::sys::path::append(BundlerPath, "clang-offload-bundler");
+ const char *Bundler = Args.MakeArgString(BundlerPath);
+ C.addCommand(llvm::make_unique<Command>(JA, T, Bundler, BundlerArgs, Inputs));
+}
+
// For amdgcn the inputs of the linker job are device bitcode and output is
// object file. It calls llvm-link, opt, llc, then lld steps.
void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
@@ -192,6 +226,9 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA,
const ArgList &Args,
const char *LinkingOutput) const {
+ if (JA.getType() == types::TY_HIP_FATBIN)
+ return constructHIPFatbinCommand(C, JA, Output.getFilename(), Inputs, Args, *this);
+
assert(getToolChain().getTriple().getArch() == llvm::Triple::amdgcn &&
"Unsupported target");
@@ -244,9 +281,9 @@ void HIPToolChain::addClangTargetOptions(
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
- if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+ if (DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
false))
- CC1Args.push_back("-fcuda-rdc");
+ CC1Args.push_back("-fgpu-rdc");
// Default to "hidden" visibility, as object level linking will not be
// supported for the foreseeable future.
diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h
index 40c9128e2f5..3af19d44dae 100644
--- a/clang/lib/Driver/ToolChains/HIP.h
+++ b/clang/lib/Driver/ToolChains/HIP.h
@@ -19,6 +19,11 @@ namespace driver {
namespace tools {
namespace AMDGCN {
+ // Construct command for creating HIP fatbin.
+ void constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
+ StringRef OutputFileName, const InputInfoList &Inputs,
+ const llvm::opt::ArgList &TCArgs, const Tool& T);
+
// Runs llvm-link/opt/llc/lld, which links multiple LLVM bitcode, together with
// device library, then compiles it to ISA in a shared object.
class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
OpenPOWER on IntegriCloud