Diffstat (limited to 'clang/lib/Driver/Driver.cpp')
-rw-r--r--  clang/lib/Driver/Driver.cpp  74
1 file changed, 64 insertions(+), 10 deletions(-)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index c5aed55eb94..ebf2ffc3558 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2486,11 +2486,13 @@ class OffloadingActionBuilder final {
class HIPActionBuilder final : public CudaActionBuilderBase {
/// The linker inputs obtained for each device arch.
SmallVector<ActionList, 8> DeviceLinkerInputs;
+ bool Relocatable;
public:
HIPActionBuilder(Compilation &C, DerivedArgList &Args,
const Driver::InputList &Inputs)
- : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP) {}
+ : CudaActionBuilderBase(C, Args, Inputs, Action::OFK_HIP),
+ Relocatable(false) {}
bool canUseBundlerUnbundler() const override { return true; }
@@ -2499,23 +2501,68 @@ class OffloadingActionBuilder final {
phases::ID CurPhase, phases::ID FinalPhase,
PhasesTy &Phases) override {
// amdgcn does not support linking of object files, therefore we skip
- // backend and assemble phases to output LLVM IR.
- if (CudaDeviceActions.empty() || CurPhase == phases::Backend ||
+ // backend and assemble phases to output LLVM IR. The exception is when
+ // generating non-relocatable device code, where we generate a fat binary
+ // for the device code and pass it to the host in the Backend phase.
+ if (CudaDeviceActions.empty() ||
+ (CurPhase == phases::Backend && Relocatable) ||
CurPhase == phases::Assemble)
return ABRT_Success;
- assert((CurPhase == phases::Link ||
+ assert(((CurPhase == phases::Link && Relocatable) ||
CudaDeviceActions.size() == GpuArchList.size()) &&
"Expecting one action per GPU architecture.");
assert(!CompileHostOnly &&
"Not expecting CUDA actions in host-only compilation.");
- // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
- // This happens to each device action originated from each input file.
- // Later on, device actions in DeviceLinkerInputs are used to create
- // device link actions in appendLinkDependences and the created device
- // link actions are passed to the offload action as device dependence.
- if (CurPhase == phases::Link) {
+ if (!Relocatable && CurPhase == phases::Backend) {
+ // If we are in the backend phase, we attempt to generate the fat binary.
+ // We compile each arch to IR and use a link action to generate a code
+ // object containing ISA. Then we use a special "link" action to create
+ // a fat binary containing all the code objects for the different GPUs.
+ // The fat binary is then an input to the host action.
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ // Create a link action to link device IR with device library
+ // and generate ISA.
+ ActionList AL;
+ AL.push_back(CudaDeviceActions[I]);
+ CudaDeviceActions[I] =
+ C.MakeAction<LinkJobAction>(AL, types::TY_Image);
+
+ // OffloadingActionBuilder propagates the device arch until an offload
+ // action. Since the next action for creating the fatbin does not have
+ // a device arch, whereas the above link action and its input do, an
+ // offload action is needed to stop the null device arch of the next
+ // action from being propagated to the above link action.
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*CudaDeviceActions[I], *ToolChains.front(),
+ CudaArchToString(GpuArchList[I]), AssociatedOffloadKind);
+ CudaDeviceActions[I] = C.MakeAction<OffloadAction>(
+ DDep, CudaDeviceActions[I]->getType());
+ }
+ // Create HIP fat binary with a special "link" action.
+ CudaFatBinary =
+ C.MakeAction<LinkJobAction>(CudaDeviceActions,
+ types::TY_HIP_FATBIN);
+
+ DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+ AssociatedOffloadKind);
+ // Clear the fat binary; it is already a dependence of a host
+ // action.
+ CudaFatBinary = nullptr;
+
+ // Remove the CUDA actions as they are already connected to a host
+ // action or the fat binary.
+ CudaDeviceActions.clear();
+
+ return ABRT_Success;
+ } else if (CurPhase == phases::Link) {
+ // Save CudaDeviceActions to DeviceLinkerInputs for each GPU subarch.
+ // This happens for each device action originating from each input file.
+ // Later on, device actions in DeviceLinkerInputs are used to create
+ // device link actions in appendLinkDependences, and the created device
+ // link actions are passed to the offload action as device dependences.
DeviceLinkerInputs.resize(CudaDeviceActions.size());
auto LI = DeviceLinkerInputs.begin();
for (auto *A : CudaDeviceActions) {
@@ -2548,6 +2595,13 @@ class OffloadingActionBuilder final {
++I;
}
}
+
+ bool initialize() override {
+ Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
+ options::OPT_fno_gpu_rdc, /*Default=*/false);
+
+ return CudaActionBuilderBase::initialize();
+ }
};
/// OpenMP action builder. The host bitcode is passed to the device frontend
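
Note on the mode switch: the only new state in the builder is the Relocatable flag, read in the initialize() override from -fgpu-rdc / -fno-gpu-rdc (default: off). With the default, device code generation happens in the Backend phase and a fat binary is handed to the host compilation; with -fgpu-rdc, the builder keeps the previous behaviour of deferring device code to the link step. As a rough usage sketch (hypothetical file name; assuming a HIP-capable clang), the selected pipeline can be inspected with the driver's phase dump:

  clang -x hip foo.hip -fno-gpu-rdc -ccc-print-phases   # fat binary created in the Backend phase
  clang -x hip foo.hip -fgpu-rdc    -ccc-print-phases   # device code objects deferred to the link step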
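
For readers following the action graph, below is a minimal, self-contained sketch (stand-in types only, not the real clang driver Action classes) of the shape built by the Backend-phase hunk above in the non-relocatable case: per-arch device IR is linked into an ISA code object, wrapped in an offload node pinned to its arch, and all code objects feed one fat-binary "link" node that becomes a device dependence of the host action.

// Illustrative sketch only: minimal stand-in types, not the real clang
// driver classes. It mirrors the shape of the action graph the Backend-phase
// hunk builds when -fno-gpu-rdc (the default) is in effect.
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Node {
  std::string Kind;                          // e.g. "device-ir", "fatbin-link"
  std::string Arch;                          // bound GPU arch, empty if none
  std::vector<std::shared_ptr<Node>> Inputs; // dependences of this action
};
using NodePtr = std::shared_ptr<Node>;

static NodePtr make(std::string Kind, std::string Arch,
                    std::vector<NodePtr> Inputs = {}) {
  return std::make_shared<Node>(
      Node{std::move(Kind), std::move(Arch), std::move(Inputs)});
}

static void dump(const NodePtr &N, int Depth) {
  std::cout << std::string(Depth * 2, ' ') << N->Kind
            << (N->Arch.empty() ? "" : " [" + N->Arch + "]") << "\n";
  for (const auto &In : N->Inputs)
    dump(In, Depth + 1);
}

int main() {
  // One device IR action per GPU arch (what CudaDeviceActions holds).
  // The arch names are placeholders for whatever the user requested.
  const std::vector<std::string> Archs = {"gfx803", "gfx900"};
  std::vector<NodePtr> CodeObjects;
  for (const auto &A : Archs) {
    NodePtr IR = make("device-ir", A);
    // Link device IR (plus device libraries) into an ISA code object.
    NodePtr Linked = make("link(ISA)", A, {IR});
    // Wrap in an offload node so the arch stops propagating to later actions.
    CodeObjects.push_back(make("offload", A, {Linked}));
  }
  // A single special "link" bundles all code objects into the HIP fat binary,
  // which is then handed to the host compilation as a device dependence.
  NodePtr FatBin = make("fatbin-link", /*Arch=*/"", CodeObjects);
  NodePtr Host = make("host-compile", /*Arch=*/"", {FatBin});
  dump(Host, 0);
  return 0;
}

The offload wrapper in the sketch plays the role of the DDep.add(...) call in the hunk: its only job is to keep the bound GPU arch from leaking into the arch-less fat-binary node.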