summaryrefslogtreecommitdiffstats
path: root/clang/lib/Driver/Driver.cpp
diff options
context:
space:
mode:
authorJustin Lebar <jlebar@google.com>2016-01-14 21:41:27 +0000
committerJustin Lebar <jlebar@google.com>2016-01-14 21:41:27 +0000
commit21e5d4fcfa4c8a2a198a968af5b7db1884b801b9 (patch)
tree678dba0dd9353eac0609f6eaaf55c43dbb6fc3cc /clang/lib/Driver/Driver.cpp
parentb44f6fed4da5bc1d2983887f6ac933dd90f5c854 (diff)
downloadbcm5719-llvm-21e5d4fcfa4c8a2a198a968af5b7db1884b801b9.tar.gz
bcm5719-llvm-21e5d4fcfa4c8a2a198a968af5b7db1884b801b9.zip
[CUDA] Invoke ptxas and fatbinary during compilation.
Summary: Previously we compiled CUDA device code to PTX assembly and embedded that asm as text in our host binary. Now we compile to PTX assembly and then invoke ptxas to assemble the PTX into a cubin file. We gather the ptx and cubin files for each of our --cuda-gpu-archs and combine them using fatbinary, and then embed that into the host binary. Adds two new command-line flags, -Xcuda_ptxas and -Xcuda_fatbinary, which pass args down to the external tools. Reviewers: tra, echristo Subscribers: cfe-commits, jhen Differential Revision: http://reviews.llvm.org/D16082 llvm-svn: 257809
Diffstat (limited to 'clang/lib/Driver/Driver.cpp')
-rw-r--r--clang/lib/Driver/Driver.cpp52
1 files changed, 36 insertions, 16 deletions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index e9a37c6aa81..0cdfb4fe105 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -949,8 +949,9 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
os << '"' << BIA->getArchName() << '"' << ", {"
<< PrintActions1(C, *BIA->begin(), Ids) << "}";
} else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
- os << '"' << CDA->getGpuArchName() << '"' << ", {"
- << PrintActions1(C, *CDA->begin(), Ids) << "}";
+ os << '"'
+ << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+ << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
} else {
const ActionList *AL;
if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
@@ -1327,7 +1328,7 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
// Check whether any of device actions stopped before they could generate PTX.
bool PartialCompilation =
llvm::any_of(CudaDeviceActions, [](const Action *a) {
- return a->getKind() != Action::BackendJobClass;
+ return a->getKind() != Action::AssembleJobClass;
});
// Figure out what to do with device actions -- pass them as inputs to the
@@ -1356,16 +1357,32 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
return HostAction;
}
- // Outputs of device actions during complete CUDA compilation get created
- // with AtTopLevel=false and become inputs for the host action.
+ // If we're not a partial or device-only compilation, we compile each arch to
+ // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+ // "link" action, which uses fatbinary to combine these cubins into one
+ // fatbin. The fatbin is then an input to the host compilation.
ActionList DeviceActions;
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- DeviceActions.push_back(
- C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
- /* AtTopLevel */ false));
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ Action* AssembleAction = CudaDeviceActions[I];
+ assert(AssembleAction->getType() == types::TY_Object);
+ assert(AssembleAction->getInputs().size() == 1);
+
+ Action* BackendAction = AssembleAction->getInputs()[0];
+ assert(BackendAction->getType() == types::TY_PP_Asm);
+
+ for (const auto& A : {AssembleAction, BackendAction}) {
+ DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+ A, GpuArchList[I], /* AtTopLevel */ false));
+ }
+ }
+ auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+ /* GpuArchName = */ nullptr,
+ /* AtTopLevel = */ false);
// Return a new host action that incorporates original host action and all
// device actions.
- return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+ return C.MakeAction<CudaHostAction>(std::move(HostAction),
+ ActionList({FatbinAction}));
}
void Driver::BuildActions(Compilation &C, const ToolChain &TC,
@@ -1600,7 +1617,7 @@ Action *Driver::ConstructPhaseAction(Compilation &C, const ToolChain &TC,
return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
}
case phases::Assemble:
- return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+ return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object);
}
llvm_unreachable("invalid phase in ConstructPhaseAction");
@@ -1849,11 +1866,14 @@ InputInfo Driver::BuildJobsForActionNoCache(
if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
// Initial processing of CudaDeviceAction carries host params.
// Call BuildJobsForAction() again, now with correct device parameters.
- assert(CDA->getGpuArchName() && "No GPU name in device action.");
- return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
- CDA->getGpuArchName(), CDA->isAtTopLevel(),
- /*MultipleArchs*/ true, LinkingOutput,
- CachedResults);
+ InputInfo II = BuildJobsForAction(
+ C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+ CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+ CachedResults);
+ // Currently II's Action is *CDA->begin(). Set it to CDA instead, so that
+ // one can retrieve II's GPU arch.
+ II.setAction(A);
+ return II;
}
const ActionList *Inputs = &A->getInputs();
OpenPOWER on IntegriCloud