Diffstat (limited to 'clang')
-rw-r--r--  clang/include/clang/Driver/Action.h                      | 190
-rw-r--r--  clang/include/clang/Driver/Compilation.h                 |  11
-rw-r--r--  clang/include/clang/Driver/Driver.h                      |  27
-rw-r--r--  clang/lib/Driver/Action.cpp                              | 217
-rw-r--r--  clang/lib/Driver/Driver.cpp                              | 370
-rw-r--r--  clang/lib/Driver/ToolChain.cpp                           |   3
-rw-r--r--  clang/lib/Driver/Tools.cpp                               | 110
-rw-r--r--  clang/lib/Driver/Tools.h                                 |   3
-rw-r--r--  clang/lib/Frontend/CreateInvocationFromCommandLine.cpp   |  16
-rw-r--r--  clang/test/Driver/cuda_phases.cu                         | 206
10 files changed, 933 insertions, 220 deletions
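This patch replaces the CUDA-specific driver actions (CudaDeviceAction, CudaHostAction) with a single generic OffloadAction that carries explicit host and device dependence lists. As a rough sketch of the new API introduced below (HostAction, DevAction, and CudaTC are illustrative placeholders, not names from the patch):

    // Minimal sketch: assumes a Compilation C with registered host and CUDA
    // offload toolchains, plus previously built HostAction and DevAction.
    OffloadAction::HostDependence HDep(
        *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
        /*BoundArch=*/nullptr, Action::OFK_Cuda);
    OffloadAction::DeviceDependences DDep;
    DDep.add(*DevAction, *CudaTC, /*BoundArch=*/"sm_35", Action::OFK_Cuda);
    Action *OA = C.MakeAction<OffloadAction>(HDep, DDep);

The same pattern appears in buildCudaActions further down in this diff.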
diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h
index c54a2a4195a..0c38b3af886 100644
--- a/clang/include/clang/Driver/Action.h
+++ b/clang/include/clang/Driver/Action.h
@@ -13,6 +13,7 @@
#include "clang/Basic/Cuda.h"
#include "clang/Driver/Types.h"
#include "clang/Driver/Util.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
namespace llvm {
@@ -27,6 +28,8 @@ namespace opt {
namespace clang {
namespace driver {
+class ToolChain;
+
/// Action - Represent an abstract compilation step to perform.
///
/// An action represents an edge in the compilation graph; typically
@@ -50,8 +53,7 @@ public:
enum ActionClass {
InputClass = 0,
BindArchClass,
- CudaDeviceClass,
- CudaHostClass,
+ OffloadClass,
PreprocessJobClass,
PrecompileJobClass,
AnalyzeJobClass,
@@ -65,17 +67,13 @@ public:
VerifyDebugInfoJobClass,
VerifyPCHJobClass,
- JobClassFirst=PreprocessJobClass,
- JobClassLast=VerifyPCHJobClass
+ JobClassFirst = PreprocessJobClass,
+ JobClassLast = VerifyPCHJobClass
};
// The offloading kind determines whether this action is bound to a particular
// programming model. Each entry reserves one bit. We also have a special kind
// to designate the host offloading tool chain.
- //
- // FIXME: This is currently used to indicate that tool chains are used in a
- // given programming, but will be used here as well once a generic offloading
- // action is implemented.
enum OffloadKind {
OFK_None = 0x00,
// The host offloading tool chain.
@@ -95,6 +93,19 @@ private:
ActionList Inputs;
protected:
+ ///
+ /// Offload information.
+ ///
+
+ /// The host offloading kind - a combination of kinds encoded in a mask.
+ /// Multiple programming models may be supported simultaneously by the same
+ /// host.
+ unsigned ActiveOffloadKindMask = 0u;
+ /// Offloading kind of the device.
+ OffloadKind OffloadingDeviceKind = OFK_None;
+ /// The Offloading architecture associated with this action.
+ const char *OffloadingArch = nullptr;
+
Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {}
Action(ActionClass Kind, Action *Input, types::ID Type)
: Action(Kind, ActionList({Input}), Type) {}
@@ -124,6 +135,40 @@ public:
input_const_range inputs() const {
return input_const_range(input_begin(), input_end());
}
+
+ /// Return a string containing the offload kind of the action.
+ std::string getOffloadingKindPrefix() const;
+ /// Return a string that can be used as prefix in order to generate unique
+ /// files for each offloading kind.
+ std::string getOffloadingFileNamePrefix(StringRef NormalizedTriple) const;
+
+ /// Set the device offload info of this action and propagate it to its
+ /// dependences.
+ void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch);
+ /// Append the host offload info of this action and propagate it to its
+ /// dependences.
+ void propagateHostOffloadInfo(unsigned OKinds, const char *OArch);
+  /// Set the offload info of this action to be the same as that of the
+  /// provided action, and propagate it to its dependences.
+ void propagateOffloadInfo(const Action *A);
+
+ unsigned getOffloadingHostActiveKinds() const {
+ return ActiveOffloadKindMask;
+ }
+ OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; }
+ const char *getOffloadingArch() const { return OffloadingArch; }
+
+  /// Check whether this action has any offload kinds. Note that host offload
+  /// kinds are only set if the action is a dependence of a host offload action.
+ bool isHostOffloading(OffloadKind OKind) const {
+ return ActiveOffloadKindMask & OKind;
+ }
+ bool isDeviceOffloading(OffloadKind OKind) const {
+ return OffloadingDeviceKind == OKind;
+ }
+ bool isOffloading(OffloadKind OKind) const {
+ return isHostOffloading(OKind) || isDeviceOffloading(OKind);
+ }
};
class InputAction : public Action {
@@ -156,39 +201,126 @@ public:
}
};
-class CudaDeviceAction : public Action {
+/// An offload action combines host and/or device actions according to the
+/// programming model implementation needs and propagates the offloading kind to
+/// its dependences.
+class OffloadAction final : public Action {
virtual void anchor();
- const CudaArch GpuArch;
+public:
+  /// Type used to communicate device actions. It associates a bound
+  /// architecture, a toolchain, and an offload kind with each action.
+ class DeviceDependences final {
+ public:
+ typedef SmallVector<const ToolChain *, 3> ToolChainList;
+ typedef SmallVector<const char *, 3> BoundArchList;
+ typedef SmallVector<OffloadKind, 3> OffloadKindList;
+
+ private:
+ // Lists that keep the information for each dependency. All the lists are
+ // meant to be updated in sync. We are adopting separate lists instead of a
+ // list of structs, because that simplifies forwarding the actions list to
+ // initialize the inputs of the base Action class.
+
+ /// The dependence actions.
+ ActionList DeviceActions;
+ /// The offloading toolchains that should be used with the action.
+ ToolChainList DeviceToolChains;
+ /// The architectures that should be used with this action.
+ BoundArchList DeviceBoundArchs;
+ /// The offload kind of each dependence.
+ OffloadKindList DeviceOffloadKinds;
+
+ public:
+    /// Add an action along with the associated toolchain, bound arch, and
+ /// offload kind.
+ void add(Action &A, const ToolChain &TC, const char *BoundArch,
+ OffloadKind OKind);
+
+ /// Get each of the individual arrays.
+ const ActionList &getActions() const { return DeviceActions; };
+ const ToolChainList &getToolChains() const { return DeviceToolChains; };
+ const BoundArchList &getBoundArchs() const { return DeviceBoundArchs; };
+ const OffloadKindList &getOffloadKinds() const {
+ return DeviceOffloadKinds;
+ };
+ };
+
+  /// Type used to communicate host actions. It associates a bound
+  /// architecture, a toolchain, and offload kinds with the host action.
+ class HostDependence final {
+ /// The dependence action.
+ Action &HostAction;
+ /// The offloading toolchain that should be used with the action.
+ const ToolChain &HostToolChain;
+    /// The architecture that should be used with this action.
+    const char *HostBoundArch = nullptr;
+    /// The offload kinds of the host dependence, combined in a mask.
+    unsigned HostOffloadKinds = 0u;
+
+ public:
+ HostDependence(Action &A, const ToolChain &TC, const char *BoundArch,
+ const unsigned OffloadKinds)
+ : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch),
+ HostOffloadKinds(OffloadKinds){};
+ /// Constructor version that obtains the offload kinds from the device
+ /// dependencies.
+ HostDependence(Action &A, const ToolChain &TC, const char *BoundArch,
+ const DeviceDependences &DDeps);
+ Action *getAction() const { return &HostAction; };
+ const ToolChain *getToolChain() const { return &HostToolChain; };
+ const char *getBoundArch() const { return HostBoundArch; };
+ unsigned getOffloadKinds() const { return HostOffloadKinds; };
+ };
+
+ typedef llvm::function_ref<void(Action *, const ToolChain *, const char *)>
+ OffloadActionWorkTy;
+
+private:
+ /// The host offloading toolchain that should be used with the action.
+ const ToolChain *HostTC = nullptr;
- /// True when action results are not consumed by the host action (e.g when
- /// -fsyntax-only or --cuda-device-only options are used).
- bool AtTopLevel;
+ /// The tool chains associated with the list of actions.
+ DeviceDependences::ToolChainList DevToolChains;
public:
- CudaDeviceAction(Action *Input, CudaArch Arch, bool AtTopLevel);
+ OffloadAction(const HostDependence &HDep);
+ OffloadAction(const DeviceDependences &DDeps, types::ID Ty);
+ OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps);
- /// Get the CUDA GPU architecture to which this Action corresponds. Returns
- /// UNKNOWN if this Action corresponds to multiple architectures.
- CudaArch getGpuArch() const { return GpuArch; }
+ /// Execute the work specified in \a Work on the host dependence.
+ void doOnHostDependence(const OffloadActionWorkTy &Work) const;
- bool isAtTopLevel() const { return AtTopLevel; }
+ /// Execute the work specified in \a Work on each device dependence.
+ void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const;
- static bool classof(const Action *A) {
- return A->getKind() == CudaDeviceClass;
- }
-};
+ /// Execute the work specified in \a Work on each dependence.
+ void doOnEachDependence(const OffloadActionWorkTy &Work) const;
-class CudaHostAction : public Action {
- virtual void anchor();
- ActionList DeviceActions;
+  /// Execute the work specified in \a Work on the host dependence if
+  /// \a IsHostDependence is true, or on each device dependence otherwise.
+ void doOnEachDependence(bool IsHostDependence,
+ const OffloadActionWorkTy &Work) const;
-public:
- CudaHostAction(Action *Input, const ActionList &DeviceActions);
+ /// Return true if the action has a host dependence.
+ bool hasHostDependence() const;
+
+ /// Return the host dependence of this action. This function is only expected
+ /// to be called if the host dependence exists.
+ Action *getHostDependence() const;
+
+ /// Return true if the action has a single device dependence. If \a
+ /// DoNotConsiderHostActions is set, ignore the host dependence, if any, while
+ /// accounting for the number of dependences.
+ bool hasSingleDeviceDependence(bool DoNotConsiderHostActions = false) const;
- const ActionList &getDeviceActions() const { return DeviceActions; }
+ /// Return the single device dependence of this action. This function is only
+ /// expected to be called if a single device dependence exists. If \a
+ /// DoNotConsiderHostActions is set, a host dependence is allowed.
+ Action *
+ getSingleDeviceDependence(bool DoNotConsiderHostActions = false) const;
- static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
+ static bool classof(const Action *A) { return A->getKind() == OffloadClass; }
};
class JobAction : public Action {
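The doOn*Dependence helpers above traverse an OffloadAction's dependences with a function_ref callback that receives each action together with its toolchain and bound architecture. A minimal sketch of a visitor (the lambda body is illustrative):

    // OA is an OffloadAction*; the signature matches OffloadActionWorkTy.
    OA->doOnEachDependence(
        [](Action *A, const ToolChain *TC, const char *BoundArch) {
          // Inspect A->getOffloadingKindPrefix(), TC->getTriple(), and
          // BoundArch (e.g. "sm_35" for a CUDA device dependence).
        });

PrintActions1 in Driver.cpp (later in this diff) uses exactly this pattern.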
diff --git a/clang/include/clang/Driver/Compilation.h b/clang/include/clang/Driver/Compilation.h
index ea84a93cf28..3f387151a3d 100644
--- a/clang/include/clang/Driver/Compilation.h
+++ b/clang/include/clang/Driver/Compilation.h
@@ -98,12 +98,7 @@ public:
const Driver &getDriver() const { return TheDriver; }
const ToolChain &getDefaultToolChain() const { return DefaultToolChain; }
- const ToolChain *getOffloadingHostToolChain() const {
- auto It = OrderedOffloadingToolchains.find(Action::OFK_Host);
- if (It != OrderedOffloadingToolchains.end())
- return It->second;
- return nullptr;
- }
+
unsigned isOffloadingHostKind(Action::OffloadKind Kind) const {
return ActiveOffloadMask & Kind;
}
@@ -121,8 +116,8 @@ public:
return OrderedOffloadingToolchains.equal_range(Kind);
}
- // Return an offload toolchain of the provided kind. Only one is expected to
- // exist.
+ /// Return an offload toolchain of the provided kind. Only one is expected to
+ /// exist.
template <Action::OffloadKind Kind>
const ToolChain *getSingleOffloadToolChain() const {
auto TCs = getOffloadToolChains<Kind>();
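Callers that know only one toolchain of a given kind exists can use the templated accessor, as the CUDA paths below do; a one-line sketch:

    const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();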
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index 46bf06d8219..9ecf434a87e 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -394,12 +394,13 @@ public:
/// BuildJobsForAction - Construct the jobs to perform for the action \p A and
/// return an InputInfo for the result of running \p A. Will only construct
/// jobs for a given (Action, ToolChain, BoundArch) tuple once.
- InputInfo BuildJobsForAction(Compilation &C, const Action *A,
- const ToolChain *TC, const char *BoundArch,
- bool AtTopLevel, bool MultipleArchs,
- const char *LinkingOutput,
- std::map<std::pair<const Action *, std::string>,
- InputInfo> &CachedResults) const;
+ InputInfo
+ BuildJobsForAction(Compilation &C, const Action *A, const ToolChain *TC,
+ const char *BoundArch, bool AtTopLevel, bool MultipleArchs,
+ const char *LinkingOutput,
+ std::map<std::pair<const Action *, std::string>, InputInfo>
+ &CachedResults,
+ bool BuildForOffloadDevice) const;
/// Returns the default name for linked images (e.g., "a.out").
const char *getDefaultImageName() const;
@@ -415,12 +416,11 @@ public:
/// \param BoundArch - The bound architecture.
/// \param AtTopLevel - Whether this is a "top-level" action.
/// \param MultipleArchs - Whether multiple -arch options were supplied.
- const char *GetNamedOutputPath(Compilation &C,
- const JobAction &JA,
- const char *BaseInput,
- const char *BoundArch,
- bool AtTopLevel,
- bool MultipleArchs) const;
+ /// \param NormalizedTriple - The normalized triple of the relevant target.
+ const char *GetNamedOutputPath(Compilation &C, const JobAction &JA,
+ const char *BaseInput, const char *BoundArch,
+ bool AtTopLevel, bool MultipleArchs,
+ StringRef NormalizedTriple) const;
/// GetTemporaryPath - Return the pathname of a temporary file to use
/// as part of compilation; the file will have the given prefix and suffix.
@@ -467,7 +467,8 @@ private:
const char *BoundArch, bool AtTopLevel, bool MultipleArchs,
const char *LinkingOutput,
std::map<std::pair<const Action *, std::string>, InputInfo>
- &CachedResults) const;
+ &CachedResults,
+ bool BuildForOffloadDevice) const;
public:
/// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 7982f51f07b..a98b5c1bbaa 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Driver/Action.h"
+#include "clang/Driver/ToolChain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
@@ -21,8 +22,8 @@ const char *Action::getClassName(ActionClass AC) {
switch (AC) {
case InputClass: return "input";
case BindArchClass: return "bind-arch";
- case CudaDeviceClass: return "cuda-device";
- case CudaHostClass: return "cuda-host";
+ case OffloadClass:
+ return "offload";
case PreprocessJobClass: return "preprocessor";
case PrecompileJobClass: return "precompiler";
case AnalyzeJobClass: return "analyzer";
@@ -40,6 +41,82 @@ const char *Action::getClassName(ActionClass AC) {
llvm_unreachable("invalid class");
}
+void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) {
+  // Offload actions set their own kinds on their dependences.
+ if (Kind == OffloadClass)
+ return;
+
+ assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) &&
+ "Setting device kind to a different device??");
+ assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??");
+ OffloadingDeviceKind = OKind;
+ OffloadingArch = OArch;
+
+ for (auto *A : Inputs)
+ A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch);
+}
+
+void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) {
+  // Offload actions set their own kinds on their dependences.
+ if (Kind == OffloadClass)
+ return;
+
+ assert(OffloadingDeviceKind == OFK_None &&
+ "Setting a host kind in a device action.");
+ ActiveOffloadKindMask |= OKinds;
+ OffloadingArch = OArch;
+
+ for (auto *A : Inputs)
+ A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch);
+}
+
+void Action::propagateOffloadInfo(const Action *A) {
+ if (unsigned HK = A->getOffloadingHostActiveKinds())
+ propagateHostOffloadInfo(HK, A->getOffloadingArch());
+ else
+ propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(),
+ A->getOffloadingArch());
+}
+
+std::string Action::getOffloadingKindPrefix() const {
+ switch (OffloadingDeviceKind) {
+ case OFK_None:
+ break;
+ case OFK_Host:
+ llvm_unreachable("Host kind is not an offloading device kind.");
+ break;
+ case OFK_Cuda:
+ return "device-cuda";
+
+ // TODO: Add other programming models here.
+ }
+
+ if (!ActiveOffloadKindMask)
+ return "";
+
+ std::string Res("host");
+ if (ActiveOffloadKindMask & OFK_Cuda)
+ Res += "-cuda";
+
+ // TODO: Add other programming models here.
+
+ return Res;
+}
+
+std::string
+Action::getOffloadingFileNamePrefix(StringRef NormalizedTriple) const {
+ // A file prefix is only generated for device actions and consists of the
+ // offload kind and triple.
+ if (!OffloadingDeviceKind)
+ return "";
+
+ std::string Res("-");
+ Res += getOffloadingKindPrefix();
+ Res += "-";
+ Res += NormalizedTriple;
+ return Res;
+}
+
void InputAction::anchor() {}
InputAction::InputAction(const Arg &_Input, types::ID _Type)
@@ -51,16 +128,138 @@ void BindArchAction::anchor() {}
BindArchAction::BindArchAction(Action *Input, const char *_ArchName)
: Action(BindArchClass, Input), ArchName(_ArchName) {}
-void CudaDeviceAction::anchor() {}
+void OffloadAction::anchor() {}
+
+OffloadAction::OffloadAction(const HostDependence &HDep)
+ : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) {
+ OffloadingArch = HDep.getBoundArch();
+ ActiveOffloadKindMask = HDep.getOffloadKinds();
+ HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+ HDep.getBoundArch());
+};
+
+OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty)
+ : Action(OffloadClass, DDeps.getActions(), Ty),
+ DevToolChains(DDeps.getToolChains()) {
+ auto &OKinds = DDeps.getOffloadKinds();
+ auto &BArchs = DDeps.getBoundArchs();
+
+ // If all inputs agree on the same kind, use it also for this action.
+ if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); }))
+ OffloadingDeviceKind = OKinds.front();
+
+ // If we have a single dependency, inherit the architecture from it.
+ if (OKinds.size() == 1)
+ OffloadingArch = BArchs.front();
+
+ // Propagate info to the dependencies.
+ for (unsigned i = 0, e = getInputs().size(); i != e; ++i)
+ getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]);
+}
+
+OffloadAction::OffloadAction(const HostDependence &HDep,
+ const DeviceDependences &DDeps)
+ : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()),
+ DevToolChains(DDeps.getToolChains()) {
+ // We use the kinds of the host dependence for this action.
+ OffloadingArch = HDep.getBoundArch();
+ ActiveOffloadKindMask = HDep.getOffloadKinds();
+ HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+ HDep.getBoundArch());
+
+ // Add device inputs and propagate info to the device actions. Do work only if
+ // we have dependencies.
+ for (unsigned i = 0, e = DDeps.getActions().size(); i != e; ++i)
+ if (auto *A = DDeps.getActions()[i]) {
+ getInputs().push_back(A);
+ A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i],
+ DDeps.getBoundArchs()[i]);
+ }
+}
+
+void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const {
+ if (!HostTC)
+ return;
+ assert(!getInputs().empty() && "No dependencies for offload action??");
+ auto *A = getInputs().front();
+ Work(A, HostTC, A->getOffloadingArch());
+}
-CudaDeviceAction::CudaDeviceAction(Action *Input, clang::CudaArch Arch,
- bool AtTopLevel)
- : Action(CudaDeviceClass, Input), GpuArch(Arch), AtTopLevel(AtTopLevel) {}
+void OffloadAction::doOnEachDeviceDependence(
+ const OffloadActionWorkTy &Work) const {
+ auto I = getInputs().begin();
+ auto E = getInputs().end();
+ if (I == E)
+ return;
+
+ // We expect to have the same number of input dependences and device tool
+ // chains, except if we also have a host dependence. In that case we have one
+ // more dependence than we have device tool chains.
+ assert(getInputs().size() == DevToolChains.size() + (HostTC ? 1 : 0) &&
+ "Sizes of action dependences and toolchains are not consistent!");
+
+ // Skip host action
+ if (HostTC)
+ ++I;
+
+ auto TI = DevToolChains.begin();
+ for (; I != E; ++I, ++TI)
+ Work(*I, *TI, (*I)->getOffloadingArch());
+}
+
+void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const {
+ doOnHostDependence(Work);
+ doOnEachDeviceDependence(Work);
+}
+
+void OffloadAction::doOnEachDependence(bool IsHostDependence,
+ const OffloadActionWorkTy &Work) const {
+ if (IsHostDependence)
+ doOnHostDependence(Work);
+ else
+ doOnEachDeviceDependence(Work);
+}
-void CudaHostAction::anchor() {}
+bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; }
-CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions)
- : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {}
+Action *OffloadAction::getHostDependence() const {
+ assert(hasHostDependence() && "Host dependence does not exist!");
+ assert(!getInputs().empty() && "No dependencies for offload action??");
+ return HostTC ? getInputs().front() : nullptr;
+}
+
+bool OffloadAction::hasSingleDeviceDependence(
+ bool DoNotConsiderHostActions) const {
+ if (DoNotConsiderHostActions)
+ return getInputs().size() == (HostTC ? 2 : 1);
+ return !HostTC && getInputs().size() == 1;
+}
+
+Action *
+OffloadAction::getSingleDeviceDependence(bool DoNotConsiderHostActions) const {
+ assert(hasSingleDeviceDependence(DoNotConsiderHostActions) &&
+ "Single device dependence does not exist!");
+ // The previous assert ensures the number of entries in getInputs() is
+ // consistent with what we are doing here.
+ return HostTC ? getInputs()[1] : getInputs().front();
+}
+
+void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC,
+ const char *BoundArch,
+ OffloadKind OKind) {
+ DeviceActions.push_back(&A);
+ DeviceToolChains.push_back(&TC);
+ DeviceBoundArchs.push_back(BoundArch);
+ DeviceOffloadKinds.push_back(OKind);
+}
+
+OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC,
+ const char *BoundArch,
+ const DeviceDependences &DDeps)
+ : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch) {
+ for (auto K : DDeps.getOffloadKinds())
+ HostOffloadKinds |= K;
+}
void JobAction::anchor() {}
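The propagate* methods walk an action's inputs recursively, so marking the top of a pipeline marks the whole chain; nested offload actions are skipped because they set their own kinds on their dependences. A hedged sketch of the contract (Compile is an illustrative name for the CompileJobAction heading a device pipeline):

    Compile->propagateDeviceOffloadInfo(Action::OFK_Cuda, "sm_30");
    // Compile, its preprocessor input, and the InputAction below it now all
    // satisfy isDeviceOffloading(Action::OFK_Cuda), and getOffloadingArch()
    // returns "sm_30" for each of them.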
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 78c3125cdb6..02f4a999771 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -435,7 +435,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
})) {
const ToolChain &TC = getToolChain(
C.getInputArgs(),
- llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit()
+ llvm::Triple(C.getSingleOffloadToolChain<Action::OFK_Host>()
+ ->getTriple()
+ .isArch64Bit()
? "nvptx64-nvidia-cuda"
: "nvptx-nvidia-cuda"));
C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda);
@@ -1022,19 +1024,33 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
} else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
os << '"' << BIA->getArchName() << '"' << ", {"
<< PrintActions1(C, *BIA->input_begin(), Ids) << "}";
- } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
- CudaArch Arch = CDA->getGpuArch();
- if (Arch != CudaArch::UNKNOWN)
- os << "'" << CudaArchToString(Arch) << "', ";
- os << "{" << PrintActions1(C, *CDA->input_begin(), Ids) << "}";
+ } else if (OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+ bool IsFirst = true;
+ OA->doOnEachDependence(
+ [&](Action *A, const ToolChain *TC, const char *BoundArch) {
+ // E.g. for two CUDA device dependences whose bound arch is sm_20 and
+ // sm_35 this will generate:
+ // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device"
+ // (nvptx64-nvidia-cuda:sm_35) {#ID}
+ if (!IsFirst)
+ os << ", ";
+ os << '"';
+ if (TC)
+ os << A->getOffloadingKindPrefix();
+ else
+ os << "host";
+ os << " (";
+ os << TC->getTriple().normalize();
+
+ if (BoundArch)
+ os << ":" << BoundArch;
+ os << ")";
+ os << '"';
+ os << " {" << PrintActions1(C, A, Ids) << "}";
+ IsFirst = false;
+ });
} else {
- const ActionList *AL;
- if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
- os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}"
- << ", gpu binaries ";
- AL = &CHA->getDeviceActions();
- } else
- AL = &A->getInputs();
+ const ActionList *AL = &A->getInputs();
if (AL->size()) {
const char *Prefix = "{";
@@ -1047,10 +1063,24 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
os << "{}";
}
+  // Append the offload info for all actions other than the offloading action
+  // itself (e.g. (cuda-device, sm_20) or (cuda-host)).
+ std::string offload_str;
+ llvm::raw_string_ostream offload_os(offload_str);
+ if (!isa<OffloadAction>(A)) {
+ auto S = A->getOffloadingKindPrefix();
+ if (!S.empty()) {
+ offload_os << ", (" << S;
+ if (A->getOffloadingArch())
+ offload_os << ", " << A->getOffloadingArch();
+ offload_os << ")";
+ }
+ }
+
unsigned Id = Ids.size();
Ids[A] = Id;
llvm::errs() << Id << ": " << os.str() << ", "
- << types::getTypeName(A->getType()) << "\n";
+ << types::getTypeName(A->getType()) << offload_os.str() << "\n";
return Id;
}
@@ -1378,8 +1408,12 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
PartialCompilationArg &&
PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
- if (CompileHostOnly)
- return C.MakeAction<CudaHostAction>(HostAction, ActionList());
+ if (CompileHostOnly) {
+ OffloadAction::HostDependence HDep(
+ *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+ /*BoundArch=*/nullptr, Action::OFK_Cuda);
+ return C.MakeAction<OffloadAction>(HDep);
+ }
// Collect all cuda_gpu_arch parameters, removing duplicates.
SmallVector<CudaArch, 4> GpuArchList;
@@ -1408,8 +1442,6 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
// Build actions for all device inputs.
- assert(C.getSingleOffloadToolChain<Action::OFK_Cuda>() &&
- "Missing toolchain for device-side compilation.");
ActionList CudaDeviceActions;
C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
assert(GpuArchList.size() == CudaDeviceActions.size() &&
@@ -1421,6 +1453,8 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
return a->getKind() != Action::AssembleJobClass;
});
+ const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+
// Figure out what to do with device actions -- pass them as inputs to the
// host action or run each of them independently.
if (PartialCompilation || CompileDeviceOnly) {
@@ -1436,10 +1470,13 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
return nullptr;
}
- for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
- Actions.push_back(C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I],
- GpuArchList[I],
- /* AtTopLevel */ true));
+ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]),
+ Action::OFK_Cuda);
+ Actions.push_back(
+ C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+ }
// Kill host action in case of device-only compilation.
if (CompileDeviceOnly)
return nullptr;
@@ -1459,19 +1496,23 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
Action* BackendAction = AssembleAction->getInputs()[0];
assert(BackendAction->getType() == types::TY_PP_Asm);
- for (const auto& A : {AssembleAction, BackendAction}) {
- DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
- A, GpuArchList[I], /* AtTopLevel */ false));
+ for (auto &A : {AssembleAction, BackendAction}) {
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda);
+ DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
}
}
- auto FatbinAction = C.MakeAction<CudaDeviceAction>(
- C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
- CudaArch::UNKNOWN,
- /* AtTopLevel = */ false);
+ auto FatbinAction =
+ C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
// Return a new host action that incorporates original host action and all
// device actions.
- return C.MakeAction<CudaHostAction>(std::move(HostAction),
- ActionList({FatbinAction}));
+ OffloadAction::HostDependence HDep(
+ *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+ /*BoundArch=*/nullptr, Action::OFK_Cuda);
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
+ return C.MakeAction<OffloadAction>(HDep, DDep);
}
void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
@@ -1580,6 +1621,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
YcArg = YuArg = nullptr;
}
+ // Track the host offload kinds used on this compilation.
+ unsigned CompilationActiveOffloadHostKinds = 0u;
+
// Construct the actions to perform.
ActionList LinkerInputs;
@@ -1648,6 +1692,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
? phases::Compile
: FinalPhase;
+ // Track the host offload kinds used on this input.
+ unsigned InputActiveOffloadHostKinds = 0u;
+
// Build the pipeline for this file.
Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -1679,21 +1726,36 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
Current = buildCudaActions(C, Args, InputArg, Current, Actions);
if (!Current)
break;
+
+ // We produced a CUDA action for this input, so the host has to support
+ // CUDA.
+ InputActiveOffloadHostKinds |= Action::OFK_Cuda;
+ CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
}
if (Current->getType() == types::TY_Nothing)
break;
}
- // If we ended with something, add to the output list.
- if (Current)
+ // If we ended with something, add to the output list. Also, propagate the
+  // offload information to the top-level host action associated with the
+  // current input.
+ if (Current) {
+ if (InputActiveOffloadHostKinds)
+ Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
+ /*BoundArch=*/nullptr);
Actions.push_back(Current);
+ }
}
- // Add a link action if necessary.
- if (!LinkerInputs.empty())
+ // Add a link action if necessary and propagate the offload information for
+ // the current compilation.
+ if (!LinkerInputs.empty()) {
Actions.push_back(
C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
+ Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
+ /*BoundArch=*/nullptr);
+ }
// If we are linking, claim any options which are obviously only used for
// compilation.
@@ -1829,7 +1891,8 @@ void Driver::BuildJobs(Compilation &C) const {
/*BoundArch*/ nullptr,
/*AtTopLevel*/ true,
/*MultipleArchs*/ ArchNames.size() > 1,
- /*LinkingOutput*/ LinkingOutput, CachedResults);
+ /*LinkingOutput*/ LinkingOutput, CachedResults,
+ /*BuildForOffloadDevice*/ false);
}
// If the user passed -Qunused-arguments or there were errors, don't warn
@@ -1878,7 +1941,28 @@ void Driver::BuildJobs(Compilation &C) const {
}
}
}
-
+/// Collapse an offloading action, looking for a job of the given type. The
+/// input action is changed to the input of the collapsed sequence. If we
+/// effectively had a collapse, return the corresponding offloading action;
+/// otherwise return null.
+template <typename T>
+static OffloadAction *collapseOffloadingAction(Action *&CurAction) {
+ if (!CurAction)
+ return nullptr;
+ if (auto *OA = dyn_cast<OffloadAction>(CurAction)) {
+ if (OA->hasHostDependence())
+ if (auto *HDep = dyn_cast<T>(OA->getHostDependence())) {
+ CurAction = HDep;
+ return OA;
+ }
+ if (OA->hasSingleDeviceDependence())
+ if (auto *DDep = dyn_cast<T>(OA->getSingleDeviceDependence())) {
+ CurAction = DDep;
+ return OA;
+ }
+ }
+ return nullptr;
+}
// Returns a Tool for a given JobAction. In case the action and its
// predecessors can be combined, updates Inputs with the inputs of the
// first combined action. If one of the collapsed actions is a
@@ -1888,34 +1972,39 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
bool EmbedBitcode, const ToolChain *TC,
const JobAction *JA,
const ActionList *&Inputs,
- const CudaHostAction *&CollapsedCHA) {
+ ActionList &CollapsedOffloadAction) {
const Tool *ToolForJob = nullptr;
- CollapsedCHA = nullptr;
+ CollapsedOffloadAction.clear();
// See if we should look for a compiler with an integrated assembler. We match
// bottom up, so what we are actually looking for is an assembler job with a
// compiler input.
+ // Look through offload actions between assembler and backend actions.
+ Action *BackendJA = (isa<AssembleJobAction>(JA) && Inputs->size() == 1)
+ ? *Inputs->begin()
+ : nullptr;
+ auto *BackendOA = collapseOffloadingAction<BackendJobAction>(BackendJA);
+
if (TC->useIntegratedAs() && !SaveTemps &&
!C.getArgs().hasArg(options::OPT_via_file_asm) &&
!C.getArgs().hasArg(options::OPT__SLASH_FA) &&
- !C.getArgs().hasArg(options::OPT__SLASH_Fa) &&
- isa<AssembleJobAction>(JA) && Inputs->size() == 1 &&
- isa<BackendJobAction>(*Inputs->begin())) {
+ !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA &&
+ isa<BackendJobAction>(BackendJA)) {
// A BackendJob is always preceded by a CompileJob, and without -save-temps
// or -fembed-bitcode, they will always get combined together, so instead of
// checking the backend tool, check if the tool for the CompileJob has an
// integrated assembler. For -fembed-bitcode, CompileJob is still used to
// look up tools for BackendJob, but they need to match before we can split
// them.
- const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
- // Compile job may be wrapped in CudaHostAction, extract it if
- // that's the case and update CollapsedCHA if we combine phases.
- CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
- JobAction *CompileJA = cast<CompileJobAction>(
- CHA ? *CHA->input_begin() : *BackendInputs->begin());
- assert(CompileJA && "Backend job is not preceeded by compile job.");
- const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+ // Look through offload actions between backend and compile actions.
+ Action *CompileJA = *BackendJA->getInputs().begin();
+ auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+ assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+ "Backend job is not preceeded by compile job.");
+ const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
if (!Compiler)
return nullptr;
// When using -fembed-bitcode, it is required to have the same tool (clang)
@@ -1929,7 +2018,12 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
if (Compiler->hasIntegratedAssembler()) {
Inputs = &CompileJA->getInputs();
ToolForJob = Compiler;
- CollapsedCHA = CHA;
+ // Save the collapsed offload actions because they may still contain
+ // device actions.
+ if (CompileOA)
+ CollapsedOffloadAction.push_back(CompileOA);
+ if (BackendOA)
+ CollapsedOffloadAction.push_back(BackendOA);
}
}
@@ -1939,20 +2033,23 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
if (isa<BackendJobAction>(JA)) {
// Check if the compiler supports emitting LLVM IR.
assert(Inputs->size() == 1);
- // Compile job may be wrapped in CudaHostAction, extract it if
- // that's the case and update CollapsedCHA if we combine phases.
- CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
- JobAction *CompileJA =
- cast<CompileJobAction>(CHA ? *CHA->input_begin() : *Inputs->begin());
- assert(CompileJA && "Backend job is not preceeded by compile job.");
- const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+ // Look through offload actions between backend and compile actions.
+ Action *CompileJA = *JA->getInputs().begin();
+ auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+ assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+ "Backend job is not preceeded by compile job.");
+ const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
if (!Compiler)
return nullptr;
if (!Compiler->canEmitIR() ||
(!SaveTemps && !EmbedBitcode)) {
Inputs = &CompileJA->getInputs();
ToolForJob = Compiler;
- CollapsedCHA = CHA;
+
+ if (CompileOA)
+ CollapsedOffloadAction.push_back(CompileOA);
}
}
@@ -1963,12 +2060,21 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
// See if we should use an integrated preprocessor. We do so when we have
// exactly one input, since this is the only use case we care about
// (irrelevant since we don't support combine yet).
- if (Inputs->size() == 1 && isa<PreprocessJobAction>(*Inputs->begin()) &&
+
+ // Look through offload actions after preprocessing.
+ Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr;
+ auto *PreprocessOA =
+ collapseOffloadingAction<PreprocessJobAction>(PreprocessJA);
+
+ if (PreprocessJA && isa<PreprocessJobAction>(PreprocessJA) &&
!C.getArgs().hasArg(options::OPT_no_integrated_cpp) &&
!C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps &&
!C.getArgs().hasArg(options::OPT_rewrite_objc) &&
- ToolForJob->hasIntegratedCPP())
- Inputs = &(*Inputs)[0]->getInputs();
+ ToolForJob->hasIntegratedCPP()) {
+ Inputs = &PreprocessJA->getInputs();
+ if (PreprocessOA)
+ CollapsedOffloadAction.push_back(PreprocessOA);
+ }
return ToolForJob;
}
@@ -1976,8 +2082,8 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
InputInfo Driver::BuildJobsForAction(
Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
- std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
- const {
+ std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+ bool BuildForOffloadDevice) const {
// The bound arch is not necessarily represented in the toolchain's triple --
// for example, armv7 and armv7s both map to the same triple -- so we need
// both in our map.
@@ -1991,9 +2097,9 @@ InputInfo Driver::BuildJobsForAction(
if (CachedResult != CachedResults.end()) {
return CachedResult->second;
}
- InputInfo Result =
- BuildJobsForActionNoCache(C, A, TC, BoundArch, AtTopLevel, MultipleArchs,
- LinkingOutput, CachedResults);
+ InputInfo Result = BuildJobsForActionNoCache(
+ C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput,
+ CachedResults, BuildForOffloadDevice);
CachedResults[ActionTC] = Result;
return Result;
}
@@ -2001,21 +2107,65 @@ InputInfo Driver::BuildJobsForAction(
InputInfo Driver::BuildJobsForActionNoCache(
Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
- std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
- const {
+ std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+ bool BuildForOffloadDevice) const {
llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
- InputInfoList CudaDeviceInputInfos;
- if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
- // Append outputs of device jobs to the input list.
- for (const Action *DA : CHA->getDeviceActions()) {
- CudaDeviceInputInfos.push_back(BuildJobsForAction(
- C, DA, TC, nullptr, AtTopLevel,
- /*MultipleArchs*/ false, LinkingOutput, CachedResults));
+ InputInfoList OffloadDependencesInputInfo;
+ if (const OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+ // The offload action is expected to be used in four different situations.
+ //
+ // a) Set a toolchain/architecture/kind for a host action:
+ // Host Action 1 -> OffloadAction -> Host Action 2
+ //
+ // b) Set a toolchain/architecture/kind for a device action;
+ // Device Action 1 -> OffloadAction -> Device Action 2
+ //
+    // c) Specify a device dependence to a host action;
+    //    Device Action 1  _
+    //                      \
+    //    Host Action 1  ---> OffloadAction -> Host Action 2
+    //
+    // d) Specify a host dependence to a device action.
+    //    Host Action 1  _
+    //                    \
+    //    Device Action 1  ---> OffloadAction -> Device Action 2
+ //
+ // For a) and b), we just return the job generated for the dependence. For
+ // c) and d) we override the current action with the host/device dependence
+ // if the current toolchain is host/device and set the offload dependences
+ // info with the jobs obtained from the device/host dependence(s).
+
+    // If there is a single device dependence, just generate the job for it.
+ if (OA->hasSingleDeviceDependence()) {
+ InputInfo DevA;
+ OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC,
+ const char *DepBoundArch) {
+ DevA =
+ BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel,
+ /*MultipleArchs*/ !!DepBoundArch, LinkingOutput,
+ CachedResults, /*BuildForOffloadDevice=*/true);
+ });
+ return DevA;
}
- // Override current action with a real host compile action and continue
- // processing it.
- A = *CHA->input_begin();
+
+ // If 'Action 2' is host, we generate jobs for the device dependences and
+ // override the current action with the host dependence. Otherwise, we
+ // generate the host dependences and override the action with the device
+    // dependence. The dependences therefore can't be top-level actions.
+ OA->doOnEachDependence(
+ /*IsHostDependence=*/BuildForOffloadDevice,
+ [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+ OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+ C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
+ /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
+ /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+ Action::OFK_None));
+ });
+
+ A = BuildForOffloadDevice
+ ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true)
+ : OA->getHostDependence();
}
if (const InputAction *IA = dyn_cast<InputAction>(A)) {
@@ -2042,41 +2192,34 @@ InputInfo Driver::BuildJobsForActionNoCache(
TC = &C.getDefaultToolChain();
return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel,
- MultipleArchs, LinkingOutput, CachedResults);
+ MultipleArchs, LinkingOutput, CachedResults,
+ BuildForOffloadDevice);
}
- if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
- // Initial processing of CudaDeviceAction carries host params.
- // Call BuildJobsForAction() again, now with correct device parameters.
- InputInfo II = BuildJobsForAction(
- C, *CDA->input_begin(), C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
- CudaArchToString(CDA->getGpuArch()), CDA->isAtTopLevel(),
- /*MultipleArchs=*/true, LinkingOutput, CachedResults);
- // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so
- // that one can retrieve II's GPU arch.
- II.setAction(A);
- return II;
- }
const ActionList *Inputs = &A->getInputs();
const JobAction *JA = cast<JobAction>(A);
- const CudaHostAction *CollapsedCHA = nullptr;
+ ActionList CollapsedOffloadActions;
+
const Tool *T =
selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA,
- Inputs, CollapsedCHA);
+ Inputs, CollapsedOffloadActions);
if (!T)
return InputInfo();
- // If we've collapsed action list that contained CudaHostAction we
- // need to build jobs for device-side inputs it may have held.
- if (CollapsedCHA) {
- for (const Action *DA : CollapsedCHA->getDeviceActions()) {
- CudaDeviceInputInfos.push_back(BuildJobsForAction(
- C, DA, TC, "", AtTopLevel,
- /*MultipleArchs*/ false, LinkingOutput, CachedResults));
- }
- }
+ // If we've collapsed action list that contained OffloadAction we
+ // need to build jobs for host/device-side inputs it may have held.
+ for (const auto *OA : CollapsedOffloadActions)
+ cast<OffloadAction>(OA)->doOnEachDependence(
+ /*IsHostDependence=*/BuildForOffloadDevice,
+ [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+ OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+ C, DepA, DepTC, DepBoundArch, AtTopLevel,
+ /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults,
+ /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+ Action::OFK_None));
+ });
// Only use pipes when there is exactly one input.
InputInfoList InputInfos;
@@ -2086,9 +2229,9 @@ InputInfo Driver::BuildJobsForActionNoCache(
// FIXME: Clean this up.
bool SubJobAtTopLevel =
AtTopLevel && (isa<DsymutilJobAction>(A) || isa<VerifyJobAction>(A));
- InputInfos.push_back(BuildJobsForAction(C, Input, TC, BoundArch,
- SubJobAtTopLevel, MultipleArchs,
- LinkingOutput, CachedResults));
+ InputInfos.push_back(BuildJobsForAction(
+ C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput,
+ CachedResults, BuildForOffloadDevice));
}
// Always use the first input as the base input.
@@ -2099,9 +2242,10 @@ InputInfo Driver::BuildJobsForActionNoCache(
if (JA->getType() == types::TY_dSYM)
BaseInput = InputInfos[0].getFilename();
- // Append outputs of cuda device jobs to the input list
- if (CudaDeviceInputInfos.size())
- InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+ // Append outputs of offload device jobs to the input list
+ if (!OffloadDependencesInputInfo.empty())
+ InputInfos.append(OffloadDependencesInputInfo.begin(),
+ OffloadDependencesInputInfo.end());
// Determine the place to write output to, if any.
InputInfo Result;
@@ -2109,7 +2253,8 @@ InputInfo Driver::BuildJobsForActionNoCache(
Result = InputInfo(A, BaseInput);
else
Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch,
- AtTopLevel, MultipleArchs),
+ AtTopLevel, MultipleArchs,
+ TC->getTriple().normalize()),
BaseInput);
if (CCCPrintBindings && !CCGenDiagnostics) {
@@ -2169,7 +2314,8 @@ static const char *MakeCLOutputFilename(const ArgList &Args, StringRef ArgValue,
const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
const char *BaseInput,
const char *BoundArch, bool AtTopLevel,
- bool MultipleArchs) const {
+ bool MultipleArchs,
+ StringRef NormalizedTriple) const {
llvm::PrettyStackTraceString CrashInfo("Computing output path");
// Output to a user requested destination?
if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) {
@@ -2255,6 +2401,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image);
} else if (MultipleArchs && BoundArch) {
SmallString<128> Output(getDefaultImageName());
+ Output += JA.getOffloadingFileNamePrefix(NormalizedTriple);
Output += "-";
Output.append(BoundArch);
NamedOutput = C.getArgs().MakeArgString(Output.c_str());
@@ -2271,6 +2418,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
if (!types::appendSuffixForType(JA.getType()))
End = BaseName.rfind('.');
SmallString<128> Suffixed(BaseName.substr(0, End));
+ Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple);
if (MultipleArchs && BoundArch) {
Suffixed += "-";
Suffixed.append(BoundArch);
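With getOffloadingFileNamePrefix feeding GetNamedOutputPath, device-side outputs gain a kind/triple prefix ahead of the bound-arch suffix. As a hedged example (the exact name depends on the device triple), saving temporaries while compiling foo.cu for sm_35 would produce something like:

    foo-device-cuda-nvptx64-nvidia-cuda-sm_35.s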
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index cba8924ec2c..e96688cbaf8 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -248,8 +248,7 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
case Action::InputClass:
case Action::BindArchClass:
- case Action::CudaDeviceClass:
- case Action::CudaHostClass:
+ case Action::OffloadClass:
case Action::LipoJobClass:
case Action::DsymutilJobClass:
case Action::VerifyDebugInfoJobClass:
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index 63284bc1b2a..df4a996e47f 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -296,12 +296,45 @@ static bool forwardToGCC(const Option &O) {
!O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput);
}
+/// Add the C++ include args of other offloading toolchains. If this is a host
+/// job, the device toolchains are added. If this is a device job, the host
+/// toolchains will be added.
+static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C,
+ const JobAction &JA,
+ const ArgList &Args,
+ ArgStringList &CmdArgs) {
+
+ if (JA.isHostOffloading(Action::OFK_Cuda))
+ C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+ ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+ else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+ C.getSingleOffloadToolChain<Action::OFK_Host>()
+ ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+
+ // TODO: Add support for other programming models here.
+}
+
+/// Add the include args that are specific to each offloading programming model.
+static void addExtraOffloadSpecificIncludeArgs(Compilation &C,
+ const JobAction &JA,
+ const ArgList &Args,
+ ArgStringList &CmdArgs) {
+
+ if (JA.isHostOffloading(Action::OFK_Cuda))
+ C.getSingleOffloadToolChain<Action::OFK_Host>()->AddCudaIncludeArgs(
+ Args, CmdArgs);
+ else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+ C.getSingleOffloadToolChain<Action::OFK_Cuda>()->AddCudaIncludeArgs(
+ Args, CmdArgs);
+
+ // TODO: Add support for other programming models here.
+}
+
void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
const Driver &D, const ArgList &Args,
ArgStringList &CmdArgs,
const InputInfo &Output,
- const InputInfoList &Inputs,
- const ToolChain *AuxToolChain) const {
+ const InputInfoList &Inputs) const {
Arg *A;
const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();
@@ -566,31 +599,27 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
// OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
- // Optional AuxToolChain indicates that we need to include headers
- // for more than one target. If that's the case, add include paths
- // from AuxToolChain right after include paths of the same kind for
- // the current target.
+ // While adding the include arguments, we also attempt to retrieve the
+ // arguments of related offloading toolchains or arguments that are specific
+  // to an offloading programming model.
// Add C++ include arguments, if needed.
if (types::isCXX(Inputs[0].getType())) {
getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
- if (AuxToolChain)
- AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+ addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
}
// Add system include arguments for all targets but IAMCU.
if (!IsIAMCU) {
getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
- if (AuxToolChain)
- AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+ addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
} else {
// For IAMCU add special include arguments.
getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs);
}
- // Add CUDA include arguments, if needed.
- if (types::isCuda(Inputs[0].getType()))
- getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+ // Add offload include arguments, if needed.
+ addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs);
}
// FIXME: Move to target hook.
@@ -3799,7 +3828,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// CUDA compilation may have multiple inputs (source file + results of
// device-side compilations). All other jobs are expected to have exactly one
// input.
- bool IsCuda = types::isCuda(Input.getType());
+ bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
// C++ is not supported for IAMCU.
@@ -3815,21 +3844,21 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-triple");
CmdArgs.push_back(Args.MakeArgString(TripleStr));
- const ToolChain *AuxToolChain = nullptr;
if (IsCuda) {
- // FIXME: We need a (better) way to pass information about
- // particular compilation pass we're constructing here. For now we
- // can check which toolchain we're using and pick the other one to
- // extract the triple.
- if (&getToolChain() == C.getSingleOffloadToolChain<Action::OFK_Cuda>())
- AuxToolChain = C.getOffloadingHostToolChain();
- else if (&getToolChain() == C.getOffloadingHostToolChain())
- AuxToolChain = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+ // We have to pass the triple of the host if compiling for a CUDA device and
+ // vice-versa.
+ StringRef NormalizedTriple;
+ if (JA.isDeviceOffloading(Action::OFK_Cuda))
+ NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
+ ->getTriple()
+ .normalize();
else
- llvm_unreachable("Can't figure out CUDA compilation mode.");
- assert(AuxToolChain != nullptr && "No aux toolchain.");
+ NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+ ->getTriple()
+ .normalize();
+
CmdArgs.push_back("-aux-triple");
- CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
+ CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
}
if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
@@ -4718,8 +4747,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
//
// FIXME: Support -fpreprocessed
if (types::getPreprocessedType(InputType) != types::TY_INVALID)
- AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs,
- AuxToolChain);
+ AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);
// Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
// that "The compiler can only warn and ignore the option if not recognized".
@@ -11193,15 +11221,14 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
static_cast<const toolchains::CudaToolChain &>(getToolChain());
assert(TC.getTriple().isNVPTX() && "Wrong platform");
- std::vector<std::string> gpu_archs =
- Args.getAllArgValues(options::OPT_march_EQ);
- assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
- const std::string& gpu_arch = gpu_archs[0];
+ // Obtain architecture from the action.
+ CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+ assert(gpu_arch != CudaArch::UNKNOWN &&
+ "Device action expected to have an architecture.");
// Check that our installation's ptxas supports gpu_arch.
if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
- TC.cudaInstallation().CheckCudaVersionSupportsArch(
- StringToCudaArch(gpu_arch));
+ TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch);
}
ArgStringList CmdArgs;
@@ -11245,7 +11272,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
}
CmdArgs.push_back("--gpu-name");
- CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+ CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
CmdArgs.push_back("--output-file");
CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
for (const auto& II : Inputs)
@@ -11277,13 +11304,20 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
for (const auto& II : Inputs) {
- auto* A = cast<const CudaDeviceAction>(II.getAction());
+ auto *A = II.getAction();
+ assert(A->getInputs().size() == 1 &&
+ "Device offload action is expected to have a single input");
+ const char *gpu_arch_str = A->getOffloadingArch();
+ assert(gpu_arch_str &&
+ "Device action expected to have associated a GPU architecture!");
+ CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
+
// We need to pass an Arch of the form "sm_XX" for cubin files and
// "compute_XX" for ptx.
const char *Arch =
(II.getType() == types::TY_PP_Asm)
- ? CudaVirtualArchToString(VirtualArchForCudaArch(A->getGpuArch()))
- : CudaArchToString(A->getGpuArch());
+ ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+ : gpu_arch_str;
CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
Arch + ",file=" + II.getFilename()));
}
diff --git a/clang/lib/Driver/Tools.h b/clang/lib/Driver/Tools.h
index 2e546fc6538..02bdb8e5e2d 100644
--- a/clang/lib/Driver/Tools.h
+++ b/clang/lib/Driver/Tools.h
@@ -57,8 +57,7 @@ private:
const Driver &D, const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs,
const InputInfo &Output,
- const InputInfoList &Inputs,
- const ToolChain *AuxToolChain) const;
+ const InputInfoList &Inputs) const;
void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
llvm::opt::ArgStringList &CmdArgs) const;
diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 6c44932f55e..1e9e57afb6b 100644
--- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -60,25 +60,25 @@ clang::createInvocationFromCommandLine(ArrayRef<const char *> ArgList,
}
// We expect to get back exactly one command job, if we didn't something
- // failed. CUDA compilation is an exception as it creates multiple jobs. If
- // that's the case, we proceed with the first job. If caller needs particular
- // CUDA job, it should be controlled via --cuda-{host|device}-only option
- // passed to the driver.
+ // failed. Offload compilation is an exception as it creates multiple jobs. If
+ // that's the case, we proceed with the first job. If the caller needs a
+ // particular job, it should be controlled via options (e.g.
+ // --cuda-{host|device}-only for CUDA) passed to the driver.
const driver::JobList &Jobs = C->getJobs();
- bool CudaCompilation = false;
+ bool OffloadCompilation = false;
if (Jobs.size() > 1) {
for (auto &A : C->getActions()){
// On MacOSX, real actions may end up being wrapped in BindArchAction.
if (isa<driver::BindArchAction>(A))
A = *A->input_begin();
- if (isa<driver::CudaDeviceAction>(A)) {
- CudaCompilation = true;
+ if (isa<driver::OffloadAction>(A)) {
+ OffloadCompilation = true;
break;
}
}
}
if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) ||
- (Jobs.size() > 1 && !CudaCompilation)) {
+ (Jobs.size() > 1 && !OffloadCompilation)) {
SmallString<256> Msg;
llvm::raw_svector_ostream OS(Msg);
Jobs.Print(OS, "; ", true);
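For library users of createInvocationFromCommandLine the behavior is unchanged beyond the broader wording: given an offloading command line, the helper silently keeps the first job. A rough usage sketch, assuming the era's raw-pointer return type declared in clang/Frontend/Utils.h; buildInvocation is a hypothetical wrapper:

#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/LLVM.h"
#include "clang/Frontend/CompilerInvocation.h"
#include "clang/Frontend/Utils.h"
#include <memory>

using namespace clang;

// Hypothetical wrapper: returns the invocation for the first job the
// driver would run (the host job unless --cuda-device-only is given),
// or nullptr if the driver reported an error.
std::unique_ptr<CompilerInvocation>
buildInvocation(ArrayRef<const char *> Args,
                IntrusiveRefCntPtr<DiagnosticsEngine> Diags) {
  return std::unique_ptr<CompilerInvocation>(
      createInvocationFromCommandLine(Args, Diags));
}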
diff --git a/clang/test/Driver/cuda_phases.cu b/clang/test/Driver/cuda_phases.cu
new file mode 100644
index 00000000000..6cfb61aba72
--- /dev/null
+++ b/clang/test/Driver/cuda_phases.cu
@@ -0,0 +1,206 @@
+// Tests the phases generated for a CUDA offloading target for different
+// combinations of:
+// - Number of gpu architectures;
+// - Host/device-only compilation;
+// - User-requested final phase (binary or assembly).
+
+// REQUIRES: clang-driver
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+//
+// Test single gpu architecture with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN %s
+// BIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN: 2: compiler, {1}, ir, (host-cuda)
+// BIN: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda)
+// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir
+// BIN: 12: backend, {11}, assembler, (host-cuda)
+// BIN: 13: assembler, {12}, object, (host-cuda)
+// BIN: 14: linker, {13}, image, (host-cuda)
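+//
+// How to read the dump: each line is "N: phase, {input phase numbers},
+// output type, (offload kind[, gpu arch])". Phases 8 and 9 wrap the device
+// object and PTX assembly so the linker at 10 can fold both into the fat
+// binary, which phase 11 ties back into the host IR.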
+
+//
+// Test single gpu architecture up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM %s
+// ASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM: 5: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda)
+// ASM: 7: compiler, {6}, ir, (host-cuda)
+// ASM: 8: backend, {7}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN2 %s
+// BIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN2: 2: compiler, {1}, ir, (host-cuda)
+// BIN2: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN2: 10: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35)
+// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35)
+// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35)
+// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35)
+// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object
+// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler
+// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
+// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
+// BIN2: 19: backend, {18}, assembler, (host-cuda)
+// BIN2: 20: assembler, {19}, object, (host-cuda)
+// BIN2: 21: linker, {20}, image, (host-cuda)
+
+//
+// Test two gpu architectures up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM2 %s
+// ASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
+// ASM2: 10: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda)
+// ASM2: 12: compiler, {11}, ir, (host-cuda)
+// ASM2: 13: backend, {12}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN %s
+// HBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN: 2: compiler, {1}, ir, (host-cuda)
+// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN: 4: backend, {3}, assembler, (host-cuda)
+// HBIN: 5: assembler, {4}, object, (host-cuda)
+// HBIN: 6: linker, {5}, image, (host-cuda)
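+//
+// Note: even in host-only mode the host IR is still wrapped in an offload
+// node (phase 3 above), seemingly so the action graph keeps a uniform shape
+// across compilation modes.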
+
+//
+// Test single gpu architecture up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM %s
+// HASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM: 2: compiler, {1}, ir, (host-cuda)
+// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN2 %s
+// HBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN2: 2: compiler, {1}, ir, (host-cuda)
+// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN2: 4: backend, {3}, assembler, (host-cuda)
+// HBIN2: 5: assembler, {4}, object, (host-cuda)
+// HBIN2: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test two gpu architectures up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM2 %s
+// HASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM2: 2: compiler, {1}, ir, (host-cuda)
+// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM2: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN %s
+// DBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+
+//
+// Test single gpu architecture up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM %s
+// DASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+
+//
+// Test two gpu architectures with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN2 %s
+// DBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+// DBIN2: 6: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35)
+// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35)
+// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35)
+// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35)
+// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object
+
+//
+// Test two gpu architectures up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM2 %s
+// DASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// DASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler