6 files changed, 549 insertions, 170 deletions
diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp
index 7982f51f07b..a98b5c1bbaa 100644
--- a/clang/lib/Driver/Action.cpp
+++ b/clang/lib/Driver/Action.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Driver/Action.h"
+#include "clang/Driver/ToolChain.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
@@ -21,8 +22,8 @@ const char *Action::getClassName(ActionClass AC) {
   switch (AC) {
   case InputClass: return "input";
   case BindArchClass: return "bind-arch";
-  case CudaDeviceClass: return "cuda-device";
-  case CudaHostClass: return "cuda-host";
+  case OffloadClass:
+    return "offload";
   case PreprocessJobClass: return "preprocessor";
   case PrecompileJobClass: return "precompiler";
   case AnalyzeJobClass: return "analyzer";
@@ -40,6 +41,82 @@ const char *Action::getClassName(ActionClass AC) {
   llvm_unreachable("invalid class");
 }
 
+void Action::propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) {
+  // Offload action set its own kinds on their dependences.
+  if (Kind == OffloadClass)
+    return;
+
+  assert((OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFK_None) &&
+         "Setting device kind to a different device??");
+  assert(!ActiveOffloadKindMask && "Setting a device kind in a host action??");
+  OffloadingDeviceKind = OKind;
+  OffloadingArch = OArch;
+
+  for (auto *A : Inputs)
+    A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch);
+}
+
+void Action::propagateHostOffloadInfo(unsigned OKinds, const char *OArch) {
+  // Offload action set its own kinds on their dependences.
+  if (Kind == OffloadClass)
+    return;
+
+  assert(OffloadingDeviceKind == OFK_None &&
+         "Setting a host kind in a device action.");
+  ActiveOffloadKindMask |= OKinds;
+  OffloadingArch = OArch;
+
+  for (auto *A : Inputs)
+    A->propagateHostOffloadInfo(ActiveOffloadKindMask, OArch);
+}
+
+void Action::propagateOffloadInfo(const Action *A) {
+  if (unsigned HK = A->getOffloadingHostActiveKinds())
+    propagateHostOffloadInfo(HK, A->getOffloadingArch());
+  else
+    propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(),
+                               A->getOffloadingArch());
+}
+
+std::string Action::getOffloadingKindPrefix() const {
+  switch (OffloadingDeviceKind) {
+  case OFK_None:
+    break;
+  case OFK_Host:
+    llvm_unreachable("Host kind is not an offloading device kind.");
+    break;
+  case OFK_Cuda:
+    return "device-cuda";
+
+    // TODO: Add other programming models here.
+  }
+
+  if (!ActiveOffloadKindMask)
+    return "";
+
+  std::string Res("host");
+  if (ActiveOffloadKindMask & OFK_Cuda)
+    Res += "-cuda";
+
+  // TODO: Add other programming models here.
+
+  return Res;
+}
+
+std::string
+Action::getOffloadingFileNamePrefix(StringRef NormalizedTriple) const {
+  // A file prefix is only generated for device actions and consists of the
+  // offload kind and triple.
+  if (!OffloadingDeviceKind)
+    return "";
+
+  std::string Res("-");
+  Res += getOffloadingKindPrefix();
+  Res += "-";
+  Res += NormalizedTriple;
+  return Res;
+}
+
 void InputAction::anchor() {}
 
 InputAction::InputAction(const Arg &_Input, types::ID _Type)
@@ -51,16 +128,138 @@ void BindArchAction::anchor() {}
 BindArchAction::BindArchAction(Action *Input, const char *_ArchName)
     : Action(BindArchClass, Input), ArchName(_ArchName) {}
 
-void CudaDeviceAction::anchor() {}
+void OffloadAction::anchor() {}
+
+OffloadAction::OffloadAction(const HostDependence &HDep)
+    : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) {
+  OffloadingArch = HDep.getBoundArch();
+  ActiveOffloadKindMask = HDep.getOffloadKinds();
+  HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+                                             HDep.getBoundArch());
+};
+
+OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty)
+    : Action(OffloadClass, DDeps.getActions(), Ty),
+      DevToolChains(DDeps.getToolChains()) {
+  auto &OKinds = DDeps.getOffloadKinds();
+  auto &BArchs = DDeps.getBoundArchs();
+
+  // If all inputs agree on the same kind, use it also for this action.
+  if (llvm::all_of(OKinds, [&](OffloadKind K) { return K == OKinds.front(); }))
+    OffloadingDeviceKind = OKinds.front();
+
+  // If we have a single dependency, inherit the architecture from it.
+  if (OKinds.size() == 1)
+    OffloadingArch = BArchs.front();
+
+  // Propagate info to the dependencies.
+  for (unsigned i = 0, e = getInputs().size(); i != e; ++i)
+    getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]);
+}
+
+OffloadAction::OffloadAction(const HostDependence &HDep,
+                             const DeviceDependences &DDeps)
+    : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()),
+      DevToolChains(DDeps.getToolChains()) {
+  // We use the kinds of the host dependence for this action.
+  OffloadingArch = HDep.getBoundArch();
+  ActiveOffloadKindMask = HDep.getOffloadKinds();
+  HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(),
+                                             HDep.getBoundArch());
+
+  // Add device inputs and propagate info to the device actions. Do work only if
+  // we have dependencies.
+  for (unsigned i = 0, e = DDeps.getActions().size(); i != e; ++i)
+    if (auto *A = DDeps.getActions()[i]) {
+      getInputs().push_back(A);
+      A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i],
+                                    DDeps.getBoundArchs()[i]);
+    }
+}
+
+void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const {
+  if (!HostTC)
+    return;
+  assert(!getInputs().empty() && "No dependencies for offload action??");
+  auto *A = getInputs().front();
+  Work(A, HostTC, A->getOffloadingArch());
+}
 
-CudaDeviceAction::CudaDeviceAction(Action *Input, clang::CudaArch Arch,
-                                   bool AtTopLevel)
-    : Action(CudaDeviceClass, Input), GpuArch(Arch), AtTopLevel(AtTopLevel) {}
+void OffloadAction::doOnEachDeviceDependence(
+    const OffloadActionWorkTy &Work) const {
+  auto I = getInputs().begin();
+  auto E = getInputs().end();
+  if (I == E)
+    return;
+
+  // We expect to have the same number of input dependences and device tool
+  // chains, except if we also have a host dependence. In that case we have one
+  // more dependence than we have device tool chains.
+  assert(getInputs().size() == DevToolChains.size() + (HostTC ? 1 : 0) &&
+         "Sizes of action dependences and toolchains are not consistent!");
+
+  // Skip host action
+  if (HostTC)
+    ++I;
+
+  auto TI = DevToolChains.begin();
+  for (; I != E; ++I, ++TI)
+    Work(*I, *TI, (*I)->getOffloadingArch());
+}
+
+void OffloadAction::doOnEachDependence(const OffloadActionWorkTy &Work) const {
+  doOnHostDependence(Work);
+  doOnEachDeviceDependence(Work);
+}
+
+void OffloadAction::doOnEachDependence(bool IsHostDependence,
+                                       const OffloadActionWorkTy &Work) const {
+  if (IsHostDependence)
+    doOnHostDependence(Work);
+  else
+    doOnEachDeviceDependence(Work);
+}
 
-void CudaHostAction::anchor() {}
+bool OffloadAction::hasHostDependence() const { return HostTC != nullptr; }
 
-CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions)
-    : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {}
+Action *OffloadAction::getHostDependence() const {
+  assert(hasHostDependence() && "Host dependence does not exist!");
+  assert(!getInputs().empty() && "No dependencies for offload action??");
+  return HostTC ? getInputs().front() : nullptr;
+}
+
+bool OffloadAction::hasSingleDeviceDependence(
+    bool DoNotConsiderHostActions) const {
+  if (DoNotConsiderHostActions)
+    return getInputs().size() == (HostTC ? 2 : 1);
+  return !HostTC && getInputs().size() == 1;
+}
+
+Action *
+OffloadAction::getSingleDeviceDependence(bool DoNotConsiderHostActions) const {
+  assert(hasSingleDeviceDependence(DoNotConsiderHostActions) &&
+         "Single device dependence does not exist!");
+  // The previous assert ensures the number of entries in getInputs() is
+  // consistent with what we are doing here.
+  return HostTC ? getInputs()[1] : getInputs().front();
+}
+
+void OffloadAction::DeviceDependences::add(Action &A, const ToolChain &TC,
+                                           const char *BoundArch,
+                                           OffloadKind OKind) {
+  DeviceActions.push_back(&A);
+  DeviceToolChains.push_back(&TC);
+  DeviceBoundArchs.push_back(BoundArch);
+  DeviceOffloadKinds.push_back(OKind);
+}
+
+OffloadAction::HostDependence::HostDependence(Action &A, const ToolChain &TC,
+                                              const char *BoundArch,
+                                              const DeviceDependences &DDeps)
+    : HostAction(A), HostToolChain(TC), HostBoundArch(BoundArch) {
+  for (auto K : DDeps.getOffloadKinds())
+    HostOffloadKinds |= K;
+}
 
 void JobAction::anchor() {}
 
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 78c3125cdb6..02f4a999771 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -435,7 +435,9 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       })) {
     const ToolChain &TC = getToolChain(
         C.getInputArgs(),
-        llvm::Triple(C.getOffloadingHostToolChain()->getTriple().isArch64Bit()
+        llvm::Triple(C.getSingleOffloadToolChain<Action::OFK_Host>()
+                             ->getTriple()
+                             .isArch64Bit()
                          ? "nvptx64-nvidia-cuda"
                          : "nvptx-nvidia-cuda"));
     C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda);
@@ -1022,19 +1024,33 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
   } else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->input_begin(), Ids) << "}";
-  } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    CudaArch Arch = CDA->getGpuArch();
-    if (Arch != CudaArch::UNKNOWN)
-      os << "'" << CudaArchToString(Arch) << "', ";
-    os << "{" << PrintActions1(C, *CDA->input_begin(), Ids) << "}";
+  } else if (OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+    bool IsFirst = true;
+    OA->doOnEachDependence(
+        [&](Action *A, const ToolChain *TC, const char *BoundArch) {
+          // E.g. for two CUDA device dependences whose bound arch is sm_20 and
+          // sm_35 this will generate:
+          // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device"
+          // (nvptx64-nvidia-cuda:sm_35) {#ID}
+          if (!IsFirst)
+            os << ", ";
+          os << '"';
+          if (TC)
+            os << A->getOffloadingKindPrefix();
+          else
+            os << "host";
+          os << " (";
+          os << TC->getTriple().normalize();
+
+          if (BoundArch)
+            os << ":" << BoundArch;
+          os << ")";
+          os << '"';
+          os << " {" << PrintActions1(C, A, Ids) << "}";
+          IsFirst = false;
+        });
   } else {
-    const ActionList *AL;
-    if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
-      os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}"
-         << ", gpu binaries ";
-      AL = &CHA->getDeviceActions();
-    } else
-      AL = &A->getInputs();
+    const ActionList *AL = &A->getInputs();
 
     if (AL->size()) {
       const char *Prefix = "{";
@@ -1047,10 +1063,24 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
       os << "{}";
   }
 
+  // Append offload info for all options other than the offloading action
+  // itself (e.g. (cuda-device, sm_20) or (cuda-host)).
+  std::string offload_str;
+  llvm::raw_string_ostream offload_os(offload_str);
+  if (!isa<OffloadAction>(A)) {
+    auto S = A->getOffloadingKindPrefix();
+    if (!S.empty()) {
+      offload_os << ", (" << S;
+      if (A->getOffloadingArch())
+        offload_os << ", " << A->getOffloadingArch();
+      offload_os << ")";
+    }
+  }
+
   unsigned Id = Ids.size();
   Ids[A] = Id;
   llvm::errs() << Id << ": " << os.str() << ", "
-               << types::getTypeName(A->getType()) << "\n";
+               << types::getTypeName(A->getType()) << offload_os.str() << "\n";
 
   return Id;
 }
@@ -1378,8 +1408,12 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
       PartialCompilationArg &&
       PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
 
-  if (CompileHostOnly)
-    return C.MakeAction<CudaHostAction>(HostAction, ActionList());
+  if (CompileHostOnly) {
+    OffloadAction::HostDependence HDep(
+        *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+        /*BoundArch=*/nullptr, Action::OFK_Cuda);
+    return C.MakeAction<OffloadAction>(HDep);
+  }
 
   // Collect all cuda_gpu_arch parameters, removing duplicates.
   SmallVector<CudaArch, 4> GpuArchList;
@@ -1408,8 +1442,6 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
     CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
 
   // Build actions for all device inputs.
-  assert(C.getSingleOffloadToolChain<Action::OFK_Cuda>() &&
-         "Missing toolchain for device-side compilation.");
   ActionList CudaDeviceActions;
   C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
   assert(GpuArchList.size() == CudaDeviceActions.size() &&
@@ -1421,6 +1453,8 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
         return a->getKind() != Action::AssembleJobClass;
       });
 
+  const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+
   // Figure out what to do with device actions -- pass them as inputs to the
   // host action or run each of them independently.
   if (PartialCompilation || CompileDeviceOnly) {
@@ -1436,10 +1470,13 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
       return nullptr;
     }
 
-    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-      Actions.push_back(C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I],
-                                                       GpuArchList[I],
-                                                       /* AtTopLevel */ true));
+    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+      OffloadAction::DeviceDependences DDep;
+      DDep.add(*CudaDeviceActions[I], *CudaTC, CudaArchToString(GpuArchList[I]),
+               Action::OFK_Cuda);
+      Actions.push_back(
+          C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+    }
     // Kill host action in case of device-only compilation.
     if (CompileDeviceOnly)
       return nullptr;
@@ -1459,19 +1496,23 @@ static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
     Action* BackendAction = AssembleAction->getInputs()[0];
     assert(BackendAction->getType() == types::TY_PP_Asm);
 
-    for (const auto& A : {AssembleAction, BackendAction}) {
-      DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
-          A, GpuArchList[I], /* AtTopLevel */ false));
+    for (auto &A : {AssembleAction, BackendAction}) {
+      OffloadAction::DeviceDependences DDep;
+      DDep.add(*A, *CudaTC, CudaArchToString(GpuArchList[I]), Action::OFK_Cuda);
+      DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
     }
   }
-  auto FatbinAction = C.MakeAction<CudaDeviceAction>(
-      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
-      CudaArch::UNKNOWN,
-      /* AtTopLevel = */ false);
+  auto FatbinAction =
+      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
+
   // Return a new host action that incorporates original host action and all
   // device actions.
-  return C.MakeAction<CudaHostAction>(std::move(HostAction),
-                                      ActionList({FatbinAction}));
+  OffloadAction::HostDependence HDep(
+      *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+      /*BoundArch=*/nullptr, Action::OFK_Cuda);
+  OffloadAction::DeviceDependences DDep;
+  DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
+  return C.MakeAction<OffloadAction>(HDep, DDep);
 }
 
 void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
@@ -1580,6 +1621,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     YcArg = YuArg = nullptr;
   }
 
+  // Track the host offload kinds used on this compilation.
+  unsigned CompilationActiveOffloadHostKinds = 0u;
+
   // Construct the actions to perform.
   ActionList LinkerInputs;
 
@@ -1648,6 +1692,9 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
             ? phases::Compile
             : FinalPhase;
 
+    // Track the host offload kinds used on this input.
+    unsigned InputActiveOffloadHostKinds = 0u;
+
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
@@ -1679,21 +1726,36 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
         Current = buildCudaActions(C, Args, InputArg, Current, Actions);
         if (!Current)
           break;
+
+        // We produced a CUDA action for this input, so the host has to support
+        // CUDA.
+        InputActiveOffloadHostKinds |= Action::OFK_Cuda;
+        CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
       }
 
       if (Current->getType() == types::TY_Nothing)
         break;
     }
 
-    // If we ended with something, add to the output list.
-    if (Current)
+    // If we ended with something, add to the output list. Also, propagate the
+    // offload information to the top-level host action related with the current
+    // input.
+    if (Current) {
+      if (InputActiveOffloadHostKinds)
+        Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
+                                          /*BoundArch=*/nullptr);
       Actions.push_back(Current);
+    }
   }
 
-  // Add a link action if necessary.
-  if (!LinkerInputs.empty())
+  // Add a link action if necessary and propagate the offload information for
+  // the current compilation.
+  if (!LinkerInputs.empty()) {
     Actions.push_back(
         C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
+    Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
+                                             /*BoundArch=*/nullptr);
+  }
 
   // If we are linking, claim any options which are obviously only used for
   // compilation.
@@ -1829,7 +1891,8 @@ void Driver::BuildJobs(Compilation &C) const {
                        /*BoundArch*/ nullptr,
                        /*AtTopLevel*/ true,
                        /*MultipleArchs*/ ArchNames.size() > 1,
-                       /*LinkingOutput*/ LinkingOutput, CachedResults);
+                       /*LinkingOutput*/ LinkingOutput, CachedResults,
+                       /*BuildForOffloadDevice*/ false);
   }
 
   // If the user passed -Qunused-arguments or there were errors, don't warn
@@ -1878,7 +1941,28 @@ void Driver::BuildJobs(Compilation &C) const {
     }
   }
 }
-
+/// Collapse an offloading action looking for a job of the given type. The input
+/// action is changed to the input of the collapsed sequence. If we effectively
+/// had a collapse return the corresponding offloading action, otherwise return
+/// null.
+template <typename T>
+static OffloadAction *collapseOffloadingAction(Action *&CurAction) {
+  if (!CurAction)
+    return nullptr;
+  if (auto *OA = dyn_cast<OffloadAction>(CurAction)) {
+    if (OA->hasHostDependence())
+      if (auto *HDep = dyn_cast<T>(OA->getHostDependence())) {
+        CurAction = HDep;
+        return OA;
+      }
+    if (OA->hasSingleDeviceDependence())
+      if (auto *DDep = dyn_cast<T>(OA->getSingleDeviceDependence())) {
+        CurAction = DDep;
+        return OA;
+      }
+  }
+  return nullptr;
+}
 // Returns a Tool for a given JobAction.  In case the action and its
 // predecessors can be combined, updates Inputs with the inputs of the
 // first combined action. If one of the collapsed actions is a
@@ -1888,34 +1972,39 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
                                     bool EmbedBitcode, const ToolChain *TC,
                                     const JobAction *JA,
                                     const ActionList *&Inputs,
-                                    const CudaHostAction *&CollapsedCHA) {
+                                    ActionList &CollapsedOffloadAction) {
   const Tool *ToolForJob = nullptr;
-  CollapsedCHA = nullptr;
+  CollapsedOffloadAction.clear();
 
   // See if we should look for a compiler with an integrated assembler. We match
   // bottom up, so what we are actually looking for is an assembler job with a
   // compiler input.
 
+  // Look through offload actions between assembler and backend actions.
+  Action *BackendJA = (isa<AssembleJobAction>(JA) && Inputs->size() == 1)
+                          ? *Inputs->begin()
+                          : nullptr;
+  auto *BackendOA = collapseOffloadingAction<BackendJobAction>(BackendJA);
+
   if (TC->useIntegratedAs() && !SaveTemps &&
       !C.getArgs().hasArg(options::OPT_via_file_asm) &&
       !C.getArgs().hasArg(options::OPT__SLASH_FA) &&
-      !C.getArgs().hasArg(options::OPT__SLASH_Fa) &&
-      isa<AssembleJobAction>(JA) && Inputs->size() == 1 &&
-      isa<BackendJobAction>(*Inputs->begin())) {
+      !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA &&
+      isa<BackendJobAction>(BackendJA)) {
     // A BackendJob is always preceded by a CompileJob, and without -save-temps
     // or -fembed-bitcode, they will always get combined together, so instead of
     // checking the backend tool, check if the tool for the CompileJob has an
     // integrated assembler. For -fembed-bitcode, CompileJob is still used to
     // look up tools for BackendJob, but they need to match before we can split
     // them.
-    const ActionList *BackendInputs = &(*Inputs)[0]->getInputs();
-    // Compile job may be wrapped in CudaHostAction, extract it if
-    // that's the case and update CollapsedCHA if we combine phases.
-    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*BackendInputs->begin());
-    JobAction *CompileJA = cast<CompileJobAction>(
-        CHA ? *CHA->input_begin() : *BackendInputs->begin());
-    assert(CompileJA && "Backend job is not preceeded by compile job.");
-    const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+    // Look through offload actions between backend and compile actions.
+    Action *CompileJA = *BackendJA->getInputs().begin();
+    auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+    assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+           "Backend job is not preceeded by compile job.");
+    const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
     if (!Compiler)
       return nullptr;
     // When using -fembed-bitcode, it is required to have the same tool (clang)
@@ -1929,7 +2018,12 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
     if (Compiler->hasIntegratedAssembler()) {
       Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
-      CollapsedCHA = CHA;
+      // Save the collapsed offload actions because they may still contain
+      // device actions.
+      if (CompileOA)
+        CollapsedOffloadAction.push_back(CompileOA);
+      if (BackendOA)
+        CollapsedOffloadAction.push_back(BackendOA);
     }
   }
 
@@ -1939,20 +2033,23 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
   if (isa<BackendJobAction>(JA)) {
     // Check if the compiler supports emitting LLVM IR.
     assert(Inputs->size() == 1);
-    // Compile job may be wrapped in CudaHostAction, extract it if
-    // that's the case and update CollapsedCHA if we combine phases.
-    CudaHostAction *CHA = dyn_cast<CudaHostAction>(*Inputs->begin());
-    JobAction *CompileJA =
-        cast<CompileJobAction>(CHA ? *CHA->input_begin() : *Inputs->begin());
-    assert(CompileJA && "Backend job is not preceeded by compile job.");
-    const Tool *Compiler = TC->SelectTool(*CompileJA);
+
+    // Look through offload actions between backend and compile actions.
+    Action *CompileJA = *JA->getInputs().begin();
+    auto *CompileOA = collapseOffloadingAction<CompileJobAction>(CompileJA);
+
+    assert(CompileJA && isa<CompileJobAction>(CompileJA) &&
+           "Backend job is not preceeded by compile job.");
+    const Tool *Compiler = TC->SelectTool(*cast<CompileJobAction>(CompileJA));
     if (!Compiler)
       return nullptr;
     if (!Compiler->canEmitIR() ||
         (!SaveTemps && !EmbedBitcode)) {
       Inputs = &CompileJA->getInputs();
       ToolForJob = Compiler;
-      CollapsedCHA = CHA;
+
+      if (CompileOA)
+        CollapsedOffloadAction.push_back(CompileOA);
     }
   }
 
@@ -1963,12 +2060,21 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
   // See if we should use an integrated preprocessor. We do so when we have
   // exactly one input, since this is the only use case we care about
   // (irrelevant since we don't support combine yet).
-  if (Inputs->size() == 1 && isa<PreprocessJobAction>(*Inputs->begin()) &&
+
+  // Look through offload actions after preprocessing.
+  Action *PreprocessJA = (Inputs->size() == 1) ? *Inputs->begin() : nullptr;
+  auto *PreprocessOA =
+      collapseOffloadingAction<PreprocessJobAction>(PreprocessJA);
+
+  if (PreprocessJA && isa<PreprocessJobAction>(PreprocessJA) &&
       !C.getArgs().hasArg(options::OPT_no_integrated_cpp) &&
       !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps &&
       !C.getArgs().hasArg(options::OPT_rewrite_objc) &&
-      ToolForJob->hasIntegratedCPP())
-    Inputs = &(*Inputs)[0]->getInputs();
+      ToolForJob->hasIntegratedCPP()) {
+    Inputs = &PreprocessJA->getInputs();
+    if (PreprocessOA)
+      CollapsedOffloadAction.push_back(PreprocessOA);
+  }
 
   return ToolForJob;
 }
@@ -1976,8 +2082,8 @@ static const Tool *selectToolForJob(Compilation &C, bool SaveTemps,
 InputInfo Driver::BuildJobsForAction(
     Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
-    const {
+    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    bool BuildForOffloadDevice) const {
   // The bound arch is not necessarily represented in the toolchain's triple --
   // for example, armv7 and armv7s both map to the same triple -- so we need
   // both in our map.
@@ -1991,9 +2097,9 @@ InputInfo Driver::BuildJobsForAction(
   if (CachedResult != CachedResults.end()) {
     return CachedResult->second;
   }
-  InputInfo Result =
-      BuildJobsForActionNoCache(C, A, TC, BoundArch, AtTopLevel, MultipleArchs,
-                                LinkingOutput, CachedResults);
+  InputInfo Result = BuildJobsForActionNoCache(
+      C, A, TC, BoundArch, AtTopLevel, MultipleArchs, LinkingOutput,
+      CachedResults, BuildForOffloadDevice);
   CachedResults[ActionTC] = Result;
   return Result;
 }
@@ -2001,21 +2107,65 @@ InputInfo Driver::BuildJobsForAction(
 InputInfo Driver::BuildJobsForActionNoCache(
     Compilation &C, const Action *A, const ToolChain *TC, const char *BoundArch,
     bool AtTopLevel, bool MultipleArchs, const char *LinkingOutput,
-    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults)
-    const {
+    std::map<std::pair<const Action *, std::string>, InputInfo> &CachedResults,
+    bool BuildForOffloadDevice) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
 
-  InputInfoList CudaDeviceInputInfos;
-  if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
-    // Append outputs of device jobs to the input list.
-    for (const Action *DA : CHA->getDeviceActions()) {
-      CudaDeviceInputInfos.push_back(BuildJobsForAction(
-          C, DA, TC, nullptr, AtTopLevel,
-          /*MultipleArchs*/ false, LinkingOutput, CachedResults));
+  InputInfoList OffloadDependencesInputInfo;
+  if (const OffloadAction *OA = dyn_cast<OffloadAction>(A)) {
+    // The offload action is expected to be used in four different situations.
+    //
+    // a) Set a toolchain/architecture/kind for a host action:
+    //    Host Action 1 -> OffloadAction -> Host Action 2
+    //
+    // b) Set a toolchain/architecture/kind for a device action;
+    //    Device Action 1 -> OffloadAction -> Device Action 2
+    //
+    // c) Specify a device dependences to a host action;
+    //    Device Action 1  _
+    //                      \
+    //      Host Action 1  ---> OffloadAction -> Host Action 2
+    //
+    // d) Specify a host dependence to a device action.
+    //      Host Action 1  _
+    //                      \
+    //    Device Action 1  ---> OffloadAction -> Device Action 2
+    //
+    // For a) and b), we just return the job generated for the dependence. For
+    // c) and d) we override the current action with the host/device dependence
+    // if the current toolchain is host/device and set the offload dependences
+    // info with the jobs obtained from the device/host dependence(s).
+
+    // If there is a single device option, just generate the job for it.
+    if (OA->hasSingleDeviceDependence()) {
+      InputInfo DevA;
+      OA->doOnEachDeviceDependence([&](Action *DepA, const ToolChain *DepTC,
+                                       const char *DepBoundArch) {
+        DevA =
+            BuildJobsForAction(C, DepA, DepTC, DepBoundArch, AtTopLevel,
+                               /*MultipleArchs*/ !!DepBoundArch, LinkingOutput,
+                               CachedResults, /*BuildForOffloadDevice=*/true);
+      });
+      return DevA;
     }
-    // Override current action with a real host compile action and continue
-    // processing it.
-    A = *CHA->input_begin();
+
+    // If 'Action 2' is host, we generate jobs for the device dependences and
+    // override the current action with the host dependence. Otherwise, we
+    // generate the host dependences and override the action with the device
+    // dependence. The dependences can't therefore be a top-level action.
+    OA->doOnEachDependence(
+        /*IsHostDependence=*/BuildForOffloadDevice,
+        [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+              C, DepA, DepTC, DepBoundArch, /*AtTopLevel=*/false,
+              /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults,
+              /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+                  Action::OFK_None));
+        });
+
+    A = BuildForOffloadDevice
+            ? OA->getSingleDeviceDependence(/*DoNotConsiderHostActions=*/true)
+            : OA->getHostDependence();
   }
 
   if (const InputAction *IA = dyn_cast<InputAction>(A)) {
@@ -2042,41 +2192,34 @@ InputInfo Driver::BuildJobsForActionNoCache(
       TC = &C.getDefaultToolChain();
 
     return BuildJobsForAction(C, *BAA->input_begin(), TC, ArchName, AtTopLevel,
-                              MultipleArchs, LinkingOutput, CachedResults);
+                              MultipleArchs, LinkingOutput, CachedResults,
+                              BuildForOffloadDevice);
   }
 
-  if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    // Initial processing of CudaDeviceAction carries host params.
-    // Call BuildJobsForAction() again, now with correct device parameters.
-    InputInfo II = BuildJobsForAction(
-        C, *CDA->input_begin(), C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
-        CudaArchToString(CDA->getGpuArch()), CDA->isAtTopLevel(),
-        /*MultipleArchs=*/true, LinkingOutput, CachedResults);
-    // Currently II's Action is *CDA->input_begin().  Set it to CDA instead, so
-    // that one can retrieve II's GPU arch.
-    II.setAction(A);
-    return II;
-  }
 
   const ActionList *Inputs = &A->getInputs();
 
   const JobAction *JA = cast<JobAction>(A);
-  const CudaHostAction *CollapsedCHA = nullptr;
+  ActionList CollapsedOffloadActions;
+
   const Tool *T =
       selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA,
-                       Inputs, CollapsedCHA);
+                       Inputs, CollapsedOffloadActions);
   if (!T)
     return InputInfo();
 
-  // If we've collapsed action list that contained CudaHostAction we
-  // need to build jobs for device-side inputs it may have held.
-  if (CollapsedCHA) {
-    for (const Action *DA : CollapsedCHA->getDeviceActions()) {
-      CudaDeviceInputInfos.push_back(BuildJobsForAction(
-          C, DA, TC, "", AtTopLevel,
-          /*MultipleArchs*/ false, LinkingOutput, CachedResults));
-    }
-  }
+  // If we've collapsed action list that contained OffloadAction we
+  // need to build jobs for host/device-side inputs it may have held.
+  for (const auto *OA : CollapsedOffloadActions)
+    cast<OffloadAction>(OA)->doOnEachDependence(
+        /*IsHostDependence=*/BuildForOffloadDevice,
+        [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) {
+          OffloadDependencesInputInfo.push_back(BuildJobsForAction(
+              C, DepA, DepTC, DepBoundArch, AtTopLevel,
+              /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults,
+              /*BuildForOffloadDevice=*/DepA->getOffloadingDeviceKind() !=
+                  Action::OFK_None));
+        });
 
   // Only use pipes when there is exactly one input.
   InputInfoList InputInfos;
@@ -2086,9 +2229,9 @@ InputInfo Driver::BuildJobsForActionNoCache(
     // FIXME: Clean this up.
     bool SubJobAtTopLevel =
         AtTopLevel && (isa<DsymutilJobAction>(A) || isa<VerifyJobAction>(A));
-    InputInfos.push_back(BuildJobsForAction(C, Input, TC, BoundArch,
-                                            SubJobAtTopLevel, MultipleArchs,
-                                            LinkingOutput, CachedResults));
+    InputInfos.push_back(BuildJobsForAction(
+        C, Input, TC, BoundArch, SubJobAtTopLevel, MultipleArchs, LinkingOutput,
+        CachedResults, BuildForOffloadDevice));
   }
 
   // Always use the first input as the base input.
@@ -2099,9 +2242,10 @@ InputInfo Driver::BuildJobsForActionNoCache(
   if (JA->getType() == types::TY_dSYM)
     BaseInput = InputInfos[0].getFilename();
 
-  // Append outputs of cuda device jobs to the input list
-  if (CudaDeviceInputInfos.size())
-    InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
+  // Append outputs of offload device jobs to the input list
+  if (!OffloadDependencesInputInfo.empty())
+    InputInfos.append(OffloadDependencesInputInfo.begin(),
+                      OffloadDependencesInputInfo.end());
 
   // Determine the place to write output to, if any.
   InputInfo Result;
@@ -2109,7 +2253,8 @@ InputInfo Driver::BuildJobsForActionNoCache(
     Result = InputInfo(A, BaseInput);
   else
     Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch,
-                                             AtTopLevel, MultipleArchs),
+                                             AtTopLevel, MultipleArchs,
+                                             TC->getTriple().normalize()),
                        BaseInput);
 
   if (CCCPrintBindings && !CCGenDiagnostics) {
@@ -2169,7 +2314,8 @@ static const char *MakeCLOutputFilename(const ArgList &Args, StringRef ArgValue,
 const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
                                        const char *BaseInput,
                                        const char *BoundArch, bool AtTopLevel,
-                                       bool MultipleArchs) const {
+                                       bool MultipleArchs,
+                                       StringRef NormalizedTriple) const {
   llvm::PrettyStackTraceString CrashInfo("Computing output path");
   // Output to a user requested destination?
   if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) {
@@ -2255,6 +2401,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
           MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image);
     } else if (MultipleArchs && BoundArch) {
       SmallString<128> Output(getDefaultImageName());
+      Output += JA.getOffloadingFileNamePrefix(NormalizedTriple);
       Output += "-";
       Output.append(BoundArch);
       NamedOutput = C.getArgs().MakeArgString(Output.c_str());
@@ -2271,6 +2418,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
     if (!types::appendSuffixForType(JA.getType()))
       End = BaseName.rfind('.');
     SmallString<128> Suffixed(BaseName.substr(0, End));
+    Suffixed += JA.getOffloadingFileNamePrefix(NormalizedTriple);
     if (MultipleArchs && BoundArch) {
       Suffixed += "-";
       Suffixed.append(BoundArch);
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index cba8924ec2c..e96688cbaf8 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -248,8 +248,7 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
 
   case Action::InputClass:
   case Action::BindArchClass:
-  case Action::CudaDeviceClass:
-  case Action::CudaHostClass:
+  case Action::OffloadClass:
   case Action::LipoJobClass:
   case Action::DsymutilJobClass:
   case Action::VerifyDebugInfoJobClass:
diff --git a/clang/lib/Driver/Tools.cpp b/clang/lib/Driver/Tools.cpp
index 63284bc1b2a..df4a996e47f 100644
--- a/clang/lib/Driver/Tools.cpp
+++ b/clang/lib/Driver/Tools.cpp
@@ -296,12 +296,45 @@ static bool forwardToGCC(const Option &O) {
          !O.hasFlag(options::DriverOption) && !O.hasFlag(options::LinkerInput);
 }
 
+/// Add the C++ include args of other offloading toolchains. If this is a host
+/// job, the device toolchains are added. If this is a device job, the host
+/// toolchains will be added.
+static void addExtraOffloadCXXStdlibIncludeArgs(Compilation &C,
+                                                const JobAction &JA,
+                                                const ArgList &Args,
+                                                ArgStringList &CmdArgs) {
+
+  if (JA.isHostOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+        ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+  else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Host>()
+        ->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+
+  // TODO: Add support for other programming models here.
+}
+
+/// Add the include args that are specific of each offloading programming model.
+static void addExtraOffloadSpecificIncludeArgs(Compilation &C,
+                                               const JobAction &JA,
+                                               const ArgList &Args,
+                                               ArgStringList &CmdArgs) {
+
+  if (JA.isHostOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Host>()->AddCudaIncludeArgs(
+        Args, CmdArgs);
+  else if (JA.isDeviceOffloading(Action::OFK_Cuda))
+    C.getSingleOffloadToolChain<Action::OFK_Cuda>()->AddCudaIncludeArgs(
+        Args, CmdArgs);
+
+  // TODO: Add support for other programming models here.
+}
+
 void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
                                     const Driver &D, const ArgList &Args,
                                     ArgStringList &CmdArgs,
                                     const InputInfo &Output,
-                                    const InputInfoList &Inputs,
-                                    const ToolChain *AuxToolChain) const {
+                                    const InputInfoList &Inputs) const {
   Arg *A;
   const bool IsIAMCU = getToolChain().getTriple().isOSIAMCU();
 
@@ -566,31 +599,27 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
   // OBJCPLUS_INCLUDE_PATH - system includes enabled when compiling ObjC++.
   addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
 
-  // Optional AuxToolChain indicates that we need to include headers
-  // for more than one target. If that's the case, add include paths
-  // from AuxToolChain right after include paths of the same kind for
-  // the current target.
+  // While adding the include arguments, we also attempt to retrieve the
+  // arguments of related offloading toolchains or arguments that are specific
+  // of an offloading programming model.
 
   // Add C++ include arguments, if needed.
   if (types::isCXX(Inputs[0].getType())) {
     getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
   }
 
   // Add system include arguments for all targets but IAMCU.
   if (!IsIAMCU) {
     getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
-    if (AuxToolChain)
-      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    addExtraOffloadCXXStdlibIncludeArgs(C, JA, Args, CmdArgs);
   } else {
     // For IAMCU add special include arguments.
     getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs);
   }
 
-  // Add CUDA include arguments, if needed.
-  if (types::isCuda(Inputs[0].getType()))
-    getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
+  // Add offload include arguments, if needed.
+  addExtraOffloadSpecificIncludeArgs(C, JA, Args, CmdArgs);
 }
 
 // FIXME: Move to target hook.
@@ -3799,7 +3828,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // CUDA compilation may have multiple inputs (source file + results of
   // device-side compilations). All other jobs are expected to have exactly one
   // input.
-  bool IsCuda = types::isCuda(Input.getType());
+  bool IsCuda = JA.isOffloading(Action::OFK_Cuda);
   assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
 
   // C++ is not supported for IAMCU.
@@ -3815,21 +3844,21 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
-  const ToolChain *AuxToolChain = nullptr;
   if (IsCuda) {
-    // FIXME: We need a (better) way to pass information about
-    // particular compilation pass we're constructing here. For now we
-    // can check which toolchain we're using and pick the other one to
-    // extract the triple.
-    if (&getToolChain() == C.getSingleOffloadToolChain<Action::OFK_Cuda>())
-      AuxToolChain = C.getOffloadingHostToolChain();
-    else if (&getToolChain() == C.getOffloadingHostToolChain())
-      AuxToolChain = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+    // We have to pass the triple of the host if compiling for a CUDA device and
+    // vice-versa.
+    StringRef NormalizedTriple;
+    if (JA.isDeviceOffloading(Action::OFK_Cuda))
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Host>()
+                             ->getTriple()
+                             .normalize();
     else
-      llvm_unreachable("Can't figure out CUDA compilation mode.");
-    assert(AuxToolChain != nullptr && "No aux toolchain.");
+      NormalizedTriple = C.getSingleOffloadToolChain<Action::OFK_Cuda>()
+                             ->getTriple()
+                             .normalize();
+
     CmdArgs.push_back("-aux-triple");
-    CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
+    CmdArgs.push_back(Args.MakeArgString(NormalizedTriple));
   }
 
   if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
@@ -4718,8 +4747,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   //
   // FIXME: Support -fpreprocessed
   if (types::getPreprocessedType(InputType) != types::TY_INVALID)
-    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs,
-                            AuxToolChain);
+    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);
 
   // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
   // that "The compiler can only warn and ignore the option if not recognized".
@@ -11193,15 +11221,14 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
-  std::vector<std::string> gpu_archs =
-      Args.getAllArgValues(options::OPT_march_EQ);
-  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
-  const std::string& gpu_arch = gpu_archs[0];
+  // Obtain architecture from the action.
+  CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
+  assert(gpu_arch != CudaArch::UNKNOWN &&
+         "Device action expected to have an architecture.");
 
   // Check that our installation's ptxas supports gpu_arch.
   if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
-    TC.cudaInstallation().CheckCudaVersionSupportsArch(
-        StringToCudaArch(gpu_arch));
+    TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch);
   }
 
   ArgStringList CmdArgs;
@@ -11245,7 +11272,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   CmdArgs.push_back("--gpu-name");
-  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
   for (const auto& II : Inputs)
@@ -11277,13 +11304,20 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
 
   for (const auto& II : Inputs) {
-    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    auto *A = II.getAction();
+    assert(A->getInputs().size() == 1 &&
+           "Device offload action is expected to have a single input");
+    const char *gpu_arch_str = A->getOffloadingArch();
+    assert(gpu_arch_str &&
+           "Device action expected to have associated a GPU architecture!");
+    CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
+
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
     const char *Arch =
         (II.getType() == types::TY_PP_Asm)
-            ? CudaVirtualArchToString(VirtualArchForCudaArch(A->getGpuArch()))
-            : CudaArchToString(A->getGpuArch());
+            ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+            : gpu_arch_str;
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
                                          Arch + ",file=" + II.getFilename()));
   }
diff --git a/clang/lib/Driver/Tools.h b/clang/lib/Driver/Tools.h
index 2e546fc6538..02bdb8e5e2d 100644
--- a/clang/lib/Driver/Tools.h
+++ b/clang/lib/Driver/Tools.h
@@ -57,8 +57,7 @@ private:
                                const Driver &D, const llvm::opt::ArgList &Args,
                                llvm::opt::ArgStringList &CmdArgs,
                                const InputInfo &Output,
-                               const InputInfoList &Inputs,
-                               const ToolChain *AuxToolChain) const;
+                               const InputInfoList &Inputs) const;
 
   void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const;
diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 6c44932f55e..1e9e57afb6b 100644
--- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -60,25 +60,25 @@ clang::createInvocationFromCommandLine(ArrayRef<const char *> ArgList,
   }
 
   // We expect to get back exactly one command job, if we didn't something
-  // failed. CUDA compilation is an exception as it creates multiple jobs. If
-  // that's the case, we proceed with the first job. If caller needs particular
-  // CUDA job, it should be controlled via --cuda-{host|device}-only option
-  // passed to the driver.
+  // failed. Offload compilation is an exception as it creates multiple jobs. If
+  // that's the case, we proceed with the first job. If caller needs a
+  // particular job, it should be controlled via options (e.g.
+  // --cuda-{host|device}-only for CUDA) passed to the driver.
   const driver::JobList &Jobs = C->getJobs();
-  bool CudaCompilation = false;
+  bool OffloadCompilation = false;
   if (Jobs.size() > 1) {
     for (auto &A : C->getActions()){
       // On MacOSX real actions may end up being wrapped in BindArchAction
       if (isa<driver::BindArchAction>(A))
         A = *A->input_begin();
-      if (isa<driver::CudaDeviceAction>(A)) {
-        CudaCompilation = true;
+      if (isa<driver::OffloadAction>(A)) {
+        OffloadCompilation = true;
         break;
       }
     }
   }
   if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) ||
-      (Jobs.size() > 1 && !CudaCompilation)) {
+      (Jobs.size() > 1 && !OffloadCompilation)) {
     SmallString<256> Msg;
     llvm::raw_svector_ostream OS(Msg);
     Jobs.Print(OS, "; ", true);