Index: include/clang/Driver/Action.h =================================================================== --- include/clang/Driver/Action.h +++ include/clang/Driver/Action.h @@ -12,6 +12,7 @@ #include "clang/Driver/Types.h" #include "clang/Driver/Util.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" namespace llvm { @@ -26,6 +27,8 @@ namespace clang { namespace driver { +class ToolChain; + /// Action - Represent an abstract compilation step to perform. /// /// An action represents an edge in the compilation graph; typically @@ -49,8 +52,7 @@ enum ActionClass { InputClass = 0, BindArchClass, - CudaDeviceClass, - CudaHostClass, + OffloadClass, PreprocessJobClass, PrecompileJobClass, AnalyzeJobClass, @@ -70,10 +72,6 @@ // The offloading kind determines if this action is binded to a particular // programming model. Each entry reserves one bit. - // - // FIXME: This is currently used to indicate that toolchains are used in a - // given programming as well, but will be used here as well once a generic - // offloading action is implemented. enum OffloadKind { OFFLOAD_None = 0x00, OFFLOAD_CUDA = 0x01, @@ -90,13 +88,24 @@ ActionList Inputs; protected: + /// Offload information. It has to be mutable as it needs to be adjusted if + /// actions are integrated. + /// \brief Multiple programming models may be supported simultaneously by the + /// same host. Therefore, the host offloading kind is a combination of kinds. + mutable unsigned OffloadingHostKind; + /// \brief Offloading kind of the device. + mutable OffloadKind OffloadingDeviceKind; + /// \brief The Offloading architecture associated with this action. 
+ mutable const char *OffloadingArch; + Action(ActionClass Kind, types::ID Type) : Action(Kind, ActionList(), Type) {} Action(ActionClass Kind, Action *Input, types::ID Type) : Action(Kind, ActionList({Input}), Type) {} Action(ActionClass Kind, Action *Input) : Action(Kind, ActionList({Input}), Input->getType()) {} Action(ActionClass Kind, const ActionList &Inputs, types::ID Type) - : Kind(Kind), Type(Type), Inputs(Inputs) {} + : Kind(Kind), Type(Type), Inputs(Inputs), OffloadingHostKind(0u), + OffloadingDeviceKind(OFFLOAD_None), OffloadingArch(nullptr) {} public: virtual ~Action(); @@ -119,6 +128,36 @@ input_const_range inputs() const { return input_const_range(input_begin(), input_end()); } + + std::string getOffloadingKindPrefix() const; + std::string getOffloadingFileNamePrefix(const ToolChain *TC) const; + + /// \brief Set the device offload info of this action and propagate it to its + /// dependences. + void propagateDeviceOffloadInfo(OffloadKind OKind, const char *OArch) const; + /// \brief Append the host offload info of this action and propagate it to its + /// dependences. + void propagateHostOffloadInfo(unsigned OKinds, const char *OArch) const; + /// \brief Set the offload info of this action to be the same as the provided + /// action, and propagate it to its dependences. + void propagateOffloadInfo(const Action *A) const; + + unsigned getOffloadingHostKinds() const { return OffloadingHostKind; } + OffloadKind getOffloadingDeviceKind() const { return OffloadingDeviceKind; } + const char *getOffloadingArch() const { return OffloadingArch; } + + /// \brief Check if this action have any offload kinds. Note that host offload + /// kinds are only set if the action is a dependence to an host offload + /// action. 
+ bool isHostOffloading(OffloadKind OKind) const { + return OffloadingHostKind & OKind; + } + bool isDeviceOffloading(OffloadKind OKind) const { + return OffloadingDeviceKind == OKind; + } + bool isOffloading(OffloadKind OKind) const { + return isHostOffloading(OKind) || isDeviceOffloading(OKind); + } }; class InputAction : public Action { @@ -151,43 +190,102 @@ } }; -class CudaDeviceAction : public Action { +/// \brief An offload action combines host or/and device actions according to +/// the programming model implementation needs and propagates the offloading +/// kind to its dependences. +class OffloadAction : public Action { virtual void anchor(); - /// GPU architecture to bind. Always of the form /sm_\d+/ or null (when the - /// action applies to multiple architectures). - const char *GpuArchName; - /// True when action results are not consumed by the host action (e.g when - /// -fsyntax-only or --cuda-device-only options are used). - bool AtTopLevel; - public: - CudaDeviceAction(Action *Input, const char *ArchName, bool AtTopLevel); + /// \brief Type used to communicate device actions. It associates bound + /// architecture, toolchain, and offload kind to each action. + class DeviceDependences { + public: + typedef SmallVector ToolChainList; + typedef SmallVector BoundArchList; + typedef SmallVector OffloadKindList; + + private: + /// \brief The dependence action. + ActionList AL; + /// \brief The offloading toolchains that should be used with the action. + SmallVector TCL; + /// \brief The architectures that should be used with this action. + SmallVector BAL; + /// \brief The offload kind of each dependence. + SmallVector KL; + + public: + /// \brief Add a action along with the associated toolchain, bound arch, and + /// offload kind. + void add(Action *A, const ToolChain *TC, const char *BoundArch, + OffloadKind OKind); + + /// \brief Get each of the individual arrays. 
+ const ActionList &getActions() const { return AL; }; + const ToolChainList &getToolChains() const { return TCL; }; + const BoundArchList &getBoundArchs() const { return BAL; }; + const OffloadKindList &getOffloadKinds() const { return KL; }; + }; + + /// \brief Type used to communicate host actions. It associates bound + /// architecture, toolchain, and offload kinds to each action. + class HostDependence { + /// \brief The dependence action. + Action *A; + /// \brief The offloading toolchain that should be used with the action. + const ToolChain *TC; + /// \brief The architectures that should be used with this action. + const char *BoundArch; + /// \brief The offload kind of each dependence. + unsigned OffloadKinds; + + public: + HostDependence(Action *A, const ToolChain *TC, const char *BoundArch, + const unsigned OffloadKinds) + : A(A), TC(TC), BoundArch(BoundArch), OffloadKinds(OffloadKinds){}; + /// \brief Constructor version that obtains the offload kinds from the + /// device dependencies. + HostDependence(Action *A, const ToolChain *TC, const char *BoundArch, + const DeviceDependences &DDeps); + Action *getAction() const { return A; }; + const ToolChain *getToolChain() const { return TC; }; + const char *getBoundArch() const { return BoundArch; }; + unsigned getOffloadKinds() const { return OffloadKinds; }; + }; - const char *getGpuArchName() const { return GpuArchName; } + typedef llvm::function_ref + OffloadActionWorkTy; - /// Gets the compute_XX that corresponds to getGpuArchName(). Returns null - /// when getGpuArchName() is null. - const char *getComputeArchName() const; +private: + /// \brief The offloading toolchain that should be used with the action. + const ToolChain *HostTC; - bool isAtTopLevel() const { return AtTopLevel; } + /// \brief The tool chains associated with the list of actions. 
+ DeviceDependences::ToolChainList DevToolChains; - static bool IsValidGpuArchName(llvm::StringRef ArchName); +public: + OffloadAction(const HostDependence &HDep); + OffloadAction(const DeviceDependences &DDeps, types::ID Ty); + OffloadAction(const HostDependence &HDep, const DeviceDependences &DDeps); - static bool classof(const Action *A) { - return A->getKind() == CudaDeviceClass; - } -}; + /// \brief Execute the work specified in \a Work on the host dependence. + void doOnHostDependence(const OffloadActionWorkTy &Work) const; -class CudaHostAction : public Action { - virtual void anchor(); - ActionList DeviceActions; + /// \brief Execute the work specified in \a Work on each device dependence. + void doOnEachDeviceDependence(const OffloadActionWorkTy &Work) const; -public: - CudaHostAction(Action *Input, const ActionList &DeviceActions); + /// \brief Execute the work specified in \a Work on each dependence. + void doOnEachDependence(const OffloadActionWorkTy &Work) const; + + /// \brief Return the host dependence of this action, or null if we don't have + /// any. + Action *getHostDependence() const; - const ActionList &getDeviceActions() const { return DeviceActions; } + /// \brief Return the single device dependence of this action, or null if we + /// don't have one or we have more than one. + Action *getSingleDeviceDependence() const; - static bool classof(const Action *A) { return A->getKind() == CudaHostClass; } + static bool classof(const Action *A) { return A->getKind() == OffloadClass; } }; class JobAction : public Action { Index: include/clang/Driver/Driver.h =================================================================== --- include/clang/Driver/Driver.h +++ include/clang/Driver/Driver.h @@ -415,12 +415,11 @@ /// \param BoundArch - The bound architecture. /// \param AtTopLevel - Whether this is a "top-level" action. /// \param MultipleArchs - Whether multiple -arch options were supplied. 
- const char *GetNamedOutputPath(Compilation &C, - const JobAction &JA, - const char *BaseInput, - const char *BoundArch, - bool AtTopLevel, - bool MultipleArchs) const; + /// \param TC - Toolchain associated with the output. + const char *GetNamedOutputPath(Compilation &C, const JobAction &JA, + const char *BaseInput, const char *BoundArch, + bool AtTopLevel, bool MultipleArchs, + const ToolChain *TC) const; /// GetTemporaryPath - Return the pathname of a temporary file to use /// as part of compilation; the file will have the given prefix and suffix. Index: lib/Driver/Action.cpp =================================================================== --- lib/Driver/Action.cpp +++ lib/Driver/Action.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "clang/Driver/Action.h" +#include "clang/Driver/ToolChain.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" @@ -21,8 +22,7 @@ switch (AC) { case InputClass: return "input"; case BindArchClass: return "bind-arch"; - case CudaDeviceClass: return "cuda-device"; - case CudaHostClass: return "cuda-host"; + case OffloadClass: return "offload"; case PreprocessJobClass: return "preprocessor"; case PrecompileJobClass: return "precompiler"; case AnalyzeJobClass: return "analyzer"; @@ -40,6 +40,79 @@ llvm_unreachable("invalid class"); } +void Action::propagateDeviceOffloadInfo(OffloadKind OKind, + const char *OArch) const { + // Offload action set its own kinds on their dependences. 
+ if (Kind == OffloadClass) + return; + + assert( + (OffloadingDeviceKind == OKind || OffloadingDeviceKind == OFFLOAD_None) && + "Setting device kind to a different device??"); + assert(!OffloadingHostKind && "Setting a device kind in a host action??"); + OffloadingDeviceKind = OKind; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateDeviceOffloadInfo(OffloadingDeviceKind, OArch); +} + +void Action::propagateHostOffloadInfo(unsigned OKinds, + const char *OArch) const { + // Offload action set its own kinds on their dependences. + if (Kind == OffloadClass) + return; + + assert(OffloadingDeviceKind == OFFLOAD_None && + "Setting a host kind in a device action."); + OffloadingHostKind |= OKinds; + OffloadingArch = OArch; + + for (auto *A : Inputs) + A->propagateHostOffloadInfo(OffloadingHostKind, OArch); +} + +void Action::propagateOffloadInfo(const Action *A) const { + if (unsigned HK = A->getOffloadingHostKinds()) + propagateHostOffloadInfo(HK, A->getOffloadingArch()); + else + propagateDeviceOffloadInfo(A->getOffloadingDeviceKind(), + A->getOffloadingArch()); +} + +std::string Action::getOffloadingKindPrefix() const { + switch (OffloadingDeviceKind) { + case OFFLOAD_None: + break; + case OFFLOAD_CUDA: + return "device-cuda"; + // Add other programming models here. + } + + if (!OffloadingHostKind) + return ""; + + std::string Res("host"); + if (OffloadingHostKind & OFFLOAD_CUDA) + Res += "-cuda"; + // Add other programming models here. + + return Res; +} + +std::string Action::getOffloadingFileNamePrefix(const ToolChain *TC) const { + // A file prefix is only generated for device actions and consists of the + // offload kind and triple. 
+ if (!OffloadingDeviceKind) + return ""; + + std::string Res("-"); + Res += getOffloadingKindPrefix(); + Res += "-"; + Res += TC->getTriple().normalize(); + return Res; +} + void InputAction::anchor() {} InputAction::InputAction(const Arg &_Input, types::ID _Type) @@ -51,45 +124,106 @@ BindArchAction::BindArchAction(Action *Input, const char *_ArchName) : Action(BindArchClass, Input), ArchName(_ArchName) {} -// Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual -// compute arch, e.g. "compute_20". Returns null if the input arch is null or -// doesn't match an existing arch. -static const char* GpuArchToComputeName(const char *ArchName) { - if (!ArchName) - return nullptr; - return llvm::StringSwitch(ArchName) - .Cases("sm_20", "sm_21", "compute_20") - .Case("sm_30", "compute_30") - .Case("sm_32", "compute_32") - .Case("sm_35", "compute_35") - .Case("sm_37", "compute_37") - .Case("sm_50", "compute_50") - .Case("sm_52", "compute_52") - .Case("sm_53", "compute_53") - .Default(nullptr); +void OffloadAction::anchor() {} + +OffloadAction::OffloadAction(const HostDependence &HDep) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()) { + OffloadingArch = HDep.getBoundArch(); + OffloadingHostKind = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); +}; + +OffloadAction::OffloadAction(const DeviceDependences &DDeps, types::ID Ty) + : Action(OffloadClass, DDeps.getActions(), Ty), HostTC(nullptr), + DevToolChains(DDeps.getToolChains()) { + auto &OKinds = DDeps.getOffloadKinds(); + auto &BArchs = DDeps.getBoundArchs(); + + // If we have a single dependency, inherit the offloading info from it. + if (OKinds.size() == 1) { + OffloadingDeviceKind = OKinds.front(); + OffloadingArch = BArchs.front(); + } + // Propagate info to the dependencies. 
+ for (unsigned i = 0; i < getInputs().size(); ++i) + getInputs()[i]->propagateDeviceOffloadInfo(OKinds[i], BArchs[i]); } -void CudaDeviceAction::anchor() {} +OffloadAction::OffloadAction(const HostDependence &HDep, + const DeviceDependences &DDeps) + : Action(OffloadClass, HDep.getAction()), HostTC(HDep.getToolChain()), + DevToolChains(DDeps.getToolChains()) { + // We use the kinds of the host dependence for this action. + OffloadingArch = HDep.getBoundArch(); + OffloadingHostKind = HDep.getOffloadKinds(); + HDep.getAction()->propagateHostOffloadInfo(HDep.getOffloadKinds(), + HDep.getBoundArch()); + + // Add device inputs and propagate info to the device actions. + for (unsigned i = 0; i < DDeps.getActions().size(); ++i) { + auto *A = DDeps.getActions()[i]; + // Skip actions of empty dependences. + if (!A) + continue; + getInputs().push_back(A); + A->propagateDeviceOffloadInfo(DDeps.getOffloadKinds()[i], + DDeps.getBoundArchs()[i]); + } +} -CudaDeviceAction::CudaDeviceAction(Action *Input, const char *ArchName, - bool AtTopLevel) - : Action(CudaDeviceClass, Input), GpuArchName(ArchName), - AtTopLevel(AtTopLevel) { - assert(!GpuArchName || IsValidGpuArchName(GpuArchName)); +void OffloadAction::doOnHostDependence(const OffloadActionWorkTy &Work) const { + if (!HostTC) + return; + auto *A = getInputs().front(); + Work(A, HostTC, A->getOffloadingArch()); } -const char *CudaDeviceAction::getComputeArchName() const { - return GpuArchToComputeName(GpuArchName); +void OffloadAction::doOnEachDeviceDependence( + const OffloadActionWorkTy &Work) const { + auto I = getInputs().begin(); + auto E = getInputs().end(); + if (I == E) + return; + + // Skip host action + if (HostTC) + ++I; + + auto TI = DevToolChains.begin(); + for (; I != E; ++I) + Work(*I, *TI, (*I)->getOffloadingArch()); } -bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) { - return GpuArchToComputeName(ArchName.data()) != nullptr; +void OffloadAction::doOnEachDependence(const 
OffloadActionWorkTy &Work) const { + doOnHostDependence(Work); + doOnEachDeviceDependence(Work); } -void CudaHostAction::anchor() {} +Action *OffloadAction::getHostDependence() const { + return HostTC ? getInputs().front() : nullptr; +} + +Action *OffloadAction::getSingleDeviceDependence() const { + return (!HostTC && getInputs().size() == 1) ? getInputs().front() : nullptr; +} -CudaHostAction::CudaHostAction(Action *Input, const ActionList &DeviceActions) - : Action(CudaHostClass, Input), DeviceActions(DeviceActions) {} +void OffloadAction::DeviceDependences::add(Action *A, const ToolChain *TC, + const char *BoundArch, + OffloadKind OKind) { + AL.push_back(A); + TCL.push_back(TC); + BAL.push_back(BoundArch); + KL.push_back(OKind); +} + +OffloadAction::HostDependence::HostDependence(Action *A, const ToolChain *TC, + const char *BoundArch, + const DeviceDependences &DDeps) + : A(A), TC(TC), BoundArch(BoundArch), OffloadKinds(0u) { + for (auto K : DDeps.getOffloadKinds()) + OffloadKinds |= K; +} void JobAction::anchor() {} Index: lib/Driver/Driver.cpp =================================================================== --- lib/Driver/Driver.cpp +++ lib/Driver/Driver.cpp @@ -987,18 +987,33 @@ } else if (BindArchAction *BIA = dyn_cast(A)) { os << '"' << BIA->getArchName() << '"' << ", {" << PrintActions1(C, *BIA->input_begin(), Ids) << "}"; - } else if (CudaDeviceAction *CDA = dyn_cast(A)) { - os << '"' - << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)") - << '"' << ", {" << PrintActions1(C, *CDA->input_begin(), Ids) << "}"; + } else if (OffloadAction *OA = dyn_cast(A)) { + bool IsFirst = true; + OA->doOnEachDependence( + [&](Action *A, const ToolChain *TC, const char *BoundArch) { + // E.g. 
for two CUDA device dependences whose bound arch is sm_20 and + // sm_35 this will generate: + // "cuda-device" (nvptx64-nvidia-cuda:sm_20) {#ID}, "cuda-device" + // (nvptx64-nvidia-cuda:sm_35) {#ID} + if (!IsFirst) + os << ", "; + os << '"'; + if (TC) + os << A->getOffloadingKindPrefix(); + else + os << "host"; + os << " ("; + os << TC->getTriple().normalize(); + + if (BoundArch) + os << ":" << BoundArch; + os << ")"; + os << '"'; + os << " {" << PrintActions1(C, A, Ids) << "}"; + IsFirst = false; + }); } else { - const ActionList *AL; - if (CudaHostAction *CHA = dyn_cast(A)) { - os << "{" << PrintActions1(C, *CHA->input_begin(), Ids) << "}" - << ", gpu binaries "; - AL = &CHA->getDeviceActions(); - } else - AL = &A->getInputs(); + const ActionList *AL = &A->getInputs(); if (AL->size()) { const char *Prefix = "{"; @@ -1011,10 +1026,24 @@ os << "{}"; } + // Append offload info for all options other than the offloading action + // itself (e.g. (cuda-device, sm_20) or (cuda-host)). + std::string offload_str; + llvm::raw_string_ostream offload_os(offload_str); + if (!isa(A)) { + auto S = A->getOffloadingKindPrefix(); + if (!S.empty()) { + offload_os << ", (" << S; + if (A->getOffloadingArch()) + offload_os << ", " << A->getOffloadingArch(); + offload_os << ")"; + } + } + unsigned Id = Ids.size(); Ids[A] = Id; llvm::errs() << Id << ": " << os.str() << ", " - << types::getTypeName(A->getType()) << "\n"; + << types::getTypeName(A->getType()) << offload_os.str() << "\n"; return Id; } @@ -1327,8 +1356,12 @@ options::OPT_cuda_device_only); // Host-only compilation case. 
if (PartialCompilationArg && - PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) - return C.MakeAction(HostAction, ActionList()); + PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only)) { + OffloadAction::HostDependence HDep( + HostAction, C.getOffloadingHostToolChain(), /*BoundArch=*/nullptr, + Action::OFFLOAD_CUDA); + return C.MakeAction(HDep); + } // Collect all cuda_gpu_arch parameters, removing duplicates. SmallVector GpuArchList; @@ -1339,7 +1372,7 @@ A->claim(); const auto& Arch = A->getValue(); - if (!CudaDeviceAction::IsValidGpuArchName(Arch)) + if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch; else if (GpuArchNames.insert(Arch).second) GpuArchList.push_back(Arch); @@ -1355,9 +1388,11 @@ for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg)); + const ToolChain *CudaTC = + C.getSingleOffloadDeviceToolChain(); + // Build actions for all device inputs. - assert(C.getSingleOffloadDeviceToolChain() && - "Missing toolchain for device-side compilation."); + assert(CudaTC && "Missing toolchain for device-side compilation."); ActionList CudaDeviceActions; C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions); assert(GpuArchList.size() == CudaDeviceActions.size() && @@ -1385,10 +1420,13 @@ return nullptr; } - for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) - Actions.push_back(C.MakeAction(CudaDeviceActions[I], - GpuArchList[I], - /* AtTopLevel */ true)); + for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) { + OffloadAction::DeviceDependences DDep; + DDep.add(CudaDeviceActions[I], CudaTC, GpuArchList[I], + Action::OFFLOAD_CUDA); + Actions.push_back( + C.MakeAction(DDep, CudaDeviceActions[I]->getType())); + } // Kill host action in case of device-only compilation. 
if (DeviceOnlyCompilation) return nullptr; @@ -1408,19 +1446,23 @@ Action* BackendAction = AssembleAction->getInputs()[0]; assert(BackendAction->getType() == types::TY_PP_Asm); - for (const auto& A : {AssembleAction, BackendAction}) { - DeviceActions.push_back(C.MakeAction( - A, GpuArchList[I], /* AtTopLevel */ false)); + for (auto &A : {AssembleAction, BackendAction}) { + OffloadAction::DeviceDependences DDep; + DDep.add(A, CudaTC, GpuArchList[I], Action::OFFLOAD_CUDA); + DeviceActions.push_back(C.MakeAction(DDep, A->getType())); } } - auto FatbinAction = C.MakeAction( - C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN), - /* GpuArchName = */ nullptr, - /* AtTopLevel = */ false); + auto FatbinAction = + C.MakeAction(DeviceActions, types::TY_CUDA_FATBIN); + // Return a new host action that incorporates original host action and all // device actions. - return C.MakeAction(std::move(HostAction), - ActionList({FatbinAction})); + OffloadAction::HostDependence HDep(HostAction, C.getOffloadingHostToolChain(), + /*BoundArch=*/nullptr, + Action::OFFLOAD_CUDA); + OffloadAction::DeviceDependences DDep; + DDep.add(FatbinAction, CudaTC, /*BoundArch=*/nullptr, Action::OFFLOAD_CUDA); + return C.MakeAction(HDep, DDep); } void Driver::BuildActions(Compilation &C, DerivedArgList &Args, @@ -1825,7 +1867,28 @@ } } } - +// Collapse an offloading action looking for a job of the given type. The input +// action is changed to the input of the collapsed sequence. If we effectively +// had a collapse return the corresponding offloading action, otherwise return +// null. 
+template +static OffloadAction *collapseOffloadingAction(Action *&CurAction) { + if (!CurAction) + return nullptr; + if (auto *OA = dyn_cast(CurAction)) { + if (auto *HDep = OA->getHostDependence()) + if (isa(HDep)) { + CurAction = HDep; + return OA; + } + if (auto *DDep = OA->getSingleDeviceDependence()) + if (isa(DDep)) { + CurAction = DDep; + return OA; + } + } + return nullptr; +} // Returns a Tool for a given JobAction. In case the action and its // predecessors can be combined, updates Inputs with the inputs of the // first combined action. If one of the collapsed actions is a @@ -1835,34 +1898,39 @@ bool EmbedBitcode, const ToolChain *TC, const JobAction *JA, const ActionList *&Inputs, - const CudaHostAction *&CollapsedCHA) { + ActionList &CollapsedOffloadAction) { const Tool *ToolForJob = nullptr; - CollapsedCHA = nullptr; + CollapsedOffloadAction.clear(); // See if we should look for a compiler with an integrated assembler. We match // bottom up, so what we are actually looking for is an assembler job with a // compiler input. + // Look through offload actions between assembler and backend actions. + Action *BackendJA = (isa(JA) && Inputs->size() == 1) + ? *Inputs->begin() + : nullptr; + auto *BackendOA = collapseOffloadingAction(BackendJA); + if (TC->useIntegratedAs() && !SaveTemps && !C.getArgs().hasArg(options::OPT_via_file_asm) && !C.getArgs().hasArg(options::OPT__SLASH_FA) && - !C.getArgs().hasArg(options::OPT__SLASH_Fa) && - isa(JA) && Inputs->size() == 1 && - isa(*Inputs->begin())) { + !C.getArgs().hasArg(options::OPT__SLASH_Fa) && BackendJA && + isa(BackendJA)) { // A BackendJob is always preceded by a CompileJob, and without -save-temps // or -fembed-bitcode, they will always get combined together, so instead of // checking the backend tool, check if the tool for the CompileJob has an // integrated assembler. For -fembed-bitcode, CompileJob is still used to // look up tools for BackendJob, but they need to match before we can split // them. 
- const ActionList *BackendInputs = &(*Inputs)[0]->getInputs(); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*BackendInputs->begin()); - JobAction *CompileJA = cast( - CHA ? *CHA->input_begin() : *BackendInputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *BackendJA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; // When using -fembed-bitcode, it is required to have the same tool (clang) @@ -1876,7 +1944,15 @@ if (Compiler->hasIntegratedAssembler()) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + // Save the collapsed offload actions because they may still contain + // device action. Also propagate the offloading info of the inputs to the + // other action that are being integrated. + if (CompileOA) + CollapsedOffloadAction.push_back(CompileOA); + if (BackendOA) + CollapsedOffloadAction.push_back(BackendOA); + if (CompileOA || BackendOA) + JA->propagateOffloadInfo(CompileJA); } } @@ -1886,20 +1962,25 @@ if (isa(JA)) { // Check if the compiler supports emitting LLVM IR. assert(Inputs->size() == 1); - // Compile job may be wrapped in CudaHostAction, extract it if - // that's the case and update CollapsedCHA if we combine phases. - CudaHostAction *CHA = dyn_cast(*Inputs->begin()); - JobAction *CompileJA = - cast(CHA ? 
*CHA->input_begin() : *Inputs->begin()); - assert(CompileJA && "Backend job is not preceeded by compile job."); - const Tool *Compiler = TC->SelectTool(*CompileJA); + + // Look through offload actions between backend and compile actions. + Action *CompileJA = *JA->getInputs().begin(); + auto *CompileOA = collapseOffloadingAction(CompileJA); + + assert(CompileJA && isa(CompileJA) && + "Backend job is not preceeded by compile job."); + const Tool *Compiler = TC->SelectTool(*cast(CompileJA)); if (!Compiler) return nullptr; if (!Compiler->canEmitIR() || (!SaveTemps && !EmbedBitcode)) { Inputs = &CompileJA->getInputs(); ToolForJob = Compiler; - CollapsedCHA = CHA; + + if (CompileOA) { + CollapsedOffloadAction.push_back(CompileOA); + JA->propagateOffloadInfo(CompileJA); + } } } @@ -1910,12 +1991,23 @@ // See if we should use an integrated preprocessor. We do so when we have // exactly one input, since this is the only use case we care about // (irrelevant since we don't support combine yet). - if (Inputs->size() == 1 && isa(*Inputs->begin()) && + + // Look through offload actions after preprocessing. + Action *PreprocessJA = (Inputs->size() == 1) ? 
*Inputs->begin() : nullptr; + auto *PreprocessOA = + collapseOffloadingAction(PreprocessJA); + + if (PreprocessJA && isa(PreprocessJA) && !C.getArgs().hasArg(options::OPT_no_integrated_cpp) && !C.getArgs().hasArg(options::OPT_traditional_cpp) && !SaveTemps && !C.getArgs().hasArg(options::OPT_rewrite_objc) && - ToolForJob->hasIntegratedCPP()) - Inputs = &(*Inputs)[0]->getInputs(); + ToolForJob->hasIntegratedCPP()) { + Inputs = &PreprocessJA->getInputs(); + if (PreprocessOA) { + CollapsedOffloadAction.push_back(PreprocessOA); + JA->propagateOffloadInfo(PreprocessJA); + } + } return ToolForJob; } @@ -1952,17 +2044,31 @@ const { llvm::PrettyStackTraceString CrashInfo("Building compilation jobs"); - InputInfoList CudaDeviceInputInfos; - if (const CudaHostAction *CHA = dyn_cast(A)) { - // Append outputs of device jobs to the input list. - for (const Action *DA : CHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, nullptr, AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); + InputInfoList OffloadDeviceInputInfos; + if (const OffloadAction *OA = dyn_cast(A)) { + Action *HostAction = nullptr; + OA->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs*/ !!DepBoundArch, LinkingOutput, CachedResults)); + }); + OA->doOnHostDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + HostAction = DepA; + }); + + // If we have a single device action, just return its info. + if (!HostAction && OffloadDeviceInputInfos.size() == 1) { + return OffloadDeviceInputInfos.back(); } + + assert(HostAction && "Device actions are only expected to be used by the " + "host, not by each other."); + // Override current action with a real host compile action and continue // processing it. 
- A = *CHA->input_begin(); + A = HostAction; } if (const InputAction *IA = dyn_cast(A)) { @@ -1992,38 +2098,27 @@ MultipleArchs, LinkingOutput, CachedResults); } - if (const CudaDeviceAction *CDA = dyn_cast(A)) { - // Initial processing of CudaDeviceAction carries host params. - // Call BuildJobsForAction() again, now with correct device parameters. - InputInfo II = BuildJobsForAction( - C, *CDA->input_begin(), - C.getSingleOffloadDeviceToolChain(), - CDA->getGpuArchName(), CDA->isAtTopLevel(), /*MultipleArchs=*/true, - LinkingOutput, CachedResults); - // Currently II's Action is *CDA->input_begin(). Set it to CDA instead, so - // that one can retrieve II's GPU arch. - II.setAction(A); - return II; - } const ActionList *Inputs = &A->getInputs(); const JobAction *JA = cast(A); - const CudaHostAction *CollapsedCHA = nullptr; + ActionList CollapsedOffloadActions; + const Tool *T = selectToolForJob(C, isSaveTempsEnabled(), embedBitcodeEnabled(), TC, JA, - Inputs, CollapsedCHA); + Inputs, CollapsedOffloadActions); if (!T) return InputInfo(); - // If we've collapsed action list that contained CudaHostAction we + // If we've collapsed action list that contained OffloadAction we // need to build jobs for device-side inputs it may have held. - if (CollapsedCHA) { - for (const Action *DA : CollapsedCHA->getDeviceActions()) { - CudaDeviceInputInfos.push_back(BuildJobsForAction( - C, DA, TC, "", AtTopLevel, - /*MultipleArchs*/ false, LinkingOutput, CachedResults)); - } + for (const auto *OA : CollapsedOffloadActions) { + cast(OA)->doOnEachDeviceDependence( + [&](Action *DepA, const ToolChain *DepTC, const char *DepBoundArch) { + OffloadDeviceInputInfos.push_back(BuildJobsForAction( + C, DepA, DepTC, DepBoundArch, AtTopLevel, + /*MultipleArchs=*/!!DepBoundArch, LinkingOutput, CachedResults)); + }); } // Only use pipes when there is exactly one input. 
@@ -2047,9 +2142,10 @@ if (JA->getType() == types::TY_dSYM) BaseInput = InputInfos[0].getFilename(); - // Append outputs of cuda device jobs to the input list - if (CudaDeviceInputInfos.size()) - InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end()); + // Append outputs of offload device jobs to the input list + if (!OffloadDeviceInputInfos.empty()) + InputInfos.append(OffloadDeviceInputInfos.begin(), + OffloadDeviceInputInfos.end()); // Determine the place to write output to, if any. InputInfo Result; @@ -2057,7 +2153,7 @@ Result = InputInfo(A, BaseInput); else Result = InputInfo(A, GetNamedOutputPath(C, *JA, BaseInput, BoundArch, - AtTopLevel, MultipleArchs), + AtTopLevel, MultipleArchs, TC), BaseInput); if (CCCPrintBindings && !CCGenDiagnostics) { @@ -2117,7 +2213,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, const char *BaseInput, const char *BoundArch, bool AtTopLevel, - bool MultipleArchs) const { + bool MultipleArchs, + const ToolChain *TC) const { llvm::PrettyStackTraceString CrashInfo("Computing output path"); // Output to a user requested destination? 
if (AtTopLevel && !isa<DsymutilJobAction>(JA) && !isa<VerifyJobAction>(JA)) { @@ -2203,6 +2300,7 @@ MakeCLOutputFilename(C.getArgs(), "", BaseName, types::TY_Image); } else if (MultipleArchs && BoundArch) { SmallString<128> Output(getDefaultImageName()); + Output += JA.getOffloadingFileNamePrefix(TC); Output += "-"; Output.append(BoundArch); NamedOutput = C.getArgs().MakeArgString(Output.c_str()); @@ -2219,6 +2317,7 @@ if (!types::appendSuffixForType(JA.getType())) End = BaseName.rfind('.'); SmallString<128> Suffixed(BaseName.substr(0, End)); + Suffixed += JA.getOffloadingFileNamePrefix(TC); if (MultipleArchs && BoundArch) { Suffixed += "-"; Suffixed.append(BoundArch); Index: lib/Driver/ToolChain.cpp =================================================================== --- lib/Driver/ToolChain.cpp +++ lib/Driver/ToolChain.cpp @@ -248,8 +248,7 @@ case Action::InputClass: case Action::BindArchClass: - case Action::CudaDeviceClass: - case Action::CudaHostClass: + case Action::OffloadClass: case Action::LipoJobClass: case Action::DsymutilJobClass: case Action::VerifyDebugInfoJobClass: Index: lib/Driver/ToolChains.h =================================================================== --- lib/Driver/ToolChains.h +++ lib/Driver/ToolChains.h @@ -833,6 +833,11 @@ // ptxas. bool useIntegratedAs() const override { return false; } + // Converts CUDA GPU architecture, e.g. "sm_21", to its corresponding virtual + // compute arch, e.g. "compute_20". Returns null if the input arch is null or + // doesn't match an existing arch.
+ static const char *GpuArchToComputeName(const char *ArchName); + protected: Tool *buildAssembler() const override; // ptxas Tool *buildLinker() const override; // fatbinary (ok, not really a linker) Index: lib/Driver/ToolChains.cpp =================================================================== --- lib/Driver/ToolChains.cpp +++ lib/Driver/ToolChains.cpp @@ -4291,6 +4291,21 @@ return DAL; } +const char *CudaToolChain::GpuArchToComputeName(const char *ArchName) { + if (!ArchName) + return nullptr; + return llvm::StringSwitch<const char *>(ArchName) + .Cases("sm_20", "sm_21", "compute_20") + .Case("sm_30", "compute_30") + .Case("sm_32", "compute_32") + .Case("sm_35", "compute_35") + .Case("sm_37", "compute_37") + .Case("sm_50", "compute_50") + .Case("sm_52", "compute_52") + .Case("sm_53", "compute_53") + .Default(nullptr); +} + Tool *CudaToolChain::buildAssembler() const { return new tools::NVPTX::Assembler(*this); } Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -3565,7 +3565,7 @@ // CUDA compilation may have multiple inputs (source file + results of // device-side compilations). All other jobs are expected to have exactly one // input. - bool IsCuda = types::isCuda(Input.getType()); + bool IsCuda = JA.isOffloading(Action::OFFLOAD_CUDA); assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs."); // Invoke ourselves in -cc1 mode. @@ -3583,13 +3583,13 @@ // particular compilation pass we're constructing here. For now we // can check which toolchain we're using and pick the other one to // extract the triple.
- if (&getToolChain() == - C.getSingleOffloadDeviceToolChain()) + if (JA.isDeviceOffloading(Action::OFFLOAD_CUDA)) AuxToolChain = C.getOffloadingHostToolChain(); - else if (&getToolChain() == C.getOffloadingHostToolChain()) + else { + assert(C.isOffloadingHostKind(Action::OFFLOAD_CUDA) && + "Expecting CUDA host toolchain."); AuxToolChain = C.getSingleOffloadDeviceToolChain(); - else - llvm_unreachable("Can't figure out CUDA compilation mode."); + } assert(AuxToolChain != nullptr && "No aux toolchain."); CmdArgs.push_back("-aux-triple"); CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str())); @@ -10883,10 +10883,9 @@ static_cast<const toolchains::CudaToolChain &>(getToolChain()); assert(TC.getTriple().isNVPTX() && "Wrong platform"); - std::vector<std::string> gpu_archs = - Args.getAllArgValues(options::OPT_march_EQ); - assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas."); - const std::string& gpu_arch = gpu_archs[0]; + // Obtain architecture from the action. + const char *gpu_arch = JA.getOffloadingArch(); + assert(gpu_arch && "Device action expected to have an architecture."); ArgStringList CmdArgs; CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32"); @@ -10960,12 +10959,19 @@ CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); for (const auto& II : Inputs) { - auto* A = cast<CudaDeviceAction>(II.getAction()); + auto *A = II.getAction(); + assert(A->getInputs().size() == 1 && + "Device offload action is expected to have a single input"); + const char *gpu_arch = A->getOffloadingArch(); + assert(gpu_arch && + "Device action expected to have associated a GPU architecture!"); + // We need to pass an Arch of the form "sm_XX" for cubin files and // "compute_XX" for ptx. - const char *Arch = (II.getType() == types::TY_PP_Asm) - ? A->getComputeArchName() - : A->getGpuArchName(); + const char *Arch = + (II.getType() == types::TY_PP_Asm) + ?
toolchains::CudaToolChain::GpuArchToComputeName(gpu_arch) + : gpu_arch; CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") + Arch + ",file=" + II.getFilename())); } Index: lib/Frontend/CreateInvocationFromCommandLine.cpp =================================================================== --- lib/Frontend/CreateInvocationFromCommandLine.cpp +++ lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -60,25 +60,25 @@ } // We expect to get back exactly one command job, if we didn't something - // failed. CUDA compilation is an exception as it creates multiple jobs. If - // that's the case, we proceed with the first job. If caller needs particular - // CUDA job, it should be controlled via --cuda-{host|device}-only option - // passed to the driver. + // failed. Offload compilation is an exception as it creates multiple jobs. If + // that's the case, we proceed with the first job. If caller needs a + // particular job, it should be controlled via options (e.g. + // --cuda-{host|device}-only for CUDA) passed to the driver. const driver::JobList &Jobs = C->getJobs(); - bool CudaCompilation = false; + bool OffloadCompilation = false; if (Jobs.size() > 1) { for (auto &A : C->getActions()){ // On MacOSX real actions may end up being wrapped in BindArchAction if (isa<driver::BindArchAction>(A)) A = *A->input_begin(); - if (isa<driver::CudaDeviceAction>(A)) { - CudaCompilation = true; + if (isa<driver::OffloadAction>(A)) { + OffloadCompilation = true; break; } } } if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) || - (Jobs.size() > 1 && !CudaCompilation)) { + (Jobs.size() > 1 && !OffloadCompilation)) { SmallString<256> Msg; llvm::raw_svector_ostream OS(Msg); Jobs.Print(OS, "; ", true);