Index: include/clang/Driver/Compilation.h
===================================================================
--- include/clang/Driver/Compilation.h
+++ include/clang/Driver/Compilation.h
@@ -116,6 +116,12 @@
     return OrderedOffloadingToolchains.equal_range(Kind);
   }
 
+  /// Return true if an offloading tool chain of a given kind exists.
+  template <Action::OffloadKind Kind> bool hasOffloadToolChain() const {
+    return OrderedOffloadingToolchains.find(Kind) !=
+           OrderedOffloadingToolchains.end();
+  }
+
   /// Return an offload toolchain of the provided kind. Only one is expected to
   /// exist.
   template <Action::OffloadKind Kind>
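For illustration only (not part of the patch): the new accessor is meant to
pair with the existing getSingleOffloadToolChain<>(), which expects exactly
one tool chain of the requested kind to exist. A hypothetical caller would
first probe and then fetch:

  // Sketch of intended usage; CudaTC is a hypothetical local.
  if (C.hasOffloadToolChain<Action::OFK_Cuda>()) {
    const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
    // ... build CUDA device actions against CudaTC ...
  }

CudaActionBuilder::initialize() in the Driver.cpp changes below follows this
same pattern.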
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -1387,131 +1387,521 @@
   }
 }
 
-// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE
-// input action and then wraps each in CudaDeviceAction paired with
-// appropriate GPU arch name. In case of partial (i.e preprocessing
-// only) or device-only compilation, each device action is added to /p
-// Actions and /p Current is released. Otherwise the function creates
-// and returns a new CudaHostAction which wraps /p Current and device
-// side actions.
-static Action *buildCudaActions(Compilation &C, DerivedArgList &Args,
-                                const Arg *InputArg, Action *HostAction,
-                                ActionList &Actions) {
-  Arg *PartialCompilationArg = Args.getLastArg(
-      options::OPT_cuda_host_only, options::OPT_cuda_device_only,
-      options::OPT_cuda_compile_host_device);
-  bool CompileHostOnly =
-      PartialCompilationArg &&
-      PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only);
-  bool CompileDeviceOnly =
-      PartialCompilationArg &&
-      PartialCompilationArg->getOption().matches(options::OPT_cuda_device_only);
-
-  if (CompileHostOnly) {
+namespace {
+/// Provides a convenient interface for different programming models to
+/// generate the required device actions.
+class OffloadingActionBuilder final {
+  /// Flag used to trace errors in the builder.
+  bool IsValid = false;
+
+  /// The compilation that is using this builder.
+  Compilation &C;
+
+  /// The derived arguments associated with this builder.
+  DerivedArgList &Args;
+
+  /// Map between an input argument and the offload kinds used to process it.
+  std::map<const Arg *, unsigned> InputArgToOffloadKindMap;
+
+  /// Builder interface. It doesn't build anything or keep any state.
+  class DeviceActionBuilder {
+  public:
+    typedef llvm::SmallVector<phases::ID, phases::MaxNumberOfPhases> PhasesTy;
+
+    enum ActionBuilderReturnCode {
+      // The builder acted successfully on the current action.
+      ABRT_Success,
+      // The builder didn't have to act on the current action.
+      ABRT_Inactive,
+      // The builder was successful and requested the host action to not be
+      // generated.
+      ABRT_Ignore_Host,
+    };
+
+  protected:
+    /// Compilation associated with this builder.
+    Compilation &C;
+
+    /// Tool chains associated with this builder. The same programming
+    /// model may have associated one or more tool chains.
+    SmallVector<const ToolChain *, 4> ToolChains;
+
+    /// The derived arguments associated with this builder.
+    DerivedArgList &Args;
+
+    /// The inputs associated with this builder.
+    const Driver::InputList &Inputs;
+
+    /// The associated offload kind.
+    Action::OffloadKind AssociatedOffloadKind = Action::OFK_None;
+
+  public:
+    DeviceActionBuilder(Compilation &C, DerivedArgList &Args,
+                        const Driver::InputList &Inputs,
+                        Action::OffloadKind AssociatedOffloadKind)
+        : C(C), Args(Args), Inputs(Inputs),
+          AssociatedOffloadKind(AssociatedOffloadKind) {}
+    virtual ~DeviceActionBuilder() {}
+
+    /// Fill up the array \a DA with all the device dependences that should be
+    /// added to the provided host action. By default it is inactive.
+    virtual ActionBuilderReturnCode
+    getDeviceDependences(OffloadAction::DeviceDependences &DA,
+                         phases::ID CurPhase, phases::ID FinalPhase,
+                         PhasesTy &Phases) {
+      return ABRT_Inactive;
+    }
+
+    /// Update the state to include the provided host action \a HostAction as
+    /// a dependency of the current device action. By default it is inactive.
+    virtual ActionBuilderReturnCode addDeviceDependences(Action *HostAction) {
+      return ABRT_Inactive;
+    }
+
+    /// Append top level actions generated by the builder.
+    virtual void appendTopLevelActions(ActionList &AL) {}
+
+    /// Append linker actions generated by the builder.
+    virtual void appendLinkDependences(OffloadAction::DeviceDependences &DA) {}
+
+    /// Initialize the builder. Return true if any initialization errors are
+    /// found.
+    virtual bool initialize() { return false; }
+
+    /// Return true if this builder is valid. We have a valid builder if we
+    /// have associated device tool chains.
+    bool isValid() { return !ToolChains.empty(); }
+
+    /// Return the associated offload kind.
+    Action::OffloadKind getAssociatedOffloadKind() {
+      return AssociatedOffloadKind;
+    }
+  };
+
+  /// \brief CUDA action builder. It injects device code in the host backend
+  /// action.
+  class CudaActionBuilder final : public DeviceActionBuilder {
+    /// Flags to signal if the user requested host-only or device-only
+    /// compilation.
+    bool CompileHostOnly = false;
+    bool CompileDeviceOnly = false;
+
+    /// List of GPU architectures to use in this compilation.
+    SmallVector<const char *, 4> GpuArchList;
+
+    /// The CUDA actions for the current input.
+    ActionList CudaDeviceActions;
+
+    /// The CUDA fat binary if it was generated for the current input.
+    Action *CudaFatBinary = nullptr;
+
+    /// Flag that is set to true if this builder acted on the current input.
+    bool IsActive = false;
+
+  public:
+    CudaActionBuilder(Compilation &C, DerivedArgList &Args,
+                      const Driver::InputList &Inputs)
+        : DeviceActionBuilder(C, Args, Inputs, Action::OFK_Cuda) {}
+
+    ActionBuilderReturnCode
+    getDeviceDependences(OffloadAction::DeviceDependences &DA,
+                         phases::ID CurPhase, phases::ID FinalPhase,
+                         PhasesTy &Phases) override {
+      if (!IsActive)
+        return ABRT_Inactive;
+
+      // If we don't have more CUDA actions, we don't have any dependences to
+      // create for the host.
+      if (CudaDeviceActions.empty())
+        return ABRT_Success;
+
+      assert(CudaDeviceActions.size() == GpuArchList.size() &&
+             "Expecting one action per GPU architecture.");
+      assert(!CompileHostOnly &&
+             "Not expecting CUDA actions in host-only compilation.");
+
+      // If we are generating code for the device or we are in a backend
+      // phase, we attempt to generate the fat binary. We compile each arch
+      // to ptx and assemble to cubin, then feed the cubin *and* the ptx into
+      // a device "link" action, which uses fatbinary to combine these cubins
+      // into one fatbin. The fatbin is then an input to the host action if
+      // not in device-only mode.
+      if (CompileDeviceOnly || CurPhase == phases::Backend) {
+        ActionList DeviceActions;
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+          // Produce the device action from the current phase up to the
+          // assemble phase.
+          for (auto Ph : Phases) {
+            // Skip the phases that were already dealt with.
+            if (Ph < CurPhase)
+              continue;
+            // We have to be consistent with the host final phase.
+            if (Ph > FinalPhase)
+              break;
+
+            CudaDeviceActions[I] = C.getDriver().ConstructPhaseAction(
+                C, Args, Ph, CudaDeviceActions[I]);
+
+            if (Ph == phases::Assemble)
+              break;
+          }
+
+          // If we didn't reach the assemble phase, we can't generate the fat
+          // binary.
+          if (!isa<AssembleJobAction>(CudaDeviceActions[I]))
+            continue;
+
+          Action *AssembleAction = CudaDeviceActions[I];
+          assert(AssembleAction->getType() == types::TY_Object);
+          assert(AssembleAction->getInputs().size() == 1);
+
+          Action *BackendAction = AssembleAction->getInputs()[0];
+          assert(BackendAction->getType() == types::TY_PP_Asm);
+
+          for (auto &A : {AssembleAction, BackendAction}) {
+            OffloadAction::DeviceDependences DDep;
+            DDep.add(*A, *ToolChains.front(), GpuArchList[I],
+                     Action::OFK_Cuda);
+            DeviceActions.push_back(
+                C.MakeAction<OffloadAction>(DDep, A->getType()));
+          }
+        }
+
+        // We generate the fat binary if we have device input actions.
+        if (!DeviceActions.empty()) {
+          CudaFatBinary = C.MakeAction<LinkJobAction>(DeviceActions,
+                                                      types::TY_CUDA_FATBIN);
+
+          if (!CompileDeviceOnly) {
+            DA.add(*CudaFatBinary, *ToolChains.front(), /*BoundArch=*/nullptr,
+                   Action::OFK_Cuda);
+            // Clear the fat binary, it is already a dependence of a host
+            // action.
+            CudaFatBinary = nullptr;
+          }
+
+          // Remove the CUDA actions as they are already connected to a host
+          // action or fat binary.
+          CudaDeviceActions.clear();
+        }
+
+        // We avoid creating host action in device-only mode.
+        return CompileDeviceOnly ? ABRT_Ignore_Host : ABRT_Success;
+      }
+
+      assert(CurPhase < phases::Backend && "Generating single CUDA "
+                                           "instructions should only occur "
+                                           "before the backend phase!");
+
+      // By default, we produce an action for each device arch.
+      for (Action *&A : CudaDeviceActions)
+        A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A);
+
+      return ABRT_Success;
+    }
+
+    ActionBuilderReturnCode addDeviceDependences(Action *HostAction) override {
+      // While generating code for CUDA, we only depend on the host input
+      // action to trigger the creation of all the CUDA device actions.
+
+      // If we are dealing with an input action, replicate it for each GPU
+      // architecture. If we are in host-only mode we return 'success' so
+      // that the host uses the CUDA offload kind.
+      if (auto *IA = dyn_cast<InputAction>(HostAction)) {
+        assert(!GpuArchList.empty() &&
+               "We should have at least one GPU architecture.");
+
+        // If the host input is not CUDA, we don't need to bother about this
+        // input.
+        if (IA->getType() != types::TY_CUDA) {
+          // The builder will ignore this input.
+          IsActive = false;
+          return ABRT_Inactive;
+        }
+
+        // Set the flag to true, so that the builder acts on the current
+        // input.
+        IsActive = true;
+
+        if (CompileHostOnly)
+          return ABRT_Success;
+
+        // Replicate inputs for each GPU architecture.
+        for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+          CudaDeviceActions.push_back(C.MakeAction<InputAction>(
+              IA->getInputArg(), types::TY_CUDA_DEVICE));
+
+        return ABRT_Success;
+      }
+
+      return IsActive ? ABRT_Success : ABRT_Inactive;
+    }
+
+    void appendTopLevelActions(ActionList &AL) override {
+      // Utility to append actions to the top level list.
+      auto AddTopLevel = [&](Action *A, const char *BoundArch) {
+        OffloadAction::DeviceDependences Dep;
+        Dep.add(*A, *ToolChains.front(), BoundArch, Action::OFK_Cuda);
+        AL.push_back(C.MakeAction<OffloadAction>(Dep, A->getType()));
+      };
+
+      // If we have a fat binary, add it to the list.
+      if (CudaFatBinary) {
+        AddTopLevel(CudaFatBinary, /*BoundArch=*/nullptr);
+        CudaDeviceActions.clear();
+        CudaFatBinary = nullptr;
+        return;
+      }
+
+      if (CudaDeviceActions.empty())
+        return;
+
+      // If we have CUDA actions at this point, that's because we have a
+      // partial compilation, so we should have an action for each GPU
+      // architecture.
+      assert(CudaDeviceActions.size() == GpuArchList.size() &&
+             "Expecting one action per GPU architecture.");
+      assert(ToolChains.size() == 1 &&
+             "Expecting to have a single CUDA toolchain.");
+      for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
+        AddTopLevel(CudaDeviceActions[I], GpuArchList[I]);
+
+      CudaDeviceActions.clear();
+    }
+
+    bool initialize() override {
+      // We don't need to support CUDA.
+      if (!C.hasOffloadToolChain<Action::OFK_Cuda>())
+        return false;
+
+      ToolChains.push_back(C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+
+      Arg *PartialCompilationArg = Args.getLastArg(
+          options::OPT_cuda_host_only, options::OPT_cuda_device_only,
+          options::OPT_cuda_compile_host_device);
+      CompileHostOnly = PartialCompilationArg &&
+                        PartialCompilationArg->getOption().matches(
+                            options::OPT_cuda_host_only);
+      CompileDeviceOnly = PartialCompilationArg &&
+                          PartialCompilationArg->getOption().matches(
+                              options::OPT_cuda_device_only);
+
+      // Collect all cuda_gpu_arch parameters, removing duplicates.
+      llvm::StringSet<> GpuArchNames;
+      bool Error = false;
+      for (Arg *A : Args) {
+        if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
+          continue;
+        A->claim();
+
+        const auto &Arch = A->getValue();
+        if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch)) {
+          C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch;
+          Error = true;
+        } else if (GpuArchNames.insert(Arch).second)
+          GpuArchList.push_back(Arch);
+      }
+
+      // Default to sm_20 which is the lowest common denominator for supported
+      // GPUs.
+      // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
+      if (GpuArchList.empty())
+        GpuArchList.push_back("sm_20");
+
+      return Error;
+    }
+  };
+
+  /// Add the implementation for other specialized builders here.
+
+  /// Specialized builders being used by this offloading action builder.
+  SmallVector<DeviceActionBuilder *, 4> SpecializedBuilders;
+
+public:
+  OffloadingActionBuilder(Compilation &C, DerivedArgList &Args,
+                          const Driver::InputList &Inputs)
+      : C(C), Args(Args) {
+    // Create a specialized builder for each device toolchain.
+
+    IsValid = true;
+
+    // Create a specialized builder for CUDA.
+    SpecializedBuilders.push_back(new CudaActionBuilder(C, Args, Inputs));
+
+    //
+    // TODO: Build other specialized builders here.
+    //
+
+    // Initialize all the builders, keeping track of errors.
+    for (auto *SB : SpecializedBuilders)
+      IsValid = IsValid && !SB->initialize();
+  }
+
+  ~OffloadingActionBuilder() {
+    for (auto *SB : SpecializedBuilders)
+      delete SB;
+  }
+
+  /// Generate an action that adds device dependences (if any) to a host
+  /// action. If no device dependence actions exist, just return the host
+  /// action \a HostAction. If an error is found or if no builder requires
+  /// the host action to be generated, return nullptr.
+  Action *
+  addDeviceDependencesToHostAction(Action *HostAction, const Arg *InputArg,
+                                   phases::ID CurPhase, phases::ID FinalPhase,
+                                   DeviceActionBuilder::PhasesTy &Phases) {
+    if (!IsValid)
+      return nullptr;
+
+    if (SpecializedBuilders.empty())
+      return HostAction;
+
+    assert(HostAction && "Invalid host action!");
+
+    OffloadAction::DeviceDependences DDeps;
+    // Check if all the programming models agree we should not emit the host
+    // action. Also, keep track of the offloading kinds employed.
+    auto &OffloadKind = InputArgToOffloadKindMap[InputArg];
+    unsigned InactiveBuilders = 0u;
+    unsigned IgnoringBuilders = 0u;
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid()) {
+        ++InactiveBuilders;
+        continue;
+      }
+
+      auto RetCode =
+          SB->getDeviceDependences(DDeps, CurPhase, FinalPhase, Phases);
+
+      // If the builder explicitly says the host action should be ignored,
+      // increment the count of builders that request the host object to be
+      // ignored.
+      if (RetCode == DeviceActionBuilder::ABRT_Ignore_Host)
+        ++IgnoringBuilders;
+
+      // Unless the builder was inactive for this action, we have to record
+      // the offload kind because the host will have to use it.
+      if (RetCode != DeviceActionBuilder::ABRT_Inactive)
+        OffloadKind |= SB->getAssociatedOffloadKind();
+    }
+
+    // If all builders agree that the host object should be ignored, just
+    // return nullptr.
+    if (IgnoringBuilders &&
+        SpecializedBuilders.size() == (InactiveBuilders + IgnoringBuilders))
+      return nullptr;
+
+    if (DDeps.getActions().empty())
+      return HostAction;
+
+    // We have dependences we need to bundle together. We use an offload
+    // action for that.
     OffloadAction::HostDependence HDep(
         *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
-        /*BoundArch=*/nullptr, Action::OFK_Cuda);
-    return C.MakeAction<OffloadAction>(HDep);
+        /*BoundArch=*/nullptr, DDeps);
+    return C.MakeAction<OffloadAction>(HDep, DDeps);
   }
 
-  // Collect all cuda_gpu_arch parameters, removing duplicates.
-  SmallVector<const char *, 4> GpuArchList;
-  llvm::StringSet<> GpuArchNames;
-  for (Arg *A : Args) {
-    if (!A->getOption().matches(options::OPT_cuda_gpu_arch_EQ))
-      continue;
-    A->claim();
+  /// Generate an action that adds a host dependence to a device action. The
+  /// results will be kept in this action builder. Return true if an error
+  /// was found.
+  bool addHostDependenceToDeviceActions(Action *HostAction,
+                                        const Arg *InputArg) {
+    if (!IsValid)
+      return true;
 
-    const auto& Arch = A->getValue();
-    if (!toolchains::CudaToolChain::GpuArchToComputeName(Arch))
-      C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << Arch;
-    else if (GpuArchNames.insert(Arch).second)
-      GpuArchList.push_back(Arch);
-  }
-
-  // Default to sm_20 which is the lowest common denominator for supported GPUs.
-  // sm_20 code should work correctly, if suboptimally, on all newer GPUs.
-  if (GpuArchList.empty())
-    GpuArchList.push_back("sm_20");
-
-  // Replicate inputs for each GPU architecture.
-  Driver::InputList CudaDeviceInputs;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-    CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
-
-  // Build actions for all device inputs.
-  ActionList CudaDeviceActions;
-  C.getDriver().BuildActions(C, Args, CudaDeviceInputs, CudaDeviceActions);
-  assert(GpuArchList.size() == CudaDeviceActions.size() &&
-         "Failed to create actions for all devices");
-
-  // Check whether any of device actions stopped before they could generate PTX.
-  bool PartialCompilation =
-      llvm::any_of(CudaDeviceActions, [](const Action *a) {
-        return a->getKind() != Action::AssembleJobClass;
-      });
+    assert(HostAction && "Invalid host action!");
+
+    // Register the offload kinds that are used.
+    auto &OffloadKind = InputArgToOffloadKindMap[InputArg];
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid())
+        continue;
+
+      auto RetCode = SB->addDeviceDependences(HostAction);
 
-  const ToolChain *CudaTC = C.getSingleOffloadToolChain<Action::OFK_Cuda>();
+      // Host dependences for device actions are not compatible with that
+      // same action being ignored.
+      assert(RetCode != DeviceActionBuilder::ABRT_Ignore_Host &&
+             "Host dependence not expected to be ignored!");
 
-  // Figure out what to do with device actions -- pass them as inputs to the
-  // host action or run each of them independently.
-  if (PartialCompilation || CompileDeviceOnly) {
-    // In case of partial or device-only compilation results of device actions
-    // are not consumed by the host action device actions have to be added to
-    // top-level actions list with AtTopLevel=true and run independently.
+      // Unless the builder was inactive for this action, we have to record
+      // the offload kind because the host will have to use it.
+      if (RetCode != DeviceActionBuilder::ABRT_Inactive)
+        OffloadKind |= SB->getAssociatedOffloadKind();
+    }
+
+    return false;
+  }
 
-    // -o is ambiguous if we have more than one top-level action.
-    if (Args.hasArg(options::OPT_o) &&
-        (!CompileDeviceOnly || GpuArchList.size() > 1)) {
+  /// Add the offloading top level actions to the provided action list.
+  bool appendTopLevelActions(ActionList &AL, Action *HostAction,
+                             const Arg *InputArg) {
+    auto NumActions = AL.size();
+
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid())
+        continue;
+      SB->appendTopLevelActions(AL);
+    }
+
+    assert(NumActions <= AL.size() && "Expecting more actions, not less!");
+
+    // Propagate to the current host action (if any) the offload information
+    // associated with the current input.
+    if (HostAction)
+      HostAction->propagateHostOffloadInfo(InputArgToOffloadKindMap[InputArg],
+                                           /*BoundArch=*/nullptr);
+
+    // If any action is added by the builders, -o is ambiguous if we have
+    // more than one top-level action.
+    if (NumActions < AL.size() && Args.hasArg(options::OPT_o) &&
+        AL.size() > 1) {
       C.getDriver().Diag(
           clang::diag::err_drv_output_argument_with_multiple_files);
-      return nullptr;
+      return true;
     }
 
-    for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-      OffloadAction::DeviceDependences DDep;
-      DDep.add(*CudaDeviceActions[I], *CudaTC, GpuArchList[I],
-               Action::OFK_Cuda);
-      Actions.push_back(
-          C.MakeAction<OffloadAction>(DDep, CudaDeviceActions[I]->getType()));
+    return false;
+  }
+
+  /// Processes the host linker action. This currently consists of replacing
+  /// it with an offload action if there are device link objects and
+  /// propagating to the host action all the offload kinds used in the
+  /// current compilation. The resulting action is returned.
+  Action *processHostLinkAction(Action *HostAction) {
+    // Add all the dependences from the device linking actions.
+    OffloadAction::DeviceDependences DDeps;
+    for (auto *SB : SpecializedBuilders) {
+      if (!SB->isValid())
+        continue;
+
+      SB->appendLinkDependences(DDeps);
     }
 
-    // Kill host action in case of device-only compilation.
-    if (CompileDeviceOnly)
-      return nullptr;
-    return HostAction;
-  }
-
-  // If we're not a partial or device-only compilation, we compile each arch to
-  // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
-  // "link" action, which uses fatbinary to combine these cubins into one
-  // fatbin. The fatbin is then an input to the host compilation.
-  ActionList DeviceActions;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
-    Action* AssembleAction = CudaDeviceActions[I];
-    assert(AssembleAction->getType() == types::TY_Object);
-    assert(AssembleAction->getInputs().size() == 1);
-
-    Action* BackendAction = AssembleAction->getInputs()[0];
-    assert(BackendAction->getType() == types::TY_PP_Asm);
-
-    for (auto &A : {AssembleAction, BackendAction}) {
-      OffloadAction::DeviceDependences DDep;
-      DDep.add(*A, *CudaTC, GpuArchList[I], Action::OFK_Cuda);
-      DeviceActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
-    }
-  }
-  auto FatbinAction =
-      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN);
-
-  // Return a new host action that incorporates original host action and all
-  // device actions.
-  OffloadAction::HostDependence HDep(
-      *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
-      /*BoundArch=*/nullptr, Action::OFK_Cuda);
-  OffloadAction::DeviceDependences DDep;
-  DDep.add(*FatbinAction, *CudaTC, /*BoundArch=*/nullptr, Action::OFK_Cuda);
-  return C.MakeAction<OffloadAction>(HDep, DDep);
-}
+
+    // Calculate all the offload kinds used in the current compilation.
+    unsigned ActiveOffloadKinds = 0u;
+    for (auto &I : InputArgToOffloadKindMap)
+      ActiveOffloadKinds |= I.second;
+
+    // If we don't have device dependencies, we don't have to create an
+    // offload action.
+    if (DDeps.getActions().empty()) {
+      // Propagate all the active kinds to the host action. Given that it is
+      // a link action it is assumed to depend on all actions generated so
+      // far.
+      HostAction->propagateHostOffloadInfo(ActiveOffloadKinds,
+                                           /*BoundArch=*/nullptr);
+      return HostAction;
+    }
+
+    // Create the offload action with all dependences. When an offload action
+    // is created the kinds are propagated to the host action, so we don't
+    // have to do that explicitly here.
+    OffloadAction::HostDependence HDep(
+        *HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
+        /*BoundArch=*/nullptr, ActiveOffloadKinds);
+    return C.MakeAction<OffloadAction>(HDep, DDeps);
+  }
+};
+} // anonymous namespace.
 
 void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                           const InputList &Inputs, ActionList &Actions) const {
@@ -1619,8 +2009,8 @@
     YcArg = YuArg = nullptr;
   }
 
-  // Track the host offload kinds used on this compilation.
-  unsigned CompilationActiveOffloadHostKinds = 0u;
+  // Builder to be used to build offloading actions.
+  OffloadingActionBuilder OffloadBuilder(C, Args, Inputs);
 
   // Construct the actions to perform.
   ActionList LinkerInputs;
@@ -1684,17 +2074,14 @@
       }
     }
 
-    phases::ID CudaInjectionPhase =
-        (phases::Compile < FinalPhase &&
-         llvm::find(PL, phases::Compile) != PL.end())
-            ? phases::Compile
-            : FinalPhase;
-
-    // Track the host offload kinds used on this input.
-    unsigned InputActiveOffloadHostKinds = 0u;
-
     // Build the pipeline for this file.
     Action *Current = C.MakeAction<InputAction>(*InputArg, InputType);
+
+    // Use the current host action in any of the offloading actions, if
+    // required.
+    if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+      break;
+
     for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
          i != e; ++i) {
       phases::ID Phase = *i;
@@ -1703,6 +2090,12 @@
       if (Phase > FinalPhase)
        break;
 
+      // Add any offload action the host action depends on.
+      Current = OffloadBuilder.addDeviceDependencesToHostAction(
+          Current, InputArg, Phase, FinalPhase, PL);
+      if (!Current)
+        break;
+
       // Queue linker inputs.
       if (Phase == phases::Link) {
         assert((i + 1) == e && "linking must be final compilation step.");
@@ -1711,48 +2104,37 @@
         break;
       }
 
-      // Some types skip the assembler phase (e.g., llvm-bc), but we can't
-      // encode this in the steps because the intermediate type depends on
-      // arguments. Just special case here.
-      if (Phase == phases::Assemble && Current->getType() != types::TY_PP_Asm)
-        continue;
-
       // Otherwise construct the appropriate action.
-      Current = ConstructPhaseAction(C, Args, Phase, Current);
+      auto *NewCurrent = ConstructPhaseAction(C, Args, Phase, Current);
 
-      if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
-        Current = buildCudaActions(C, Args, InputArg, Current, Actions);
-        if (!Current)
-          break;
+      // We didn't create a new action, so we will just move to the next
+      // phase.
+      if (NewCurrent == Current)
+        continue;
 
-        // We produced a CUDA action for this input, so the host has to support
-        // CUDA.
-        InputActiveOffloadHostKinds |= Action::OFK_Cuda;
-        CompilationActiveOffloadHostKinds |= Action::OFK_Cuda;
-      }
+      Current = NewCurrent;
+
+      // Use the current host action in any of the offloading actions, if
+      // required.
+      if (OffloadBuilder.addHostDependenceToDeviceActions(Current, InputArg))
+        break;
 
       if (Current->getType() == types::TY_Nothing)
        break;
     }
 
-    // If we ended with something, add to the output list. Also, propagate the
-    // offload information to the top-level host action related with the current
-    // input.
-    if (Current) {
-      if (InputActiveOffloadHostKinds)
-        Current->propagateHostOffloadInfo(InputActiveOffloadHostKinds,
-                                          /*BoundArch=*/nullptr);
+    // If we ended with something, add to the output list.
+    if (Current)
       Actions.push_back(Current);
-    }
+
+    // Add any top level actions generated for offloading.
+    OffloadBuilder.appendTopLevelActions(Actions, Current, InputArg);
   }
 
-  // Add a link action if necessary and propagate the offload information for
-  // the current compilation.
+  // Add a link action if necessary.
   if (!LinkerInputs.empty()) {
-    Actions.push_back(
-        C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image));
-    Actions.back()->propagateHostOffloadInfo(CompilationActiveOffloadHostKinds,
-                                             /*BoundArch=*/nullptr);
+    Action *LA = C.MakeAction<LinkJobAction>(LinkerInputs, types::TY_Image);
+    LA = OffloadBuilder.processHostLinkAction(LA);
+    Actions.push_back(LA);
  }
 
   // If we are linking, claim any options which are obviously only used for
@@ -1774,6 +2156,13 @@
 Action *Driver::ConstructPhaseAction(Compilation &C, const ArgList &Args,
                                      phases::ID Phase, Action *Input) const {
   llvm::PrettyStackTraceString CrashInfo("Constructing phase actions");
+
+  // Some types skip the assembler phase (e.g., llvm-bc), but we can't
+  // encode this in the steps because the intermediate type depends on
+  // arguments. Just special case here.
+  if (Phase == phases::Assemble && Input->getType() != types::TY_PP_Asm)
+    return Input;
+
   // Build the appropriate action.
   switch (Phase) {
   case phases::Link:
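To make the builder protocol easier to review, here is a small self-contained
sketch (toy types and phase numbers, not Clang code) of the voting scheme that
addDeviceDependencesToHostAction() applies to the builders' return codes; only
the names mirror the patch, everything else is mocked:

  #include <cstdio>
  #include <vector>

  // Mirrors DeviceActionBuilder::ActionBuilderReturnCode in the patch.
  enum ActionBuilderReturnCode { ABRT_Success, ABRT_Inactive, ABRT_Ignore_Host };

  // Toy stand-in for a specialized builder such as CudaActionBuilder.
  struct ToyBuilder {
    bool HasToolChain; // isValid(): a device tool chain was configured.
    bool DeviceOnly;   // e.g. --cuda-device-only was passed.

    ActionBuilderReturnCode getDeviceDependences(int Phase, int Backend) const {
      if (!HasToolChain)
        return ABRT_Inactive;
      // At the backend phase the CUDA builder emits the fat binary; in
      // device-only mode it also asks the driver to drop the host action.
      return (Phase == Backend && DeviceOnly) ? ABRT_Ignore_Host : ABRT_Success;
    }
  };

  int main() {
    // One active device-only builder plus one builder with no tool chain.
    std::vector<ToyBuilder> Builders = {{true, true}, {false, false}};
    const int Backend = 3; // Toy phase order; not Clang's phases::ID values.

    for (int Phase = 0; Phase <= 4; ++Phase) {
      unsigned NumInactive = 0, NumIgnoring = 0;
      for (const ToyBuilder &B : Builders) {
        ActionBuilderReturnCode RC = B.getDeviceDependences(Phase, Backend);
        NumInactive += (RC == ABRT_Inactive);
        NumIgnoring += (RC == ABRT_Ignore_Host);
      }
      // Same rule as addDeviceDependencesToHostAction(): drop the host
      // action only if at least one builder asked to ignore it and every
      // builder is either inactive or ignoring.
      bool DropHost =
          NumIgnoring && NumInactive + NumIgnoring == Builders.size();
      std::printf("phase %d: host action %s\n", Phase,
                  DropHost ? "dropped" : "kept");
    }
    return 0;
  }

This is what makes --cuda-device-only suppress the host-side action once the
backend phase is reached, while a mixed compilation keeps the host action and
attaches the device dependences to it instead.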