Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -139,15 +139,17 @@
   virtual void anchor();
   /// GPU architecture to bind -- e.g 'sm_35'.
   const char *GpuArchName;
+  const char *DeviceTriple;
   /// True when action results are not consumed by the host action (e.g when
   /// -fsyntax-only or --cuda-device-only options are used).
   bool AtTopLevel;
 
 public:
   CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName,
-                   bool AtTopLevel);
+                   const char *DeviceTriple, bool AtTopLevel);
 
   const char *getGpuArchName() const { return GpuArchName; }
+  const char *getDeviceTriple() const { return DeviceTriple; }
   bool isAtTopLevel() const { return AtTopLevel; }
 
   static bool classof(const Action *A) {
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -58,9 +58,10 @@
 void CudaDeviceAction::anchor() {}
 
 CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
-                                   const char *ArchName, bool AtTopLevel)
+                                   const char *ArchName,
+                                   const char *DeviceTriple, bool AtTopLevel)
     : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName),
-      AtTopLevel(AtTopLevel) {}
+      DeviceTriple(DeviceTriple), AtTopLevel(AtTopLevel) {}
 
 void CudaHostAction::anchor() {}
 
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -1237,11 +1237,8 @@
 // side actions.
 static std::unique_ptr<Action>
 buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
-                 const Arg *InputArg, const types::ID InputType,
-                 std::unique_ptr<Action> Current, ActionList &Actions) {
-
-  assert(InputType == types::TY_CUDA &&
-         "CUDA Actions only apply to CUDA inputs.");
+                 const Arg *InputArg, std::unique_ptr<Action> HostAction,
+                 ActionList &Actions) {
 
   // Collect all cuda_gpu_arch parameters, removing duplicates.
   SmallVector<const char *, 4> GpuArchList;
@@ -1279,6 +1276,12 @@
     }
   }
 
+  // Figure out which NVPTX triple to use for device-side compilation based on
+  // whether host is 64-bit.
+  const char *DeviceTriple = TC.getTriple().isArch64Bit()
+                                 ? "nvptx64-nvidia-cuda"
+                                 : "nvptx-nvidia-cuda";
+
   // Figure out what to do with device actions -- pass them as inputs to the
   // host action or run each of them independently.
   bool DeviceOnlyCompilation = Args.hasArg(options::OPT_cuda_device_only);
@@ -1295,26 +1298,26 @@
     }
 
     for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-      Actions.push_back(
-          new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[I]),
-                               GpuArchList[I], /* AtTopLevel */ true));
+      Actions.push_back(new CudaDeviceAction(
+          std::unique_ptr<Action>(CudaDeviceActions[I]), GpuArchList[I],
+          DeviceTriple, /* AtTopLevel */ true));
     // Kill host action in case of device-only compilation.
     if (DeviceOnlyCompilation)
-      Current.reset(nullptr);
-    return Current;
+      HostAction.reset(nullptr);
+    return HostAction;
   }
 
   // Outputs of device actions during complete CUDA compilation get created
   // with AtTopLevel=false and become inputs for the host action.
   ActionList DeviceActions;
   for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-    DeviceActions.push_back(
-        new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[I]),
-                             GpuArchList[I], /* AtTopLevel */ false));
+    DeviceActions.push_back(new CudaDeviceAction(
+        std::unique_ptr<Action>(CudaDeviceActions[I]), GpuArchList[I],
+        DeviceTriple, /* AtTopLevel */ false));
   // Return a new host action that incorporates original host action and all
   // device actions.
   return std::unique_ptr<Action>(
-      new CudaHostAction(std::move(Current), DeviceActions));
+      new CudaHostAction(std::move(HostAction), DeviceActions));
 }
 
 void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
@@ -1451,7 +1454,7 @@
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
 
       if (InjectCuda && Phase == CudaInjectionPhase) {
-        Current = buildCudaActions(*this, TC, Args, InputArg, InputType,
+        Current = buildCudaActions(*this, TC, Args, InputArg,
                                    std::move(Current), Actions);
         if (!Current)
           break;
@@ -1794,15 +1797,11 @@
   }
 
   if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    // Figure out which NVPTX triple to use for device-side compilation based on
-    // whether host is 64-bit.
-    llvm::Triple DeviceTriple(TC->getTriple().isArch64Bit()
-                                  ? "nvptx64-nvidia-cuda"
-                                  : "nvptx-nvidia-cuda");
-    BuildJobsForAction(C, *CDA->begin(),
-                       &getToolChain(C.getArgs(), DeviceTriple),
-                       CDA->getGpuArchName(), CDA->isAtTopLevel(),
-                       /*MultipleArchs*/ true, LinkingOutput, Result);
+    BuildJobsForAction(
+        C, *CDA->begin(),
+        &getToolChain(C.getArgs(), llvm::Triple(CDA->getDeviceTriple())),
+        CDA->getGpuArchName(), CDA->isAtTopLevel(),
+        /*MultipleArchs*/ true, LinkingOutput, Result);
     return;
   }