Index: include/clang/Driver/Compilation.h
===================================================================
--- include/clang/Driver/Compilation.h
+++ include/clang/Driver/Compilation.h
@@ -38,6 +38,9 @@
   /// The default tool chain.
   const ToolChain &DefaultToolChain;
 
+  const ToolChain *CudaHostToolChain;
+  const ToolChain *CudaDeviceToolChain;
+
   /// The original (untranslated) input argument list.
   llvm::opt::InputArgList *Args;
 
@@ -81,6 +84,17 @@
   const Driver &getDriver() const { return TheDriver; }
 
   const ToolChain &getDefaultToolChain() const { return DefaultToolChain; }
+  const ToolChain *getCudaHostToolChain() const { return CudaHostToolChain; }
+  const ToolChain *getCudaDeviceToolChain() const {
+    return CudaDeviceToolChain;
+  }
+
+  void setCudaHostToolChain(const ToolChain *HostToolChain) {
+    CudaHostToolChain = HostToolChain;
+  }
+  void setCudaDeviceToolChain(const ToolChain *DeviceToolChain) {
+    CudaDeviceToolChain = DeviceToolChain;
+  }
 
   const llvm::opt::InputArgList &getInputArgs() const { return *Args; }
 
Index: include/clang/Driver/Driver.h
===================================================================
--- include/clang/Driver/Driver.h
+++ include/clang/Driver/Driver.h
@@ -297,22 +297,23 @@
   /// BuildActions - Construct the list of actions to perform for the
   /// given arguments, which are only done for a single architecture.
   ///
+  /// \param C - The compilation that is being built.
   /// \param TC - The default host tool chain.
   /// \param Args - The input arguments.
   /// \param Actions - The list to store the resulting actions onto.
-  void BuildActions(const ToolChain &TC, llvm::opt::DerivedArgList &Args,
-                    const InputList &Inputs, ActionList &Actions) const;
+  void BuildActions(Compilation &C, const ToolChain &TC,
+                    llvm::opt::DerivedArgList &Args, const InputList &Inputs,
+                    ActionList &Actions) const;
 
   /// BuildUniversalActions - Construct the list of actions to perform
   /// for the given arguments, which may require a universal build.
   ///
+  /// \param C - The compilation that is being built.
   /// \param TC - The default host tool chain.
   /// \param Args - The input arguments.
   /// \param Actions - The list to store the resulting actions onto.
-  void BuildUniversalActions(const ToolChain &TC,
-                             llvm::opt::DerivedArgList &Args,
-                             const InputList &BAInputs,
-                             ActionList &Actions) const;
+  void BuildUniversalActions(Compilation &C, const ToolChain &TC,
+                             const InputList &BAInputs) const;
 
   /// BuildJobs - Bind actions to concrete tools and translate
   /// arguments to form the list of jobs to run.
@@ -433,6 +434,13 @@
   /// compilation based on which -f(no-)?lto(=.*)? option occurs last.
   void setLTOMode(const llvm::opt::ArgList &Args);
 
+  /// @}
+
+  /// \brief Get bitmasks for which option flags to include and exclude based on
+  /// the driver mode.
+  std::pair<unsigned, unsigned> getIncludeExcludeOptionFlagMasks() const;
+
+public:
   /// \brief Retrieves a ToolChain for a particular \p Target triple.
   ///
   /// Will cache ToolChains for the life of the driver object, and create them
@@ -440,13 +448,6 @@
   const ToolChain &getToolChain(const llvm::opt::ArgList &Args,
                                 const llvm::Triple &Target) const;
 
-  /// @}
-
-  /// \brief Get bitmasks for which option flags to include and exclude based on
-  /// the driver mode.
-  std::pair<unsigned, unsigned> getIncludeExcludeOptionFlagMasks() const;
-
-public:
   /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and
   /// return the grouped values as integers. Numbers which are not
   /// provided are set to 0.
Index: lib/Driver/Compilation.cpp
===================================================================
--- lib/Driver/Compilation.cpp
+++ lib/Driver/Compilation.cpp
@@ -24,8 +24,9 @@
 
 Compilation::Compilation(const Driver &D, const ToolChain &_DefaultToolChain,
                          InputArgList *_Args, DerivedArgList *_TranslatedArgs)
-    : TheDriver(D), DefaultToolChain(_DefaultToolChain), Args(_Args),
-      TranslatedArgs(_TranslatedArgs), Redirects(nullptr),
+    : TheDriver(D), DefaultToolChain(_DefaultToolChain),
+      CudaHostToolChain(&DefaultToolChain), CudaDeviceToolChain(nullptr),
+      Args(_Args), TranslatedArgs(_TranslatedArgs), Redirects(nullptr),
       ForDiagnostics(false) {}
 
 Compilation::~Compilation() {
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -500,10 +500,9 @@
   // Construct the list of abstract actions to perform for this compilation. On
   // MachO targets this uses the driver-driver and universal actions.
   if (TC.getTriple().isOSBinFormatMachO())
-    BuildUniversalActions(C->getDefaultToolChain(), C->getArgs(), Inputs,
-                          C->getActions());
+    BuildUniversalActions(*C, C->getDefaultToolChain(), Inputs);
   else
-    BuildActions(C->getDefaultToolChain(), C->getArgs(), Inputs,
+    BuildActions(*C, C->getDefaultToolChain(), C->getArgs(), Inputs,
                  C->getActions());
 
   if (CCCPrintPhases) {
@@ -616,9 +615,9 @@
   // Darwin OSes this uses the driver-driver and builds universal actions.
   const ToolChain &TC = C.getDefaultToolChain();
   if (TC.getTriple().isOSBinFormatMachO())
-    BuildUniversalActions(TC, C.getArgs(), Inputs, C.getActions());
+    BuildUniversalActions(C, TC, Inputs);
   else
-    BuildActions(TC, C.getArgs(), Inputs, C.getActions());
+    BuildActions(C, TC, C.getArgs(), Inputs, C.getActions());
 
   BuildJobs(C);
 
@@ -996,9 +995,10 @@
   return false;
 }
 
-void Driver::BuildUniversalActions(const ToolChain &TC, DerivedArgList &Args,
-                                   const InputList &BAInputs,
-                                   ActionList &Actions) const {
+void Driver::BuildUniversalActions(Compilation &C, const ToolChain &TC,
+                                   const InputList &BAInputs) const {
+  DerivedArgList &Args = C.getArgs();
+  ActionList &Actions = C.getActions();
   llvm::PrettyStackTraceString CrashInfo("Building universal build actions");
   // Collect the list of architectures. Duplicates are allowed, but should only
   // be handled once (in the order seen).
@@ -1027,7 +1027,7 @@
     Archs.push_back(Args.MakeArgString(TC.getDefaultUniversalArchName()));
 
   ActionList SingleActions;
-  BuildActions(TC, Args, BAInputs, SingleActions);
+  BuildActions(C, TC, Args, BAInputs, SingleActions);
 
   // Add in arch bindings for every top level action, as well as lipo and
   // dsymutil steps if needed.
@@ -1278,14 +1278,16 @@
 // and returns a new CudaHostAction which wraps /p Current and device
 // side actions.
 static std::unique_ptr<Action>
-buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
-                 const Arg *InputArg, std::unique_ptr<Action> HostAction,
-                 ActionList &Actions) {
+buildCudaActions(Compilation &C, const Driver &D, const ToolChain &TC,
+                 DerivedArgList &Args, const Arg *InputArg,
+                 std::unique_ptr<Action> HostAction, ActionList &Actions) {
   // Figure out which NVPTX triple to use for device-side compilation based on
   // whether host is 64-bit.
   const char *DeviceTriple = TC.getTriple().isArch64Bit()
                                  ? "nvptx64-nvidia-cuda"
                                  : "nvptx-nvidia-cuda";
+  C.setCudaDeviceToolChain(
+      &D.getToolChain(C.getArgs(), llvm::Triple(DeviceTriple)));
   Arg *PartialCompilationArg = Args.getLastArg(options::OPT_cuda_host_only,
                                                options::OPT_cuda_device_only);
   // Host-only compilation case.
@@ -1317,7 +1319,7 @@
 
   // Build actions for all device inputs.
   ActionList CudaDeviceActions;
-  D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+  D.BuildActions(C, TC, Args, CudaDeviceInputs, CudaDeviceActions);
   assert(GpuArchList.size() == CudaDeviceActions.size() &&
          "Failed to create actions for all devices");
 
@@ -1368,8 +1370,9 @@
       new CudaHostAction(std::move(HostAction), DeviceActions, DeviceTriple));
 }
 
-void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
-                          const InputList &Inputs, ActionList &Actions) const {
+void Driver::BuildActions(Compilation &C, const ToolChain &TC,
+                          DerivedArgList &Args, const InputList &Inputs,
+                          ActionList &Actions) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
 
   if (!SuppressMissingInputWarning && Inputs.empty()) {
@@ -1499,7 +1502,7 @@
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
 
       if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
-        Current = buildCudaActions(*this, TC, Args, InputArg,
+        Current = buildCudaActions(C, *this, TC, Args, InputArg,
                                    std::move(Current), Actions);
         if (!Current)
           break;
@@ -1802,7 +1805,7 @@
     InputInfo II;
     // Append outputs of device jobs to the input list.
     for (const Action *DA : CHA->getDeviceActions()) {
-      BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+      BuildJobsForAction(C, DA, TC, nullptr, AtTopLevel,
                          /*MultipleArchs*/ false, LinkingOutput, II);
       CudaDeviceInputInfos.push_back(II);
     }
@@ -1842,11 +1845,12 @@
   }
 
   if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    BuildJobsForAction(
-        C, *CDA->begin(),
-        &getToolChain(C.getArgs(), llvm::Triple(CDA->getDeviceTriple())),
-        CDA->getGpuArchName(), CDA->isAtTopLevel(),
-        /*MultipleArchs*/ true, LinkingOutput, Result);
+    // Initial processing of CudaDeviceAction carries host params.
+    // Call BuildJobsForAction() again, now with correct device parameters.
+    assert(CDA->getGpuArchName() && "No GPU name in device action.");
+    BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
+                       CDA->getGpuArchName(), CDA->isAtTopLevel(),
+                       /*MultipleArchs*/ true, LinkingOutput, Result);
     return;
   }
 
Index: lib/Driver/Tools.cpp
===================================================================
--- lib/Driver/Tools.cpp
+++ lib/Driver/Tools.cpp
@@ -3215,6 +3215,20 @@
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
+  if (IsCuda) {
+    const ToolChain *AuxToolChain;
+    if (&getToolChain() == C.getCudaDeviceToolChain())
+      AuxToolChain = C.getCudaHostToolChain();
+    else if (&getToolChain() == C.getCudaHostToolChain())
+      AuxToolChain = C.getCudaDeviceToolChain();
+    else
+      llvm_unreachable("Can't figure out CUDA compilation mode.");
+    if (AuxToolChain) {
+      CmdArgs.push_back("-aux-triple");
+      CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
+    }
+  }
+
   if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
                                Triple.getArch() == llvm::Triple::thumb)) {
     unsigned Offset = Triple.getArch() == llvm::Triple::arm ? 4 : 6;
Index: test/Driver/cuda-options.cu
===================================================================
--- test/Driver/cuda-options.cu
+++ test/Driver/cuda-options.cu
@@ -111,14 +111,6 @@
 // Make sure we don't link anything.
 // RUN:   -check-prefix CUDA-NL %s
 
-// Match device-side preprocessor, and compiler phases with -save-temps
-// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
-// CUDA-D1S-SAME: "-fcuda-is-device"
-// CUDA-D1S-SAME: "-x" "cuda"
-// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
-// CUDA-D1S-SAME: "-fcuda-is-device"
-// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
-
 // --cuda-host-only should never trigger unused arg warning.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \
 // RUN:    FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s
@@ -133,34 +125,47 @@
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \
 // RUN:    FileCheck -check-prefix CUDA-UNUSED-CDO %s
 
+// Match device-side preprocessor, and compiler phases with -save-temps
+// CUDA-D1S: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D1S-SAME: "-aux-triple" "x86_64--linux-gnu"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda"
+
+// CUDA-D1S: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D1S-SAME: "-aux-triple" "x86_64--linux-gnu"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
+
 // Match the job that produces PTX assembly
-// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D1NS-SAME: "-aux-triple" "x86_64--linux-gnu"
 // CUDA-D1-SAME: "-fcuda-is-device"
 // CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
 // CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
 // CUDA-D1NS-SAME: "-x" "cuda"
 // CUDA-D1S-SAME: "-x" "ir"
 
-// Match anothe device-side compilation
-// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// Match another device-side compilation
+// CUDA-D2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D2-SAME: "-aux-triple" "x86_64--linux-gnu"
 // CUDA-D2-SAME: "-fcuda-is-device"
 // CUDA-D2-SM30-SAME: "-target-cpu" "sm_30"
 // CUDA-D2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
 // CUDA-D2-SAME: "-x" "cuda"
 
 // Match no device-side compilation
-// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // CUDA-ND-SAME-NOT: "-fcuda-is-device"
 
 // Match host-side preprocessor job with -save-temps
-// CUDA-HS: "-cc1" "-triple"
-// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-HS: "-cc1" "-triple" "x86_64--linux-gnu"
+// CUDA-HS-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
 // CUDA-HS-SAME-NOT: "-fcuda-is-device"
 // CUDA-HS-SAME: "-x" "cuda"
 
 // Match host-side compilation
-// CUDA-H: "-cc1" "-triple"
-// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-H: "-cc1" "-triple" "x86_64--linux-gnu"
+// CUDA-H-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
 // CUDA-H-SAME-NOT: "-fcuda-is-device"
 // CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // CUDA-HNS-SAME: "-x" "cuda"
Index: test/SemaCUDA/function-target-hd.cu
===================================================================
--- test/SemaCUDA/function-target-hd.cu
+++ test/SemaCUDA/function-target-hd.cu
@@ -8,9 +8,9 @@
 // host device functions are not allowed to call device functions.
 
 // RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -verify %s
 // RUN: %clang_cc1 -fsyntax-only -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
 
 #include "Inputs/cuda.h"