Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -139,17 +139,15 @@
   virtual void anchor();
   /// GPU architecture to bind -- e.g 'sm_35'.
   const char *GpuArchName;
-  const char *DeviceTriple;
   /// True when action results are not consumed by the host action (e.g when
   /// -fsyntax-only or --cuda-device-only options are used).
   bool AtTopLevel;
 
 public:
   CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName,
-                   const char *DeviceTriple, bool AtTopLevel);
+                   bool AtTopLevel);
 
   const char *getGpuArchName() const { return GpuArchName; }
-  const char *getDeviceTriple() const { return DeviceTriple; }
   bool isAtTopLevel() const { return AtTopLevel; }
 
   static bool classof(const Action *A) {
@@ -160,16 +158,13 @@
 class CudaHostAction : public Action {
   virtual void anchor();
   ActionList DeviceActions;
-  const char *DeviceTriple;
 
 public:
-  CudaHostAction(std::unique_ptr<Action> Input, const ActionList &DeviceActions,
-                 const char *DeviceTriple);
+  CudaHostAction(std::unique_ptr<Action> Input,
+                 const ActionList &DeviceActions);
   ~CudaHostAction() override;
 
-  ActionList &getDeviceActions() { return DeviceActions; }
   const ActionList &getDeviceActions() const { return DeviceActions; }
-  const char *getDeviceTriple() const { return DeviceTriple; }
 
   static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
 };
Index: include/clang/Driver/Compilation.h
===================================================================
--- include/clang/Driver/Compilation.h
+++ include/clang/Driver/Compilation.h
@@ -38,6 +38,9 @@
   /// The default tool chain.
   const ToolChain &DefaultToolChain;
 
+  const ToolChain *CudaHostToolChain;
+  const ToolChain *CudaDeviceToolChain;
+
   /// The original (untranslated) input argument list.
   llvm::opt::InputArgList *Args;
 
@@ -81,6 +84,17 @@
   const Driver &getDriver() const { return TheDriver; }
 
   const ToolChain &getDefaultToolChain() const { return DefaultToolChain; }
+  const ToolChain *getCudaHostToolChain() const { return CudaHostToolChain; }
+  const ToolChain *getCudaDeviceToolChain() const {
+    return CudaDeviceToolChain;
+  }
+
+  void setCudaHostToolChain(const ToolChain *HostToolChain) {
+    CudaHostToolChain = HostToolChain;
+  }
+  void setCudaDeviceToolChain(const ToolChain *DeviceToolChain) {
+    CudaDeviceToolChain = DeviceToolChain;
+  }
 
   const llvm::opt::InputArgList &getInputArgs() const { return *Args; }
 
Index: include/clang/Driver/Driver.h
===================================================================
--- include/clang/Driver/Driver.h
+++ include/clang/Driver/Driver.h
@@ -297,22 +297,23 @@
   /// BuildActions - Construct the list of actions to perform for the
   /// given arguments, which are only done for a single architecture.
   ///
+  /// \param C - The compilation that is being built.
   /// \param TC - The default host tool chain.
   /// \param Args - The input arguments.
   /// \param Actions - The list to store the resulting actions onto.
-  void BuildActions(const ToolChain &TC, llvm::opt::DerivedArgList &Args,
-                    const InputList &Inputs, ActionList &Actions) const;
+  void BuildActions(Compilation &C, const ToolChain &TC,
+                    llvm::opt::DerivedArgList &Args, const InputList &Inputs,
+                    ActionList &Actions) const;
 
   /// BuildUniversalActions - Construct the list of actions to perform
   /// for the given arguments, which may require a universal build.
   ///
+  /// \param C - The compilation that is being built.
   /// \param TC - The default host tool chain.
   /// \param Args - The input arguments.
   /// \param Actions - The list to store the resulting actions onto.
-  void BuildUniversalActions(const ToolChain &TC,
-                             llvm::opt::DerivedArgList &Args,
-                             const InputList &BAInputs,
-                             ActionList &Actions) const;
+  void BuildUniversalActions(Compilation &C, const ToolChain &TC,
+                             const InputList &BAInputs) const;
 
   /// BuildJobs - Bind actions to concrete tools and translate
   /// arguments to form the list of jobs to run.
@@ -433,13 +434,6 @@
   /// compilation based on which -f(no-)?lto(=.*)? option occurs last.
   void setLTOMode(const llvm::opt::ArgList &Args);
 
-  /// \brief Retrieves a ToolChain for a particular \p Target triple.
-  ///
-  /// Will cache ToolChains for the life of the driver object, and create them
-  /// on-demand.
-  const ToolChain &getToolChain(const llvm::opt::ArgList &Args,
-                                const llvm::Triple &Target) const;
-
   /// @}
 
   /// \brief Get bitmasks for which option flags to include and exclude based on
@@ -447,6 +441,13 @@
   std::pair<unsigned, unsigned> getIncludeExcludeOptionFlagMasks() const;
 
 public:
+  /// \brief Retrieves a ToolChain for a particular \p Target triple.
+  ///
+  /// Will cache ToolChains for the life of the driver object, and create them
+  /// on-demand.
+  const ToolChain &getToolChain(const llvm::opt::ArgList &Args,
+                                const llvm::Triple &Target) const;
+
   /// GetReleaseVersion - Parse (([0-9]+)(.([0-9]+)(.([0-9]+)?))?)? and
   /// return the grouped values as integers. Numbers which are not
   /// provided are set to 0.
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -1592,6 +1592,8 @@
 def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
 def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option]>,
   HelpText<"Disable builtin #include directories">;
+def nocudainc : Flag<["-"], "nocudainc">;
+def nocudalib : Flag<["-"], "nocudalib">;
 def nodefaultlibs : Flag<["-"], "nodefaultlibs">;
 def nofixprebinding : Flag<["-"], "nofixprebinding">;
 def nolibc : Flag<["-"], "nolibc">;
Index: include/clang/Driver/ToolChain.h
===================================================================
--- include/clang/Driver/ToolChain.h
+++ include/clang/Driver/ToolChain.h
@@ -393,6 +393,10 @@
   virtual void addProfileRTLibs(const llvm::opt::ArgList &Args,
                                 llvm::opt::ArgStringList &CmdArgs) const;
 
+  /// \brief Add arguments to use system-specific CUDA includes.
+  virtual void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                                  llvm::opt::ArgStringList &CC1Args) const;
+
   /// \brief Return sanitizers which are available in this toolchain.
   virtual SanitizerMask getSupportedSanitizers() const;
 };
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -58,18 +58,15 @@
 void CudaDeviceAction::anchor() {}
 
 CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
-                                   const char *ArchName,
-                                   const char *DeviceTriple, bool AtTopLevel)
+                                   const char *ArchName, bool AtTopLevel)
     : Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName),
-      DeviceTriple(DeviceTriple), AtTopLevel(AtTopLevel) {}
+      AtTopLevel(AtTopLevel) {}
 
 void CudaHostAction::anchor() {}
 
 CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input,
-                               const ActionList &DeviceActions,
-                               const char *DeviceTriple)
-    : Action(CudaHostClass, std::move(Input)), DeviceActions(DeviceActions),
-      DeviceTriple(DeviceTriple) {}
+                               const ActionList &DeviceActions)
+    : Action(CudaHostClass, std::move(Input)), DeviceActions(DeviceActions) {}
 
 CudaHostAction::~CudaHostAction() {
   for (auto &DA : DeviceActions)
Index: lib/Driver/Compilation.cpp
===================================================================
--- lib/Driver/Compilation.cpp
+++ lib/Driver/Compilation.cpp
@@ -24,8 +24,9 @@
 
 Compilation::Compilation(const Driver &D, const ToolChain &_DefaultToolChain,
                          InputArgList *_Args, DerivedArgList *_TranslatedArgs)
-    : TheDriver(D), DefaultToolChain(_DefaultToolChain), Args(_Args),
-      TranslatedArgs(_TranslatedArgs), Redirects(nullptr),
+    : TheDriver(D), DefaultToolChain(_DefaultToolChain),
+      CudaHostToolChain(&DefaultToolChain), CudaDeviceToolChain(nullptr),
+      Args(_Args), TranslatedArgs(_TranslatedArgs), Redirects(nullptr),
       ForDiagnostics(false) {}
 
 Compilation::~Compilation() {
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -491,6 +491,10 @@
   // The compilation takes ownership of Args.
   Compilation *C = new Compilation(*this, TC, UArgs.release(), TranslatedArgs);
 
+  C->setCudaDeviceToolChain(
+      &getToolChain(C->getArgs(), llvm::Triple(TC.getTriple().isArch64Bit()
+                                                   ? "nvptx64-nvidia-cuda"
+                                                   : "nvptx-nvidia-cuda")));
   if (!HandleImmediateArgs(*C))
     return C;
 
@@ -501,10 +505,9 @@
   // Construct the list of abstract actions to perform for this compilation. On
   // MachO targets this uses the driver-driver and universal actions.
   if (TC.getTriple().isOSBinFormatMachO())
-    BuildUniversalActions(C->getDefaultToolChain(), C->getArgs(), Inputs,
-                          C->getActions());
+    BuildUniversalActions(*C, C->getDefaultToolChain(), Inputs);
   else
-    BuildActions(C->getDefaultToolChain(), C->getArgs(), Inputs,
+    BuildActions(*C, C->getDefaultToolChain(), C->getArgs(), Inputs,
                  C->getActions());
 
   if (CCCPrintPhases) {
@@ -617,9 +620,9 @@
   // Darwin OSes this uses the driver-driver and builds universal actions.
   const ToolChain &TC = C.getDefaultToolChain();
   if (TC.getTriple().isOSBinFormatMachO())
-    BuildUniversalActions(TC, C.getArgs(), Inputs, C.getActions());
+    BuildUniversalActions(C, TC, Inputs);
   else
-    BuildActions(TC, C.getArgs(), Inputs, C.getActions());
+    BuildActions(C, TC, C.getArgs(), Inputs, C.getActions());
 
   BuildJobs(C);
 
@@ -948,7 +951,7 @@
     os << '"' << CDA->getGpuArchName() << '"' << ", {"
        << PrintActions1(C, *CDA->begin(), Ids) << "}";
   } else {
-    ActionList *AL;
+    const ActionList *AL;
     if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
       os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
          << ", gpu binaries ";
@@ -997,9 +1000,10 @@
   return false;
 }
 
-void Driver::BuildUniversalActions(const ToolChain &TC, DerivedArgList &Args,
-                                   const InputList &BAInputs,
-                                   ActionList &Actions) const {
+void Driver::BuildUniversalActions(Compilation &C, const ToolChain &TC,
+                                   const InputList &BAInputs) const {
+  DerivedArgList &Args = C.getArgs();
+  ActionList &Actions = C.getActions();
   llvm::PrettyStackTraceString CrashInfo("Building universal build actions");
   // Collect the list of architectures. Duplicates are allowed, but should only
   // be handled once (in the order seen).
@@ -1028,7 +1032,7 @@
     Archs.push_back(Args.MakeArgString(TC.getDefaultUniversalArchName()));
 
   ActionList SingleActions;
-  BuildActions(TC, Args, BAInputs, SingleActions);
+  BuildActions(C, TC, Args, BAInputs, SingleActions);
 
   // Add in arch bindings for every top level action, as well as lipo and
   // dsymutil steps if needed.
@@ -1279,21 +1283,15 @@
 // and returns a new CudaHostAction which wraps /p Current and device
 // side actions.
 static std::unique_ptr<Action>
-buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
-                 const Arg *InputArg, std::unique_ptr<Action> HostAction,
-                 ActionList &Actions) {
-  // Figure out which NVPTX triple to use for device-side compilation based on
-  // whether host is 64-bit.
-  const char *DeviceTriple = TC.getTriple().isArch64Bit()
-                                 ? "nvptx64-nvidia-cuda"
-                                 : "nvptx-nvidia-cuda";
+buildCudaActions(Compilation &C, DerivedArgList &Args, const Arg *InputArg,
+                 std::unique_ptr<Action> HostAction, ActionList &Actions) {
   Arg *PartialCompilationArg = Args.getLastArg(options::OPT_cuda_host_only,
                                                options::OPT_cuda_device_only);
   // Host-only compilation case.
   if (PartialCompilationArg &&
       PartialCompilationArg->getOption().matches(options::OPT_cuda_host_only))
     return std::unique_ptr<Action>(
-        new CudaHostAction(std::move(HostAction), {}, DeviceTriple));
+        new CudaHostAction(std::move(HostAction), {}));
 
   // Collect all cuda_gpu_arch parameters, removing duplicates.
   SmallVector<const char *, 4> GpuArchList;
@@ -1317,8 +1315,11 @@
     CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
 
   // Build actions for all device inputs.
+  assert(C.getCudaDeviceToolChain() &&
+         "Missing toolchain for device-side compilation.");
   ActionList CudaDeviceActions;
-  D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
+  C.getDriver().BuildActions(C, *C.getCudaDeviceToolChain(), Args,
+                             CudaDeviceInputs, CudaDeviceActions);
   assert(GpuArchList.size() == CudaDeviceActions.size() &&
          "Failed to create actions for all devices");
 
@@ -1342,14 +1343,15 @@
     // -o is ambiguous if we have more than one top-level action.
     if (Args.hasArg(options::OPT_o) &&
         (!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
-      D.Diag(clang::diag::err_drv_output_argument_with_multiple_files);
+      C.getDriver().Diag(
+          clang::diag::err_drv_output_argument_with_multiple_files);
       return nullptr;
     }
 
     for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
       Actions.push_back(new CudaDeviceAction(
           std::unique_ptr<Action>(CudaDeviceActions[I]), GpuArchList[I],
-          DeviceTriple, /* AtTopLevel */ true));
+          /* AtTopLevel */ true));
     // Kill host action in case of device-only compilation.
     if (DeviceOnlyCompilation)
       HostAction.reset(nullptr);
@@ -1362,15 +1364,16 @@
   for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
     DeviceActions.push_back(new CudaDeviceAction(
         std::unique_ptr<Action>(CudaDeviceActions[I]), GpuArchList[I],
-        DeviceTriple, /* AtTopLevel */ false));
+        /* AtTopLevel */ false));
   // Return a new host action that incorporates original host action and all
   // device actions.
   return std::unique_ptr<Action>(
-      new CudaHostAction(std::move(HostAction), DeviceActions, DeviceTriple));
+      new CudaHostAction(std::move(HostAction), DeviceActions));
 }
 
-void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
-                          const InputList &Inputs, ActionList &Actions) const {
+void Driver::BuildActions(Compilation &C, const ToolChain &TC,
+                          DerivedArgList &Args, const InputList &Inputs,
+                          ActionList &Actions) const {
   llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
 
   if (!SuppressMissingInputWarning && Inputs.empty()) {
@@ -1500,8 +1503,8 @@
       Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
 
       if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase) {
-        Current = buildCudaActions(*this, TC, Args, InputArg,
-                                   std::move(Current), Actions);
+        Current =
+            buildCudaActions(C, Args, InputArg, std::move(Current), Actions);
         if (!Current)
           break;
       }
@@ -1803,7 +1806,7 @@
     InputInfo II;
     // Append outputs of device jobs to the input list.
     for (const Action *DA : CHA->getDeviceActions()) {
-      BuildJobsForAction(C, DA, TC, "", AtTopLevel,
+      BuildJobsForAction(C, DA, TC, nullptr, AtTopLevel,
                          /*MultipleArchs*/ false, LinkingOutput, II);
       CudaDeviceInputInfos.push_back(II);
     }
@@ -1843,11 +1846,12 @@
   }
 
   if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    BuildJobsForAction(
-        C, *CDA->begin(),
-        &getToolChain(C.getArgs(), llvm::Triple(CDA->getDeviceTriple())),
-        CDA->getGpuArchName(), CDA->isAtTopLevel(),
-        /*MultipleArchs*/ true, LinkingOutput, Result);
+    // Initial processing of CudaDeviceAction carries host params.
+    // Call BuildJobsForAction() again, now with correct device parameters.
+    assert(CDA->getGpuArchName() && "No GPU name in device action.");
+    BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
+                       CDA->getGpuArchName(), CDA->isAtTopLevel(),
+                       /*MultipleArchs*/ true, LinkingOutput, Result);
     return;
   }
 
Index: lib/Driver/ToolChain.cpp
===================================================================
--- lib/Driver/ToolChain.cpp
+++ lib/Driver/ToolChain.cpp
@@ -657,3 +657,6 @@
     Res |= CFIICall;
   return Res;
 }
+
+void ToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
+                                   ArgStringList &CC1Args) const {}
Index: lib/Driver/ToolChains.h
===================================================================
--- lib/Driver/ToolChains.h
+++ lib/Driver/ToolChains.h
@@ -166,6 +166,7 @@
     std::string CudaLibPath;
     std::string CudaLibDevicePath;
     std::string CudaIncludePath;
+    llvm::StringMap<std::string> CudaLibDeviceMap;
 
   public:
     CudaInstallationDetector(const Driver &D) : IsValid(false), D(D) {}
@@ -185,6 +186,9 @@
     /// \brief Get the detected Cuda device library path.
     StringRef getLibDevicePath() const { return CudaLibDevicePath; }
     /// \brief Get libdevice file for given architecture
+    StringRef getLibDeviceFile(StringRef Gpu) const {
+      return CudaLibDeviceMap.lookup(Gpu);
+    }
   };
 
   CudaInstallationDetector CudaInstallation;
@@ -784,6 +788,8 @@
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
+  void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                          llvm::opt::ArgStringList &CC1Args) const override;
   bool isPIEDefault() const override;
   SanitizerMask getSupportedSanitizers() const override;
   void addProfileRTLibs(const llvm::opt::ArgList &Args,
Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -1642,6 +1642,31 @@
           D.getVFS().exists(CudaLibDevicePath)))
       continue;
 
+    const StringRef LibDeviceName = "libdevice.";
+    std::error_code EC;
+    for (llvm::sys::fs::directory_iterator LI(CudaLibDevicePath, EC), LE;
+         !EC && LI != LE; LI = LI.increment(EC)) {
+      StringRef FilePath = LI->path();
+      StringRef FileName = llvm::sys::path::filename(FilePath);
+      // Process all bitcode filenames that look like libdevice.compute_XX.YY.bc
+      if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc")))
+        continue;
+      StringRef GpuArch = FileName.slice(
+          LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
+      CudaLibDeviceMap[GpuArch] = FilePath.str();
+      // Insert map entries for specifc devices with this compute capability.
+      if (GpuArch == "compute_20") {
+        CudaLibDeviceMap["sm_20"] = FilePath;
+        CudaLibDeviceMap["sm_21"] = FilePath;
+      } else if (GpuArch == "compute_30") {
+        CudaLibDeviceMap["sm_30"] = FilePath;
+        CudaLibDeviceMap["sm_32"] = FilePath;
+      } else if (GpuArch == "compute_35") {
+        CudaLibDeviceMap["sm_35"] = FilePath;
+        CudaLibDeviceMap["sm_37"] = FilePath;
+      }
+    }
+
     IsValid = true;
     break;
   }
@@ -3926,6 +3951,18 @@
   }
 }
 
+void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs,
+                               ArgStringList &CC1Args) const {
+  if (DriverArgs.hasArg(options::OPT_nocudainc))
+    return;
+
+  if (CudaInstallation.isValid()) {
+    addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath());
+    CC1Args.push_back("-include");
+    CC1Args.push_back("cuda_runtime.h");
+  }
+}
+
 bool Linux::isPIEDefault() const { return getSanitizerArgs().requiresPIE(); }
 
 SanitizerMask Linux::getSupportedSanitizers() const {
@@ -4008,6 +4045,22 @@
                                      llvm::opt::ArgStringList &CC1Args) const {
   Linux::addClangTargetOptions(DriverArgs, CC1Args);
   CC1Args.push_back("-fcuda-is-device");
+
+  if (DriverArgs.hasArg(options::OPT_nocudalib))
+    return;
+
+  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(
+      DriverArgs.getLastArgValue(options::OPT_march_EQ));
+  if (!LibDeviceFile.empty()) {
+    CC1Args.push_back("-mlink-cuda-bitcode");
+    CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+
+    // Libdevice in CUDA-7.0 requires PTX version that's more recent
+    // than LLVM defaults to. Use PTX4.2 which is the PTX version that
+    // came with CUDA-7.0.
+    CC1Args.push_back("-target-feature");
+    CC1Args.push_back("+ptx42");
+  }
 }
 
 llvm::opt::DerivedArgList *
Index: lib/Driver/Tools.h
===================================================================
--- lib/Driver/Tools.h
+++ lib/Driver/Tools.h
@@ -57,7 +57,8 @@
                                const Driver &D, const llvm::opt::ArgList &Args,
                                llvm::opt::ArgStringList &CmdArgs,
                                const InputInfo &Output,
-                               const InputInfoList &Inputs) const;
+                               const InputInfoList &Inputs,
+                               const ToolChain *AuxToolChain) const;
 
   void AddAArch64TargetArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const;
Index: lib/Driver/Tools.cpp
===================================================================
--- lib/Driver/Tools.cpp
+++ lib/Driver/Tools.cpp
@@ -248,7 +248,8 @@
                                     const Driver &D, const ArgList &Args,
                                     ArgStringList &CmdArgs,
                                     const InputInfo &Output,
-                                    const InputInfoList &Inputs) const {
+                                    const InputInfoList &Inputs,
+                                    const ToolChain *AuxToolChain) const {
   Arg *A;
 
   CheckPreprocessingOptions(D, Args);
@@ -441,11 +442,20 @@
   addDirectoryList(Args, CmdArgs, "-objcxx-isystem", "OBJCPLUS_INCLUDE_PATH");
 
   // Add C++ include arguments, if needed.
-  if (types::isCXX(Inputs[0].getType()))
+  if (types::isCXX(Inputs[0].getType())) {
     getToolChain().AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+    if (AuxToolChain)
+      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+  }
 
   // Add system include arguments.
   getToolChain().AddClangSystemIncludeArgs(Args, CmdArgs);
+  if (AuxToolChain)
+      AuxToolChain->AddClangCXXStdlibIncludeArgs(Args, CmdArgs);
+
+  // Add CUDA include arguments, if needed.
+  if (types::isCuda(Inputs[0].getType()))
+    getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
 }
 
 // FIXME: Move to target hook.
@@ -3224,6 +3234,21 @@
   CmdArgs.push_back("-triple");
   CmdArgs.push_back(Args.MakeArgString(TripleStr));
 
+  const ToolChain *AuxToolChain = nullptr;
+  if (IsCuda) {
+    if (&getToolChain() == C.getCudaDeviceToolChain())
+      AuxToolChain = C.getCudaHostToolChain();
+    else if (&getToolChain() == C.getCudaHostToolChain())
+      AuxToolChain = C.getCudaDeviceToolChain();
+    else
+      llvm_unreachable("Can't figure out CUDA compilation mode.");
+    if (AuxToolChain) {
+      CmdArgs.push_back("-aux-triple");
+      CmdArgs.push_back(Args.MakeArgString(AuxToolChain->getTriple().str()));
+    }
+    CmdArgs.push_back("-fcuda-target-overloads");
+  }
+
   if (Triple.isOSWindows() && (Triple.getArch() == llvm::Triple::arm ||
                                Triple.getArch() == llvm::Triple::thumb)) {
     unsigned Offset = Triple.getArch() == llvm::Triple::arm ? 4 : 6;
@@ -4024,7 +4049,8 @@
   //
   // FIXME: Support -fpreprocessed
   if (types::getPreprocessedType(InputType) != types::TY_INVALID)
-    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs);
+    AddPreprocessingOptions(C, JA, D, Args, CmdArgs, Output, Inputs,
+                            AuxToolChain);
 
   // Don't warn about "clang -c -DPIC -fPIC test.i" because libtool.m4 assumes
   // that "The compiler can only warn and ignore the option if not recognized".
Index: lib/Headers/CMakeLists.txt
===================================================================
--- lib/Headers/CMakeLists.txt
+++ lib/Headers/CMakeLists.txt
@@ -17,6 +17,7 @@
   bmiintrin.h
   cpuid.h
   cuda_builtin_vars.h
+  cuda_runtime.h
   emmintrin.h
   f16cintrin.h
   float.h
Index: lib/Headers/cuda_runtime.h
===================================================================
--- /dev/null
+++ lib/Headers/cuda_runtime.h
@@ -0,0 +1,155 @@
+/*===---- cuda_runtime.h - CUDA runtime support ----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_H__
+#define __CLANG_CUDA_RUNTIME_H__
+
+#if defined(__PTX__)
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <stdlib.h>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific of
+// implementation of CUDA-7.x headers and are expected to break with
+// any other version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+
+#undef __CUDACC__
+#define __CUDABE__
+#include "host_config.h"
+#include "host_defines.h"
+#include "driver_types.h"
+#include "common_functions.h"
+
+#undef __CUDABE__
+#define __CUDACC__
+#include_next "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+#include "crt/host_runtime.h"
+#include "crt/device_runtime.h"
+
+// We need decls for functions in CUDA's libdevice woth __device__
+// attribute only. Alas they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDA_RTC__
+// which produces HD variant and undef __host__ which gives us desided
+// decls with __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+#include "device_functions.hpp"
+#include "math_functions.hpp"
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// For some reason single-argument variant is not always declared by
+// CUDA headers. Alas, device_functions.hpp included below needs it.
+static inline __device__ void __brkpt(int c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of thins get declared/defined with __host__ attribute which
+// we don't want and we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_functions.hpp"
+#include "device_atomic_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_32_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+// sm_30_intrinsics.h has declarations that use default argument, so
+// we have to include it and it will in turn include .hpp
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.hpp"
+#undef __MATH_FUNCTIONS_HPP__
+#include "math_functions.hpp"
+#pragma pop_macro("__host__")
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+#define __NVCC__
+
+#if defined(__CUDA_ARCH__)
+// We need to emit IR declaration for non-existing __nvvm_reflect to
+// let backend know that it should be treated as const nothrow
+// function which is implicitly assumed by NVVMReflect pass.
+extern "C" __device__ __attribute__((const)) int __nvvm_reflect(const void *);
+static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
+  return __nvvm_reflect("NONE");
+}
+#endif
+
+#endif // __PTX__
+#endif // __CLANG_CUDA_RUNTIME_H__
Index: test/Driver/cuda-detect.cu
===================================================================
--- test/Driver/cuda-detect.cu
+++ test/Driver/cuda-detect.cu
@@ -1,10 +1,61 @@
 // REQUIRES: clang-driver
 // REQUIRES: x86-registered-target
 //
+// # Check that we properly detect CUDA installation.
 // RUN: %clang -v --target=i386-unknown-linux \
-// RUN:   --sysroot=/tmp/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN:   --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA
+// RUN: %clang -v --target=i386-unknown-linux \
+// RUN:   --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s
 // RUN: %clang -v --target=i386-unknown-linux \
 // RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s
 
+// Make sure we map libdevice bitcode files to proper GPUs.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix LIBDEVICE -check-prefix LIBDEVICE21
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 \
+// RUN:   -check-prefix CUDAINC
+// Verify that -nocudainc prevents adding include path to CUDA headers.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
+// RUN:   -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 \
+// RUN:   -check-prefix NOCUDAINC
+
+// Verify that no options related to bitcode linking are passes if
+// there's no bitcode file.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \
+// RUN:   --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix NOLIBDEVICE
+// .. or if we explicitly passed -nocudalib
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
+// RUN:   -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix NOLIBDEVICE
+// Verify that we don't add include paths, link with libdevice or
+// -include cuda_runtime without valid CUDA installation.
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \
+// RUN:   --cuda-path=%S/no-cuda-there %s 2>&1 \
+// RUN:   | FileCheck %s -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE
+
 // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda
 // NOCUDA-NOT: Found CUDA installation:
+
+// LIBDEVICE: "-triple" "nvptx-nvidia-cuda"
+// LIBDEVICE-SAME: "-fcuda-is-device"
+// LIBDEVICE-SAME: "-mlink-cuda-bitcode"
+// LIBDEVICE21-SAME: libdevice.compute_20.10.bc
+// LIBDEVICE35-SAME: libdevice.compute_35.10.bc
+// LIBDEVICE-SAME: "-target-feature" "+ptx42"
+// CUDAINC-SAME: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include"
+// CUDAINC-SAME: "-include" "cuda_runtime.h"
+// NOCUDAINC-NOT: "-internal-isystem" "{{.*}}/Inputs/CUDA/usr/local/cuda/include"
+// NOCUDAINC-NOT: "-include" "cuda_runtime.h"
+// LIBDEVICE-SAME: "-x" "cuda"
+
+// NOLIBDEVICE: "-triple" "nvptx-nvidia-cuda"
+// NOLIBDEVICE-SAME: "-fcuda-is-device"
+// NOLIBDEVICE-NOT: "-mlink-cuda-bitcode-file"
+// NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc
+// NOLIBDEVICE-NOT: "-target-feature"
+// NOLIBDEVICE-SAME: "-x" "cuda"
Index: test/Driver/cuda-options.cu
===================================================================
--- test/Driver/cuda-options.cu
+++ test/Driver/cuda-options.cu
@@ -111,14 +111,6 @@
 // Make sure we don't link anything.
 // RUN:   -check-prefix CUDA-NL %s
 
-// Match device-side preprocessor, and compiler phases with -save-temps
-// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
-// CUDA-D1S-SAME: "-fcuda-is-device"
-// CUDA-D1S-SAME: "-x" "cuda"
-// CUDA-D1S: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
-// CUDA-D1S-SAME: "-fcuda-is-device"
-// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
-
 // --cuda-host-only should never trigger unused arg warning.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \
 // RUN:    FileCheck -check-prefix CUDA-NO-UNUSED-CHO %s
@@ -133,34 +125,47 @@
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \
 // RUN:    FileCheck -check-prefix CUDA-UNUSED-CDO %s
 
+// Match device-side preprocessor, and compiler phases with -save-temps
+// CUDA-D1S: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D1S-SAME: "-aux-triple" "x86_64--linux-gnu"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda"
+
+// CUDA-D1S: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D1S-SAME: "-aux-triple" "x86_64--linux-gnu"
+// CUDA-D1S-SAME: "-fcuda-is-device"
+// CUDA-D1S-SAME: "-x" "cuda-cpp-output"
+
 // Match the job that produces PTX assembly
-// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-D1: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D1NS-SAME: "-aux-triple" "x86_64--linux-gnu"
 // CUDA-D1-SAME: "-fcuda-is-device"
 // CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
 // CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
 // CUDA-D1NS-SAME: "-x" "cuda"
 // CUDA-D1S-SAME: "-x" "ir"
 
-// Match anothe device-side compilation
-// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// Match another device-side compilation
+// CUDA-D2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// CUDA-D2-SAME: "-aux-triple" "x86_64--linux-gnu"
 // CUDA-D2-SAME: "-fcuda-is-device"
 // CUDA-D2-SM30-SAME: "-target-cpu" "sm_30"
 // CUDA-D2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
 // CUDA-D2-SAME: "-x" "cuda"
 
 // Match no device-side compilation
-// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-ND-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // CUDA-ND-SAME-NOT: "-fcuda-is-device"
 
 // Match host-side preprocessor job with -save-temps
-// CUDA-HS: "-cc1" "-triple"
-// CUDA-HS-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-HS: "-cc1" "-triple" "x86_64--linux-gnu"
+// CUDA-HS-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
 // CUDA-HS-SAME-NOT: "-fcuda-is-device"
 // CUDA-HS-SAME: "-x" "cuda"
 
 // Match host-side compilation
-// CUDA-H: "-cc1" "-triple"
-// CUDA-H-SAME-NOT: "nvptx{{(64)?}}-nvidia-cuda"
+// CUDA-H: "-cc1" "-triple" "x86_64--linux-gnu"
+// CUDA-H-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
 // CUDA-H-SAME-NOT: "-fcuda-is-device"
 // CUDA-H-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // CUDA-HNS-SAME: "-x" "cuda"
Index: test/SemaCUDA/function-target-hd.cu
===================================================================
--- test/SemaCUDA/function-target-hd.cu
+++ test/SemaCUDA/function-target-hd.cu
@@ -8,9 +8,9 @@
 // host device functions are not allowed to call device functions.
 
 // RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -verify %s
 // RUN: %clang_cc1 -fsyntax-only -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
+// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
 
 #include "Inputs/cuda.h"
 
Index: unittests/ASTMatchers/ASTMatchersTest.h
===================================================================
--- unittests/ASTMatchers/ASTMatchersTest.h
+++ unittests/ASTMatchers/ASTMatchersTest.h
@@ -178,6 +178,7 @@
   Args.push_back("-xcuda");
   Args.push_back("-fno-ms-extensions");
   Args.push_back("--cuda-host-only");
+  Args.push_back("-nocudainc");
   Args.push_back(CompileArg);
   if (!runToolOnCodeWithArgs(Factory->create(),
                              CudaHeader + Code, Args)) {