Index: clang/include/clang/Driver/ToolChain.h =================================================================== --- clang/include/clang/Driver/ToolChain.h +++ clang/include/clang/Driver/ToolChain.h @@ -38,6 +38,7 @@ namespace driver { class Compilation; + class CudaInstallationDetector; class Driver; class JobAction; class RegisterEffectiveTriple; Index: clang/lib/Driver/Driver.cpp =================================================================== --- clang/lib/Driver/Driver.cpp +++ clang/lib/Driver/Driver.cpp @@ -470,14 +470,18 @@ if (llvm::any_of(Inputs, [](std::pair<types::ID, const llvm::opt::Arg *> &I) { return types::isCuda(I.first); })) { - const ToolChain &TC = getToolChain( - C.getInputArgs(), - llvm::Triple(C.getSingleOffloadToolChain<Action::OFK_Host>() - ->getTriple() - .isArch64Bit() - ? "nvptx64-nvidia-cuda" - : "nvptx-nvidia-cuda")); - C.addOffloadDeviceToolChain(&TC, Action::OFK_Cuda); + const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>(); + const llvm::Triple &HostTriple = HostTC->getTriple(); + llvm::Triple CudaTriple(HostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" + : "nvptx-nvidia-cuda"); + // Use the CUDA and host triples as the key into the ToolChains map, because + // the device toolchain we create depends on both. + ToolChain *&CudaTC = ToolChains[CudaTriple.str() + "/" + HostTriple.str()]; + if (!CudaTC) { + CudaTC = new toolchains::CudaToolChain(*this, CudaTriple, *HostTC, + C.getInputArgs()); + } + C.addOffloadDeviceToolChain(CudaTC, Action::OFK_Cuda); } // @@ -3602,9 +3606,6 @@ break; } break; - case llvm::Triple::CUDA: - TC = new toolchains::CudaToolChain(*this, Target, Args); - break; case llvm::Triple::PS4: TC = new toolchains::PS4CPU(*this, Target, Args); break; @@ -3646,6 +3647,12 @@ } } } + + // Intentionally omitted from the switch above: llvm::Triple::CUDA. CUDA + // compiles always need two toolchains, the CUDA toolchain and the host + // toolchain. So the only valid way to create a CUDA toolchain is via + // CreateOffloadingDeviceToolChains. 
+ return *TC; } Index: clang/lib/Driver/ToolChains.h =================================================================== --- clang/lib/Driver/ToolChains.h +++ clang/lib/Driver/ToolChains.h @@ -24,6 +24,60 @@ namespace clang { namespace driver { + +/// A class to find a viable CUDA installation +class CudaInstallationDetector { +private: + const Driver &D; + bool IsValid = false; + CudaVersion Version = CudaVersion::UNKNOWN; + std::string InstallPath; + std::string BinPath; + std::string LibPath; + std::string LibDevicePath; + std::string IncludePath; + llvm::StringMap<std::string> LibDeviceMap; + + // CUDA architectures for which we have raised an error in + // CheckCudaVersionSupportsArch. + mutable llvm::SmallSet<CudaArch, 4> ArchsWithVersionTooLowErrors; + +public: + CudaInstallationDetector(const Driver &D, const llvm::Triple &Triple, + const llvm::opt::ArgList &Args); + + void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const; + + /// \brief Emit an error if Version does not support the given Arch. + /// + /// If either Version or Arch is unknown, does not emit an error. Emits at + /// most one error per Arch. + void CheckCudaVersionSupportsArch(CudaArch Arch) const; + + /// \brief Check whether we detected a valid Cuda install. + bool isValid() const { return IsValid; } + /// \brief Print information about the detected CUDA installation. + void print(raw_ostream &OS) const; + + /// \brief Get the detected Cuda install's version. + CudaVersion version() const { return Version; } + /// \brief Get the detected Cuda installation path. + StringRef getInstallPath() const { return InstallPath; } + /// \brief Get the detected path to Cuda's bin directory. + StringRef getBinPath() const { return BinPath; } + /// \brief Get the detected Cuda Include path. + StringRef getIncludePath() const { return IncludePath; } + /// \brief Get the detected Cuda library path. 
+ StringRef getLibPath() const { return LibPath; } + /// \brief Get the detected Cuda device library path. + StringRef getLibDevicePath() const { return LibDevicePath; } + /// \brief Get libdevice file for given architecture + std::string getLibDeviceFile(StringRef Gpu) const { + return LibDeviceMap.lookup(Gpu); + } +}; + namespace toolchains { /// Generic_GCC - A tool chain using the 'gcc' command to perform @@ -157,57 +211,6 @@ protected: GCCInstallationDetector GCCInstallation; - - // \brief A class to find a viable CUDA installation - class CudaInstallationDetector { - private: - const Driver &D; - bool IsValid = false; - CudaVersion Version = CudaVersion::UNKNOWN; - std::string InstallPath; - std::string BinPath; - std::string LibPath; - std::string LibDevicePath; - std::string IncludePath; - llvm::StringMap<std::string> LibDeviceMap; - - // CUDA architectures for which we have raised an error in - // CheckCudaVersionSupportsArch. - mutable llvm::SmallSet<CudaArch, 4> ArchsWithVersionTooLowErrors; - - public: - CudaInstallationDetector(const Driver &D) : D(D) {} - void init(const llvm::Triple &TargetTriple, const llvm::opt::ArgList &Args); - - /// \brief Emit an error if Version does not support the given Arch. - /// - /// If either Version or Arch is unknown, does not emit an error. Emits at - /// most one error per Arch. - void CheckCudaVersionSupportsArch(CudaArch Arch) const; - - /// \brief Check whether we detected a valid Cuda install. - bool isValid() const { return IsValid; } - /// \brief Print information about the detected CUDA installation. - void print(raw_ostream &OS) const; - - /// \brief Get the detected Cuda install's version. - CudaVersion version() const { return Version; } - /// \brief Get the detected Cuda installation path. - StringRef getInstallPath() const { return InstallPath; } - /// \brief Get the detected path to Cuda's bin directory. - StringRef getBinPath() const { return BinPath; } - /// \brief Get the detected Cuda Include path. 
- StringRef getIncludePath() const { return IncludePath; } - /// \brief Get the detected Cuda library path. - StringRef getLibPath() const { return LibPath; } - /// \brief Get the detected Cuda device library path. - StringRef getLibDevicePath() const { return LibDevicePath; } - /// \brief Get libdevice file for given architecture - std::string getLibDeviceFile(StringRef Gpu) const { - return LibDeviceMap.lookup(Gpu); - } - }; - CudaInstallationDetector CudaInstallation; public: @@ -403,6 +406,8 @@ /// The OS version we are targeting. mutable VersionTuple TargetVersion; + CudaInstallationDetector CudaInstallation; + private: void AddDeploymentTarget(llvm::opt::DerivedArgList &Args) const; @@ -543,6 +548,9 @@ ObjCRuntime getDefaultObjCRuntime(bool isNonFragile) const override; bool hasBlocksRuntime() const override; + void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + bool UseObjCMixedDispatch() const override { // This is only used with the non-fragile ABI and non-legacy dispatch. @@ -572,6 +580,8 @@ bool SupportsEmbeddedBitcode() const override; SanitizerMask getSupportedSanitizers() const override; + + void printVerboseInfo(raw_ostream &OS) const override; }; /// DarwinClang - The Darwin toolchain used by Clang. @@ -867,10 +877,10 @@ Tool *buildLinker() const override; }; -class LLVM_LIBRARY_VISIBILITY CudaToolChain : public Linux { +class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain { public: CudaToolChain(const Driver &D, const llvm::Triple &Triple, - const llvm::opt::ArgList &Args); + const ToolChain &HostTC, const llvm::opt::ArgList &Args); llvm::opt::DerivedArgList * TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch, @@ -881,16 +891,29 @@ // Never try to use the integrated assembler with CUDA; always fork out to // ptxas. 
bool useIntegratedAs() const override { return false; } + bool isCrossCompiling() const override { return true; } + bool isPICDefault() const override { return false; } + bool isPIEDefault() const override { return false; } + bool isPICDefaultForced() const override { return false; } + bool SupportsProfiling() const override { return false; } + bool SupportsObjCGC() const override { return false; } void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; - const Generic_GCC::CudaInstallationDetector &cudaInstallation() const { - return CudaInstallation; - } - Generic_GCC::CudaInstallationDetector &cudaInstallation() { - return CudaInstallation; - } + void addClangWarningOptions(llvm::opt::ArgStringList &CC1Args) const override; + CXXStdlibType GetCXXStdlibType(const llvm::opt::ArgList &Args) const override; + void + AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + void AddClangCXXStdlibIncludeArgs( + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CC1Args) const override; + void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; + + const ToolChain &HostTC; + CudaInstallationDetector CudaInstallation; protected: Tool *buildAssembler() const override; // ptxas Index: clang/lib/Driver/ToolChains.cpp =================================================================== --- clang/lib/Driver/ToolChains.cpp +++ clang/lib/Driver/ToolChains.cpp @@ -52,7 +52,8 @@ /// Darwin - Darwin tool chain for i386 and x86_64. 
Darwin::Darwin(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : MachO(D, Triple, Args), TargetInitialized(false) {} + : MachO(D, Triple, Args), TargetInitialized(false), + CudaInstallation(D, Triple, Args) {} types::ID MachO::LookupTypeForExtension(StringRef Ext) const { types::ID Ty = types::lookupTypeForExtension(Ext); @@ -99,6 +100,11 @@ } } +void Darwin::AddCudaIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); +} + // This is just a MachO name translation routine and there's no // way to join this into ARMTargetParser without breaking all // other assumptions. Maybe MachO should consider standardising @@ -1296,6 +1302,10 @@ return Res; } +void Darwin::printVerboseInfo(raw_ostream &OS) const { + CudaInstallation.print(OS); +} + /// Generic_GCC - A tool chain using the 'gcc' command to perform /// all subcommands; this relies on gcc translating the majority of /// command line options. @@ -1811,10 +1821,10 @@ return CudaVersion::UNKNOWN; } -// \brief -- try common CUDA installation paths looking for files we need for -// CUDA compilation. -void Generic_GCC::CudaInstallationDetector::init( - const llvm::Triple &TargetTriple, const llvm::opt::ArgList &Args) { +CudaInstallationDetector::CudaInstallationDetector( + const Driver &D, const llvm::Triple &TargetTriple, + const llvm::opt::ArgList &Args) + : D(D) { SmallVector<std::string, 4> CudaPathCandidates; if (Args.hasArg(options::OPT_cuda_path_EQ)) @@ -1835,13 +1845,25 @@ BinPath = CudaPath + "/bin"; IncludePath = InstallPath + "/include"; LibDevicePath = InstallPath + "/nvvm/libdevice"; - LibPath = InstallPath + (TargetTriple.isArch64Bit() ? 
"/lib64" : "/lib"); auto &FS = D.getVFS(); - if (!(FS.exists(IncludePath) && FS.exists(BinPath) && FS.exists(LibPath) && + if (!(FS.exists(IncludePath) && FS.exists(BinPath) && FS.exists(LibDevicePath))) continue; + // On Linux, we have both lib and lib64 directories, and we need to choose + // based on our triple. On MacOS, we have only a lib directory. + // + // It's sufficient for our purposes to be flexible: If both lib and lib64 + // exist, we choose whichever one matches our triple. Otherwise, if only + // lib exists, we use it. + if (TargetTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64")) + LibPath = InstallPath + "/lib64"; + else if (FS.exists(InstallPath + "/lib")) + LibPath = InstallPath + "/lib"; + else + continue; + llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile = FS.getBufferForFile(InstallPath + "/version.txt"); if (!VersionFile) { @@ -1898,7 +1920,33 @@ } } -void Generic_GCC::CudaInstallationDetector::CheckCudaVersionSupportsArch( +void CudaInstallationDetector::AddCudaIncludeArgs( + const ArgList &DriverArgs, ArgStringList &CC1Args) const { + if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { + // Add cuda_wrappers/* to our system include path. This lets us wrap + // standard library headers. 
+ SmallString<128> P(D.ResourceDir); + llvm::sys::path::append(P, "include"); + llvm::sys::path::append(P, "cuda_wrappers"); + CC1Args.push_back("-internal-isystem"); + CC1Args.push_back(DriverArgs.MakeArgString(P)); + } + + if (DriverArgs.hasArg(options::OPT_nocudainc)) + return; + + if (!isValid()) { + D.Diag(diag::err_drv_no_cuda_installation); + return; + } + + CC1Args.push_back("-internal-isystem"); + CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath())); + CC1Args.push_back("-include"); + CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); +} + +void CudaInstallationDetector::CheckCudaVersionSupportsArch( CudaArch Arch) const { if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN || ArchsWithVersionTooLowErrors.count(Arch) > 0) @@ -1913,7 +1961,7 @@ } } -void Generic_GCC::CudaInstallationDetector::print(raw_ostream &OS) const { +void CudaInstallationDetector::print(raw_ostream &OS) const { if (isValid()) OS << "Found CUDA installation: " << InstallPath << ", version " << CudaVersionToString(Version) << "\n"; @@ -2756,7 +2804,8 @@ Generic_GCC::Generic_GCC(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) - : ToolChain(D, Triple, Args), GCCInstallation(D), CudaInstallation(D) { + : ToolChain(D, Triple, Args), GCCInstallation(D), + CudaInstallation(D, Triple, Args) { getProgramPaths().push_back(getDriver().getInstalledDir()); if (getDriver().getInstalledDir() != getDriver().Dir) getProgramPaths().push_back(getDriver().Dir); @@ -4162,7 +4211,6 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : Generic_ELF(D, Triple, Args) { GCCInstallation.init(Triple, Args); - CudaInstallation.init(Triple, Args); Multilibs = GCCInstallation.getMultilibs(); llvm::Triple::ArchType Arch = Triple.getArch(); std::string SysRoot = computeSysRoot(); @@ -4767,26 +4815,7 @@ void Linux::AddCudaIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) { - 
// Add cuda_wrappers/* to our system include path. This lets us wrap - // standard library headers. - SmallString<128> P(getDriver().ResourceDir); - llvm::sys::path::append(P, "include"); - llvm::sys::path::append(P, "cuda_wrappers"); - addSystemInclude(DriverArgs, CC1Args, P); - } - - if (DriverArgs.hasArg(options::OPT_nocudainc)) - return; - - if (!CudaInstallation.isValid()) { - getDriver().Diag(diag::err_drv_no_cuda_installation); - return; - } - - addSystemInclude(DriverArgs, CC1Args, CudaInstallation.getIncludePath()); - CC1Args.push_back("-include"); - CC1Args.push_back("__clang_cuda_runtime_wrapper.h"); + CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); } void Linux::AddIAMCUIncludeArgs(const ArgList &DriverArgs, @@ -4968,16 +4997,18 @@ /// together object files from the assembler into a single blob. CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple, - const ArgList &Args) - : Linux(D, Triple, Args) { + const ToolChain &HostTC, const ArgList &Args) + : ToolChain(D, Triple, Args), HostTC(HostTC), + CudaInstallation(D, Triple, Args) { if (CudaInstallation.isValid()) getProgramPaths().push_back(CudaInstallation.getBinPath()); } -void -CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, - llvm::opt::ArgStringList &CC1Args) const { - Linux::addClangTargetOptions(DriverArgs, CC1Args); +void CudaToolChain::addClangTargetOptions( + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const { + HostTC.addClangTargetOptions(DriverArgs, CC1Args); + CC1Args.push_back("-fcuda-is-device"); if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero, @@ -5019,13 +5050,18 @@ assert(!Arch.empty() && "Must have an explicit GPU arch."); CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch)); } - Linux::AddCudaIncludeArgs(DriverArgs, CC1Args); + CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); } llvm::opt::DerivedArgList * CudaToolChain::TranslateArgs(const 
llvm::opt::DerivedArgList &Args, - StringRef BoundArch, Action::OffloadKind) const { - DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs()); + StringRef BoundArch, + Action::OffloadKind DeviceOffloadKind) const { + DerivedArgList *DAL = + HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind); + if (!DAL) + DAL = new DerivedArgList(Args.getBaseArgs()); + const OptTable &Opts = getDriver().getOpts(); for (Arg *A : Args) { @@ -5077,6 +5113,30 @@ return new tools::NVPTX::Linker(*this); } +void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const { + HostTC.addClangWarningOptions(CC1Args); +} + +ToolChain::CXXStdlibType +CudaToolChain::GetCXXStdlibType(const ArgList &Args) const { + return HostTC.GetCXXStdlibType(Args); +} + +void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args); +} + +void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args, + ArgStringList &CC1Args) const { + HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args); +} + +void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args, + ArgStringList &CC1Args) const { + HostTC.AddIAMCUIncludeArgs(Args, CC1Args); +} + /// XCore tool chain XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) Index: clang/lib/Driver/Tools.cpp =================================================================== --- clang/lib/Driver/Tools.cpp +++ clang/lib/Driver/Tools.cpp @@ -11981,7 +11981,7 @@ // Check that our installation's ptxas supports gpu_arch. 
if (!Args.hasArg(options::OPT_no_cuda_version_check)) { - TC.cudaInstallation().CheckCudaVersionSupportsArch(gpu_arch); + TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch); } ArgStringList CmdArgs; Index: clang/test/Driver/cuda-detect.cu =================================================================== --- clang/test/Driver/cuda-detect.cu +++ clang/test/Driver/cuda-detect.cu @@ -5,10 +5,18 @@ // # Check that we properly detect CUDA installation. // RUN: %clang -v --target=i386-unknown-linux \ // RUN: --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA +// RUN: %clang -v --target=i386-apple-macosx \ +// RUN: --sysroot=%S/no-cuda-there 2>&1 | FileCheck %s -check-prefix NOCUDA + // RUN: %clang -v --target=i386-unknown-linux \ // RUN: --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s +// RUN: %clang -v --target=i386-apple-macosx \ +// RUN: --sysroot=%S/Inputs/CUDA 2>&1 | FileCheck %s + // RUN: %clang -v --target=i386-unknown-linux \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s +// RUN: %clang -v --target=i386-apple-macosx \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda 2>&1 | FileCheck %s // Make sure we map libdevice bitcode files to proper GPUs. These // tests use Inputs/CUDA_80 which has full set of libdevice files. @@ -51,33 +59,51 @@ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE50 - // Verify that -nocudainc prevents adding include path to CUDA headers. 
// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ // RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ +// RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \ +// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 + // We should not add any CUDA include paths if there's no valid CUDA installation // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ // RUN: --cuda-path=%S/no-cuda-there %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ +// RUN: --cuda-path=%S/no-cuda-there %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC // Verify that we get an error if there's no libdevice library to link with. // NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20 for this purpose. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_20 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE // Verify that -nocudalib prevents linking libdevice bitcode in. 
// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ // RUN: -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOLIBDEVICE +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ +// RUN: -nocudalib --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOLIBDEVICE + // Verify that we don't add include paths, link with libdevice or // -include __clang_cuda_runtime_wrapper.h without valid CUDA installation. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ // RUN: --cuda-path=%S/no-cuda-there %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE +// RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ +// RUN: --cuda-path=%S/no-cuda-there %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON \ +// RUN: -check-prefix NOCUDAINC -check-prefix NOLIBDEVICE // Verify that C++ include paths are passed for both host and device frontends. // RUN: %clang -### -no-canonical-prefixes -target x86_64-linux-gnu %s \ Index: clang/test/Driver/cuda-external-tools.cu =================================================================== --- clang/test/Driver/cuda-external-tools.cu +++ clang/test/Driver/cuda-external-tools.cu @@ -1,4 +1,5 @@ -// Tests that ptxas and fatbinary are correctly during CUDA compilation. +// Tests that ptxas and fatbinary are invoked correctly during CUDA +// compilation. 
// // REQUIRES: clang-driver // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target // @@ -56,6 +57,14 @@ // RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \ // RUN: -check-prefix FATBINARY-EXTRA %s +// MacOS spot-checks +// RUN: %clang -### -target x86_64-apple-macosx -O0 -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s +// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s +// RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \ +// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s + // Match clang job that produces PTX assembly. // CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda" // SM20: "-target-cpu" "sm_20" Index: clang/test/Driver/cuda-macosx.cu =================================================================== --- /dev/null +++ clang/test/Driver/cuda-macosx.cu @@ -0,0 +1,8 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target +// +// RUN: %clang -v --target=i386-apple-macosx \ +// RUN: --sysroot=%S/Inputs/CUDA-macosx 2>&1 | FileCheck %s + +// CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA-macosx/usr/local/cuda