Index: include/clang/Driver/Action.h
===================================================================
--- include/clang/Driver/Action.h
+++ include/clang/Driver/Action.h
@@ -136,7 +136,8 @@
 
 class CudaDeviceAction : public Action {
   virtual void anchor();
-  /// GPU architecture to bind.  Always of the form /sm_\d+/.
+  /// GPU architecture to bind.  Always of the form /sm_\d+/ or null (when the
+  /// action applies to multiple architectures).
   const char *GpuArchName;
   /// True when action results are not consumed by the host action (e.g when
   /// -fsyntax-only or --cuda-device-only options are used).
Index: include/clang/Driver/Options.td
===================================================================
--- include/clang/Driver/Options.td
+++ include/clang/Driver/Options.td
@@ -336,6 +336,10 @@
 def Xclang : Separate<["-"], "Xclang">,
   HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
   Flags<[DriverOption, CoreOption]>;
+def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
+  HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
+def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
+  HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
 def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
   HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">;
 def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
Index: include/clang/Driver/Types.def
===================================================================
--- include/clang/Driver/Types.def
+++ include/clang/Driver/Types.def
@@ -93,4 +93,6 @@
 TYPE("image",                    Image,        INVALID,         "out",   "")
 TYPE("dSYM",                     dSYM,         INVALID,         "dSYM",  "A")
 TYPE("dependencies",             Dependencies, INVALID,         "d",     "")
+TYPE("cuda-cubin",               CUDA_CUBIN,   INVALID,         "cubin", "A")
+TYPE("cuda-fatbin",              CUDA_FATBIN,  INVALID,         "fatbin","A")
 TYPE("none",                     Nothing,      INVALID,         nullptr, "u")
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -259,6 +259,8 @@
         TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
         llvm::ConstantStruct::get(FatbinWrapperTy, Values),
         "__cuda_fatbin_wrapper");
+    // NVIDIA's cuobjdump looks for fatbins in this section.
+    FatbinWrapper->setSection(".nvFatBinSegment");
 
     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
Index: lib/Driver/Action.cpp
===================================================================
--- lib/Driver/Action.cpp
+++ lib/Driver/Action.cpp
@@ -56,7 +56,7 @@
                                    bool AtTopLevel)
     : Action(CudaDeviceClass, Input), GpuArchName(ArchName),
       AtTopLevel(AtTopLevel) {
-  assert(IsValidGpuArchName(GpuArchName));
+  assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
 }
 
 bool CudaDeviceAction::IsValidGpuArchName(llvm::StringRef ArchName) {
Index: lib/Driver/Driver.cpp
===================================================================
--- lib/Driver/Driver.cpp
+++ lib/Driver/Driver.cpp
@@ -949,8 +949,9 @@
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->begin(), Ids) << "}";
   } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    os << '"' << CDA->getGpuArchName() << '"' << ", {"
-       << PrintActions1(C, *CDA->begin(), Ids) << "}";
+    os << '"'
+       << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+       << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
   } else {
     const ActionList *AL;
     if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
@@ -1327,7 +1328,7 @@
   // Check whether any of device actions stopped before they could generate PTX.
   bool PartialCompilation =
       llvm::any_of(CudaDeviceActions, [](const Action *a) {
-        return a->getKind() != Action::BackendJobClass;
+        return a->getKind() != Action::AssembleJobClass;
       });
 
   // Figure out what to do with device actions -- pass them as inputs to the
@@ -1356,16 +1357,32 @@
     return HostAction;
   }
 
-  // Outputs of device actions during complete CUDA compilation get created
-  // with AtTopLevel=false and become inputs for the host action.
+  // If we're not a partial or device-only compilation, we compile each arch to
+  // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+  // "link" action, which uses fatbinary to combine these cubins into one
+  // fatbin.  The fatbin is then an input to the host compilation.
   ActionList DeviceActions;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-    DeviceActions.push_back(
-        C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
-                                       /* AtTopLevel */ false));
+  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+    Action* AssembleAction = CudaDeviceActions[I];
+    assert(AssembleAction->getType() == types::TY_CUDA_CUBIN);
+    assert(AssembleAction->getInputs().size() == 1);
+
+    Action* BackendAction = AssembleAction->getInputs()[0];
+    assert(BackendAction->getType() == types::TY_PP_Asm);
+
+    for (const auto& A : {AssembleAction, BackendAction}) {
+      DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+          A, GpuArchList[I], /* AtTopLevel */ false));
+    }
+  }
+  auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+      /* GpuArchName = */ nullptr,
+      /* AtTopLevel = */ false);
   // Return a new host action that incorporates original host action and all
   // device actions.
-  return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+  return C.MakeAction<CudaHostAction>(std::move(HostAction),
+                                      ActionList({FatbinAction}));
 }
 
 void Driver::BuildActions(Compilation &C, const ToolChain &TC,
@@ -1600,7 +1617,12 @@
     return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
   }
   case phases::Assemble:
-    return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+    // Assembling generates an object file, except when we're assembling CUDA,
+    // in which case we get a cubin file.
+    return C.MakeAction<AssembleJobAction>(
+        std::move(Input), TC.getTriple().getOS() == llvm::Triple::CUDA
+                              ? types::TY_CUDA_CUBIN
+                              : types::TY_Object);
   }
 
   llvm_unreachable("invalid phase in ConstructPhaseAction");
@@ -1849,11 +1871,14 @@
   if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
     // Initial processing of CudaDeviceAction carries host params.
     // Call BuildJobsForAction() again, now with correct device parameters.
-    assert(CDA->getGpuArchName() && "No GPU name in device action.");
-    return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
-                              CDA->getGpuArchName(), CDA->isAtTopLevel(),
-                              /*MultipleArchs*/ true, LinkingOutput,
-                              CachedResults);
+    InputInfo II = BuildJobsForAction(
+        C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+        CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+        CachedResults);
+    // Set the InputInfo's Action to the CudaDeviceAction, so that one can
+    // retrieve the Input's GPU arch.
+    II.setAction(A);
+    return II;
   }
 
   const ActionList *Inputs = &A->getInputs();
Index: lib/Driver/ToolChains.h
===================================================================
--- lib/Driver/ToolChains.h
+++ lib/Driver/ToolChains.h
@@ -163,6 +163,7 @@
     bool IsValid;
     const Driver &D;
     std::string CudaInstallPath;
+    std::string CudaBinPath;
     std::string CudaLibPath;
     std::string CudaLibDevicePath;
     std::string CudaIncludePath;
@@ -179,6 +180,8 @@
 
     /// \brief Get the detected Cuda installation path.
     StringRef getInstallPath() const { return CudaInstallPath; }
+    /// \brief Get the detected path to Cuda's bin directory.
+    StringRef getBinPath() const { return CudaBinPath; }
     /// \brief Get the detected Cuda Include path.
     StringRef getIncludePath() const { return CudaIncludePath; }
     /// \brief Get the detected Cuda library path.
@@ -816,6 +819,10 @@
                 const char *BoundArch) const override;
   void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                              llvm::opt::ArgStringList &CC1Args) const override;
+
+protected:
+  Tool *buildAssembler() const override;  // ptxas
+  Tool *buildLinker() const override;     // fatbinary (ok, not really a linker)
 };
 
 class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux {
Index: lib/Driver/ToolChains.cpp
===================================================================
--- lib/Driver/ToolChains.cpp
+++ lib/Driver/ToolChains.cpp
@@ -1652,13 +1652,14 @@
       continue;
 
     CudaInstallPath = CudaPath;
+    CudaBinPath = CudaPath + "/bin";
     CudaIncludePath = CudaInstallPath + "/include";
     CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
     CudaLibPath =
         CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
 
     if (!(D.getVFS().exists(CudaIncludePath) &&
-          D.getVFS().exists(CudaLibPath) &&
+          D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
           D.getVFS().exists(CudaLibDevicePath)))
       continue;
 
@@ -4182,13 +4183,17 @@
   return new tools::dragonfly::Linker(*this);
 }
 
-/// Stub for CUDA toolchain. At the moment we don't have assembler or
-/// linker and need toolchain mainly to propagate device-side options
-/// to CC1.
+/// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
 
 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
-    : Linux(D, Triple, Args) {}
+    : Linux(D, Triple, Args) {
+  if (CudaInstallation.isValid()) {
+    getProgramPaths().push_back(CudaInstallation.getBinPath());
+  }
+}
 
 void
 CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
@@ -4222,7 +4227,7 @@
   for (Arg *A : Args) {
     if (A->getOption().matches(options::OPT_Xarch__)) {
       // Skip this argument unless the architecture matches BoundArch
-      if (A->getValue(0) != StringRef(BoundArch))
+      if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
         continue;
 
       unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
@@ -4253,10 +4258,20 @@
     DAL->append(A);
   }
 
-  DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  if (BoundArch) {
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  }
   return DAL;
 }
 
+Tool *CudaToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *CudaToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
 /// XCore tool chain
 XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
                                const ArgList &Args)
Index: lib/Driver/Tools.h
===================================================================
--- lib/Driver/Tools.h
+++ lib/Driver/Tools.h
@@ -903,6 +903,40 @@
 };
 } // end namespace PS4cpu
 
+namespace NVPTX {
+
+// Run ptxas, the NVPTX assembler.
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+ public:
+   Assembler(const ToolChain &TC)
+       : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8,
+              "--options-file") {}
+
+   bool hasIntegratedCPP() const override { return false; }
+
+   void ConstructJob(Compilation &C, const JobAction &JA,
+                     const InputInfo &Output, const InputInfoList &Inputs,
+                     const llvm::opt::ArgList &TCArgs,
+                     const char *LinkingOutput) const override;
+};
+
+// Runs fatbinary, which is sort of a linker for NVPTX.
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+ public:
+   Linker(const ToolChain &TC)
+       : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+              "--options-file") {}
+
+   bool hasIntegratedCPP() const override { return false; }
+
+   void ConstructJob(Compilation &C, const JobAction &JA,
+                     const InputInfo &Output, const InputInfoList &Inputs,
+                     const llvm::opt::ArgList &TCArgs,
+                     const char *LinkingOutput) const override;
+};
+
+}  // end namespace NVPTX
+
 } // end namespace tools
 } // end namespace driver
 } // end namespace clang
Index: lib/Driver/Tools.cpp
===================================================================
--- lib/Driver/Tools.cpp
+++ lib/Driver/Tools.cpp
@@ -10600,3 +10600,96 @@
   else
     ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
 }
+
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  std::vector<std::string> gpu_archs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string& gpu_arch = gpu_archs[0];
+
+
+  ArgStringList CmdArgs;
+  if (TC.getTriple().isArch64Bit()) {
+    CmdArgs.push_back("-m64");
+  } else {
+    CmdArgs.push_back("-m32");
+  }
+
+  // Clang's default optimization level is -O0, but ptxas's default is -O3.
+  CmdArgs.push_back(Args.MakeArgString(
+      llvm::Twine("-O") +
+      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+  // not supported at other optimization levels.
+
+  CmdArgs.push_back("--gpu-name");
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back("--output-file");
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  for (const auto& II : Inputs) {
+    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+  }
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas)) {
+    CmdArgs.push_back(Args.MakeArgString(A));
+  }
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back("--cuda");
+  if (TC.getTriple().isArch64Bit()) {
+    CmdArgs.push_back("-64");
+  } else {
+    CmdArgs.push_back("-32");
+  }
+  CmdArgs.push_back(Args.MakeArgString("--create"));
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+  for (const auto& II : Inputs) {
+    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    std::string Arch = A->getGpuArchName();
+    // We need to name pass an Arch of the form "sm_XX" for cubin files and
+    // "compute_XX" for ptx.  CudaDeviceAction's getGpuArchName() is guaranteed
+    // to match "sm_XX".
+    if (II.getType() == types::TY_PP_Asm) {
+      std::pair<StringRef, StringRef> Split = StringRef(Arch).split('_');
+      assert(Split.first == "sm");
+      Arch = ("compute_" + Split.second).str();
+    }
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+                                         Arch + ",file=" + II.getFilename()));
+  }
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary)) {
+    CmdArgs.push_back(Args.MakeArgString(A));
+  }
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
Index: lib/Driver/Types.cpp
===================================================================
--- lib/Driver/Types.cpp
+++ lib/Driver/Types.cpp
@@ -232,8 +232,7 @@
         P.push_back(phases::Compile);
         P.push_back(phases::Backend);
       }
-      if (Id != TY_CUDA_DEVICE)
-        P.push_back(phases::Assemble);
+      P.push_back(phases::Assemble);
     }
   }
 
Index: test/Driver/cuda-external-tools.cu
===================================================================
--- /dev/null
+++ test/Driver/cuda-external-tools.cu
@@ -0,0 +1,66 @@
+// Tests that ptxas and fatbinary are correctly during CUDA compilation.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Regular compile with -O2.
+// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile without -O.  This should result in us passing -O0 to ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Regular compile targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+
+// 32-bit compile.
+// RUN: %clang -### -target x86_32-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
+// Check -Xcuda-ptxas and -Xcuda-fatbinary
+// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
+// RUN:   -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
+// RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
+// RUN:   -check-prefix FATBINARY-EXTRA %s
+
+// Match clang job that produces PTX assembly.
+// CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20: "-target-cpu" "sm_20"
+// SM35: "-target-cpu" "sm_35"
+// SM20: "-o" "[[PTXFILE:[^"]*]]"
+// SM35: "-o" "[[PTXFILE:[^"]*]]"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// CHECK:ptxas
+// ARCH64: "-m64"
+// ARCH32: "-m32"
+// OPT0: "-O0"
+// OPT2: "-O2"
+// SM20: "--gpu-name" "sm_20"
+// SM35: "--gpu-name" "sm_35"
+// SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
+// SM35: "--output-file" "[[CUBINFILE:[^"]*]]"
+// PTXAS-EXTRA: "-foo1"
+// PTXAS-EXTRA-SAME: "-foo2"
+// CHECK-SAME: "[[PTXFILE]]"
+
+// Match the call to fatbinary (which combines all our PTX and SASS into one
+// blob).
+// CHECK:fatbinary
+// CHECK-DAG: "--cuda"
+// ARCH64-DAG: "-64"
+// ARCH32-DAG: "-32"
+// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// SM20-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
+// SM35-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
+// SM20-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
+// SM35-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
+// FATBINARY-EXTRA: "-bar1"
+// FATBINARY-EXTRA-SAME: "-bar2"
+
+// Match the clang job for host compilation.
+// CHECK: "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
Index: test/Driver/cuda-options.cu
===================================================================
--- test/Driver/cuda-options.cu
+++ test/Driver/cuda-options.cu
@@ -54,7 +54,7 @@
 // RUN:    -check-prefix DEVICE2 -check-prefix DEVICE-SM35 \
 // RUN:    -check-prefix DEVICE2-SM30 -check-prefix HOST \
 // RUN:    -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix INCLUDES-DEVICE2 -check-prefix NOLINK %s
+// RUN:    -check-prefix NOLINK %s
 
 // Verify that device-side results are passed to the correct tool when
 // -save-temps is used.
@@ -85,10 +85,16 @@
 // DEVICE-NOSAVE-SAME: "-aux-triple" "x86_64--linux-gnu"
 // DEVICE-SAME: "-fcuda-is-device"
 // DEVICE-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
+// DEVICE-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // DEVICE-NOSAVE-SAME: "-x" "cuda"
 // DEVICE-SAVE-SAME: "-x" "ir"
 
+// Match the call to ptxas (which assembles PTX to SASS).
+// DEVICE:ptxas
+// DEVICE-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
+// DEVICE-DAG: "[[PTXFILE]]"
+
 // Match another device-side compilation.
 // DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
@@ -101,6 +107,11 @@
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // NODEVICE-SAME-NOT: "-fcuda-is-device"
 
+// INCLUDES-DEVICE:fatbinary
+// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
@@ -114,8 +125,7 @@
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
-// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
-// INCLUDES-DEVICE2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
 
 // Match external assembler that uses compilation output.
 // HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"