Index: cfe/trunk/include/clang/Frontend/CodeGenOptions.h
===================================================================
--- cfe/trunk/include/clang/Frontend/CodeGenOptions.h
+++ cfe/trunk/include/clang/Frontend/CodeGenOptions.h
@@ -205,10 +205,9 @@
   /// the summary and module symbol table (and not, e.g. any debug metadata).
   std::string ThinLinkBitcodeFile;
 
-  /// A list of file names passed with -fcuda-include-gpubinary options to
-  /// forward to CUDA runtime back-end for incorporating them into host-side
-  /// object file.
-  std::vector<std::string> CudaGpuBinaryFileNames;
+  /// Name of file passed with -fcuda-include-gpubinary option to forward to
+  /// CUDA runtime back-end for incorporating it into the host-side object file.
+  std::string CudaGpuBinaryFileName;
 
   /// The name of the file to which the backend should save YAML optimization
   /// records.
Index: cfe/trunk/lib/CodeGen/CGCUDANV.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGCUDANV.cpp
+++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp
@@ -41,10 +41,10 @@
   /// Keeps track of kernel launch stubs emitted in this module
   llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
   llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
-  /// Keeps track of variables containing handles of GPU binaries. Populated by
+  /// Tracks the global variable holding the GPU binary handle. Populated by
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
-  llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
@@ -245,16 +245,14 @@
 /// Creates a global constructor function for the module:
 /// \code
 /// void __cuda_module_ctor(void*) {
-///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-///     __cuda_register_globals(Handle0);
-///     ...
-///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-///     __cuda_register_globals(HandleN);
+///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
+///     __cuda_register_globals(Handle);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
-  // No need to generate ctors/dtors if there are no GPU binaries.
-  if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
+  // No need to generate ctors/dtors if there is no GPU binary.
+  std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+  if (GpuBinaryFileName.empty())
     return nullptr;
 
   // void __cuda_register_globals(void* handle);
@@ -267,6 +265,18 @@
   llvm::StructType *FatbinWrapperTy =
       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
 
+  // Register GPU binary with the CUDA runtime, store returned handle in a
+  // global variable and save a reference in GpuBinaryHandle to be cleaned up
+  // in destructor on exit. Then associate all known kernels with the GPU binary
+  // handle so CUDA runtime can figure out what to call on the GPU side.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
+      llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
+  if (std::error_code EC = GpuBinaryOrErr.getError()) {
+    CGM.getDiags().Report(diag::err_cannot_open_file)
+        << GpuBinaryFileName << EC.message();
+    return nullptr;
+  }
+
   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
       llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
@@ -276,79 +286,56 @@
 
   CtorBuilder.SetInsertPoint(CtorEntryBB);
 
-  // For each GPU binary, register it with the CUDA runtime and store returned
-  // handle in a global variable and save the handle in GpuBinaryHandles vector
-  // to be cleaned up in destructor on exit. Then associate all known kernels
-  // with the GPU binary handle so CUDA runtime can figure out what to call on
-  // the GPU side.
-  for (const std::string &GpuBinaryFileName :
-       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
-        llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
-    if (std::error_code EC = GpuBinaryOrErr.getError()) {
-      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
-                                                        << EC.message();
-      continue;
-    }
-
-    const char *FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
-    // NVIDIA's cuobjdump looks for fatbins in this section.
-    const char *FatbinSectionName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
-
-    // Create initialized wrapper structure that points to the loaded GPU binary
-    ConstantInitBuilder Builder(CGM);
-    auto Values = Builder.beginStruct(FatbinWrapperTy);
-    // Fatbin wrapper magic.
-    Values.addInt(IntTy, 0x466243b1);
-    // Fatbin version.
-    Values.addInt(IntTy, 1);
-    // Data.
-    Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), 
-                                  "", FatbinConstantName, 8));
-    // Unused in fatbin v1.
-    Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
-    llvm::GlobalVariable *FatbinWrapper =
-      Values.finishAndCreateGlobal("__cuda_fatbin_wrapper",
-                                   CGM.getPointerAlign(),
-                                   /*constant*/ true);
-    FatbinWrapper->setSection(FatbinSectionName);
-
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
-    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
-        RegisterFatbinFunc,
-        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
-    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
-        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
-    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
-                                   CGM.getPointerAlign());
-
-    // Call __cuda_register_globals(GpuBinaryHandle);
-    if (RegisterGlobalsFunc)
-      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
-
-    // Save GpuBinaryHandle so we can unregister it in destructor.
-    GpuBinaryHandles.push_back(GpuBinaryHandle);
-  }
+  const char *FatbinConstantName =
+      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  // NVIDIA's cuobjdump looks for fatbins in this section.
+  const char *FatbinSectionName =
+      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
+  // Create initialized wrapper structure that points to the loaded GPU binary
+  ConstantInitBuilder Builder(CGM);
+  auto Values = Builder.beginStruct(FatbinWrapperTy);
+  // Fatbin wrapper magic.
+  Values.addInt(IntTy, 0x466243b1);
+  // Fatbin version.
+  Values.addInt(IntTy, 1);
+  // Data.
+  Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
+                                FatbinConstantName, 8));
+  // Unused in fatbin v1.
+  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
+  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
+      "__cuda_fatbin_wrapper", CGM.getPointerAlign(),
+      /*constant*/ true);
+  FatbinWrapper->setSection(FatbinSectionName);
+
+  // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+  llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+      RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+  GpuBinaryHandle = new llvm::GlobalVariable(
+      TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+  CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+                                 CGM.getPointerAlign());
+
+  // Call __cuda_register_globals(GpuBinaryHandle);
+  if (RegisterGlobalsFunc)
+    CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
 
   CtorBuilder.CreateRetVoid();
   return ModuleCtorFunc;
 }
 
-/// Creates a global destructor function that unregisters all GPU code blobs
+/// Creates a global destructor function that unregisters the GPU code blob
 /// registered by constructor.
 /// \code
 /// void __cuda_module_dtor(void*) {
-///     __cudaUnregisterFatBinary(Handle0);
-///     ...
-///     __cudaUnregisterFatBinary(HandleN);
+///     __cudaUnregisterFatBinary(Handle);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
-  // No need for destructor if we don't have handles to unregister.
-  if (GpuBinaryHandles.empty())
+  // No need for destructor if we don't have a handle to unregister.
+  if (!GpuBinaryHandle)
     return nullptr;
 
   // void __cudaUnregisterFatBinary(void ** handle);
@@ -364,11 +351,9 @@
   CGBuilderTy DtorBuilder(CGM, Context);
   DtorBuilder.SetInsertPoint(DtorEntryBB);
 
-  for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
-    auto HandleValue =
+  auto HandleValue =
       DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
-    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
-  }
+  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
 
   DtorBuilder.CreateRetVoid();
   return ModuleDtorFunc;
Index: cfe/trunk/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- cfe/trunk/lib/Driver/ToolChains/Clang.cpp
+++ cfe/trunk/lib/Driver/ToolChains/Clang.cpp
@@ -4677,13 +4677,12 @@
   }
 
   if (IsCuda) {
-    // Host-side cuda compilation receives device-side outputs as Inputs[1...].
-    // Include them with -fcuda-include-gpubinary.
+    // Host-side cuda compilation receives all device-side outputs in a single
+    // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
     if (Inputs.size() > 1) {
-      for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
-        CmdArgs.push_back("-fcuda-include-gpubinary");
-        CmdArgs.push_back(I->getFilename());
-      }
+      assert(Inputs.size() == 2 && "More than one GPU binary!");
+      CmdArgs.push_back("-fcuda-include-gpubinary");
+      CmdArgs.push_back(Inputs[1].getFilename());
     }
 
     if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
Index: cfe/trunk/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- cfe/trunk/lib/Frontend/CompilerInvocation.cpp
+++ cfe/trunk/lib/Frontend/CompilerInvocation.cpp
@@ -1046,8 +1046,8 @@
                       Args.getAllArgValues(OPT_fsanitize_trap_EQ), Diags,
                       Opts.SanitizeTrap);
 
-  Opts.CudaGpuBinaryFileNames =
-      Args.getAllArgValues(OPT_fcuda_include_gpubinary);
+  Opts.CudaGpuBinaryFileName =
+      Args.getLastArgValue(OPT_fcuda_include_gpubinary);
 
   Opts.Backchain = Args.hasArg(OPT_mbackchain);
 
Index: cfe/trunk/test/Driver/cuda-options.cu
===================================================================
--- cfe/trunk/test/Driver/cuda-options.cu
+++ cfe/trunk/test/Driver/cuda-options.cu
@@ -73,11 +73,10 @@
 // and that all results are included on the host side.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN:    -check-prefix DEVICE2 -check-prefix DEVICE-SM30 \
-// RUN:    -check-prefix DEVICE2-SM35 -check-prefix HOST \
-// RUN:    -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix NOLINK %s
+// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \
+// RUN:             -check-prefixes DEVICE-SM30,DEVICE2-SM35 \
+// RUN:             -check-prefixes INCLUDES-DEVICE,INCLUDES-DEVICE2 \
+// RUN:             -check-prefixes HOST,HOST-NOSAVE,NOLINK %s
 
 // Verify that device-side results are passed to the correct tool when
 // -save-temps is used.
@@ -182,9 +181,15 @@
 // DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
 // DEVICE2-SAME: "-fcuda-is-device"
 // DEVICE2-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
+// DEVICE2-SAME: "-o" "[[PTXFILE2:[^"]*]]"
 // DEVICE2-SAME: "-x" "cuda"
 
+// Match another call to ptxas.
+// DEVICE2: ptxas
+// DEVICE2-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE2-DAG: "--output-file" "[[CUBINFILE2:[^"]*]]"
+// DEVICE2-DAG: "[[PTXFILE2]]"
+
 // Match no device-side compilation.
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // NODEVICE-NOT: "-fcuda-is-device"
@@ -193,6 +198,8 @@
 // INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
 // INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
 // INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+// INCLUDES-DEVICE2-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE2]]"
+// INCLUDES-DEVICE2-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE2]]"
 
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
@@ -207,7 +214,11 @@
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
+// fatbinary combines all device-side outputs, so only one GPU binary is passed.
+// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
 // INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// No additional GPU binary may follow the single fatbinary output.
+// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
 
 // Match external assembler that uses compilation output.
 // HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"