Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Path.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -244,7 +245,7 @@
 
 /// Creates a global constructor function for the module:
 /// \code
-/// void __cuda_module_ctor(void*) {
+/// void __cuda_module_ctor_<module name>(void*) {
 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
 ///     __cuda_register_globals(Handle);
 /// }
@@ -277,9 +278,26 @@
     return nullptr;
   }
 
+  // Get the module name to generate a unique ctor name for every module.
+  const SmallString<128> ModuleName
+      = llvm::sys::path::filename(CGM.getModule().getName());
+  SmallString<128> CtorSuffix("");
+  if (!ModuleName.empty()){
+    CtorSuffix.append("_");
+    CtorSuffix.append(ModuleName);
+  }
+
+  for (size_t i = 0; i < CtorSuffix.size(); ++i) {
+    // Replace everything that's not [a-zA-Z0-9._] with a _. This set happens
+    // to be the set of C preprocessing numbers.
+    if (!isPreprocessingNumberBody(CtorSuffix[i]))
+      CtorSuffix[i] = '_';
+  }
+
   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
+      llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor" + CtorSuffix,
+      &TheModule);
   llvm::BasicBlock *CtorEntryBB =
       llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
   CGBuilderTy CtorBuilder(CGM, Context);
@@ -329,7 +347,7 @@
 
 /// Creates a global destructor function that unregisters the GPU code blob
 /// registered by constructor.
 /// \code
-/// void __cuda_module_dtor(void*) {
+/// void __cuda_module_dtor_<module name>(void*) {
 ///     __cudaUnregisterFatBinary(Handle);
 /// }
@@ -343,9 +361,26 @@
       llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
       "__cudaUnregisterFatBinary");
 
+  // Get the module name to generate a unique dtor name for every module.
+  const SmallString<128> ModuleName
+      = llvm::sys::path::filename(CGM.getModule().getName());
+  SmallString<128> DtorSuffix("");
+  if (!ModuleName.empty()){
+    DtorSuffix.append("_");
+    DtorSuffix.append(ModuleName);
+  }
+
+  for (size_t i = 0; i < DtorSuffix.size(); ++i) {
+    // Replace everything that's not [a-zA-Z0-9._] with a _. This set happens
+    // to be the set of C preprocessing numbers.
+    if (!isPreprocessingNumberBody(DtorSuffix[i]))
+      DtorSuffix[i] = '_';
+  }
+
   llvm::Function *ModuleDtorFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
+      llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor" + DtorSuffix,
+      &TheModule);
   llvm::BasicBlock *DtorEntryBB =
       llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
   CGBuilderTy DtorBuilder(CGM, Context);
Index: unittests/CodeGen/IncrementalProcessingTest.cpp
===================================================================
--- unittests/CodeGen/IncrementalProcessingTest.cpp
+++ unittests/CodeGen/IncrementalProcessingTest.cpp
@@ -21,9 +21,11 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Target/TargetOptions.h"
 #include "gtest/gtest.h"
 
 #include <memory>
+#include <array>
 
 using namespace llvm;
 using namespace clang;
@@ -171,4 +173,122 @@
 }
 
+
+// In CUDA incremental processing, a CUDA ctor or dtor will be generated for
+// every statement if a fatbinary file exists.
+const char CUDATestProgram1[] =
+    "void cudaFunc1(){}\n";
+
+const char CUDATestProgram2[] =
+    "void cudaFunc2(){}\n";
+
+const Function* getCUDActor(llvm::Module& M) {
+  for (const auto& Func: M)
+    if (Func.hasName() && Func.getName().startswith("__cuda_module_ctor_"))
+      return &Func;
+
+  return nullptr;
+}
+
+const Function* getCUDAdtor(llvm::Module& M) {
+  for (const auto& Func: M)
+    if (Func.hasName() && Func.getName().startswith("__cuda_module_dtor_"))
+      return &Func;
+
+  return nullptr;
+}
+
+TEST(IncrementalProcessing, EmitCUDAGlobalInitFunc) {
+    LLVMContext Context;
+    CompilerInstance compiler;
+
+    compiler.createDiagnostics();
+    compiler.getLangOpts().CPlusPlus = 1;
+    compiler.getLangOpts().CPlusPlus11 = 1;
+    compiler.getLangOpts().CUDA = 1;
+
+    compiler.getTargetOpts().Triple = llvm::Triple::normalize(
+        llvm::sys::getProcessTriple());
+    compiler.setTarget(clang::TargetInfo::CreateTargetInfo(
+      compiler.getDiagnostics(),
+      std::make_shared<clang::TargetOptions>(
+        compiler.getTargetOpts())));
+
+    // To enable generation of CUDA host code, the aux triple needs to be
+    // set up.
+    llvm::Triple hostTriple(llvm::sys::getProcessTriple());
+    compiler.getFrontendOpts().AuxTriple =
+        hostTriple.isArch64Bit() ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda";
+    auto targetOptions = std::make_shared<clang::TargetOptions>();
+    targetOptions->Triple = compiler.getFrontendOpts().AuxTriple;
+    targetOptions->HostTriple = compiler.getTarget().getTriple().str();
+    compiler.setAuxTarget(clang::TargetInfo::CreateTargetInfo(
+        compiler.getDiagnostics(), targetOptions));
+
+    // A fatbinary file is necessary so that the code generator emits the
+    // ctor and dtor.
+    auto tmpFatbinFileOrError = llvm::sys::fs::TempFile::create("dummy.fatbin");
+    ASSERT_TRUE((bool)tmpFatbinFileOrError);
+    auto tmpFatbinFile = std::move(*tmpFatbinFileOrError);
+    compiler.getCodeGenOpts().CudaGpuBinaryFileName = tmpFatbinFile.TmpName;
+
+    compiler.createFileManager();
+    compiler.createSourceManager(compiler.getFileManager());
+    compiler.createPreprocessor(clang::TU_Prefix);
+    compiler.getPreprocessor().enableIncrementalProcessing();
+
+    compiler.createASTContext();
+
+    CodeGenerator* CG =
+        CreateLLVMCodeGen(
+            compiler.getDiagnostics(),
+            "main-module",
+            compiler.getHeaderSearchOpts(),
+            compiler.getPreprocessorOpts(),
+            compiler.getCodeGenOpts(),
+            Context);
+
+    compiler.setASTConsumer(std::unique_ptr<ASTConsumer>(CG));
+    compiler.createSema(clang::TU_Prefix, nullptr);
+    Sema& S = compiler.getSema();
+
+    std::unique_ptr<Parser> ParseOP(new Parser(S.getPreprocessor(), S,
+                                               /*SkipFunctionBodies*/ false));
+    Parser &P = *ParseOP.get();
+
+    std::array<std::unique_ptr<llvm::Module>, 3> M;
+    M[0] = IncrementalParseAST(compiler, P, *CG, nullptr);
+    ASSERT_TRUE(M[0]);
+
+    M[1] = IncrementalParseAST(compiler, P, *CG, CUDATestProgram1);
+    ASSERT_TRUE(M[1]);
+    ASSERT_TRUE(M[1]->getFunction("_Z9cudaFunc1v"));
+
+    M[2] = IncrementalParseAST(compiler, P, *CG, CUDATestProgram2);
+    ASSERT_TRUE(M[2]);
+    ASSERT_TRUE(M[2]->getFunction("_Z9cudaFunc2v"));
+    // The first code should not end up in the second module:
+    ASSERT_FALSE(M[2]->getFunction("_Z9cudaFunc1v"));
+
+    // Make sure that the CUDA ctors and dtors exist:
+    const Function* CUDActor1 = getCUDActor(*M[1]);
+    ASSERT_TRUE(CUDActor1);
+
+    const Function* CUDActor2 = getCUDActor(*M[2]);
+    ASSERT_TRUE(CUDActor2);
+
+    const Function* CUDAdtor1 = getCUDAdtor(*M[1]);
+    ASSERT_TRUE(CUDAdtor1);
+
+    const Function* CUDAdtor2 = getCUDAdtor(*M[2]);
+    ASSERT_TRUE(CUDAdtor2);
+
+    // Compare the names of the ctors and dtors to check that they are
+    // unique.
+    ASSERT_FALSE(CUDActor1->getName() == CUDActor2->getName());
+    ASSERT_FALSE(CUDAdtor1->getName() == CUDAdtor2->getName());
+
+    ASSERT_FALSE((bool)tmpFatbinFile.discard());
+}
+
 } // end anonymous namespace
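
For illustration only, here is a minimal standalone sketch (not part of the patch) of the name sanitization the two CGCUDANV.cpp hunks perform. The helpers isPPNumberBody and makeCtorSuffix are hypothetical stand-ins that mirror clang::isPreprocessingNumberBody and the llvm::sys::path::filename plus replacement loop added above:

// Standalone sketch of the ctor/dtor suffix derivation; standard library only.
#include <cassert>
#include <string>

// True for [a-zA-Z0-9._], the body characters of a C preprocessing number
// (mirrors clang::isPreprocessingNumberBody).
static bool isPPNumberBody(char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9') || c == '.' || c == '_';
}

// Returns "_<sanitized filename>", or "" for an empty module name.
static std::string makeCtorSuffix(const std::string &ModulePath) {
  // Keep only the filename component, like llvm::sys::path::filename.
  std::string Name = ModulePath.substr(ModulePath.find_last_of("/\\") + 1);
  if (Name.empty())
    return std::string();
  std::string Suffix = "_" + Name;
  // Replace everything that's not [a-zA-Z0-9._] with '_'.
  for (char &C : Suffix)
    if (!isPPNumberBody(C))
      C = '_';
  return Suffix;
}

int main() {
  // A hyphen is not a preprocessing-number character, so it becomes '_'.
  assert("__cuda_module_ctor" + makeCtorSuffix("main-module") ==
         "__cuda_module_ctor_main_module");
  // Directory parts are dropped; '.' is kept.
  assert(makeCtorSuffix("/tmp/kernels.cu") == "_kernels.cu");
  // An empty module name leaves the ctor name without any suffix.
  assert(makeCtorSuffix("") == "");
  return 0;
}

The unit test above relies only on the "__cuda_module_ctor_" / "__cuda_module_dtor_" prefixes and on the suffixes differing between the incrementally generated modules, which is why it checks startswith() and compares the two names for inequality rather than expecting specific suffix values.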