Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,12 +15,13 @@
 #include "CGCUDARuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "clang/AST/Decl.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Format.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -45,9 +46,12 @@
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  /// Whether we generate relocatable device code.
+  bool RelocatableDeviceCode;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
+  llvm::FunctionType *getRegisterGlobalsFnTy() const;
 
   /// Creates a function to register all kernel stubs generated in this module.
   llvm::Function *makeRegisterGlobalsFn();
@@ -71,7 +75,23 @@
 
     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                 ConstStr.getPointer(), Zeros);
-  }
+  }
+
+  /// Helper function that generates an empty dummy function returning void.
+  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
+    assert(FnTy->getReturnType()->isVoidTy() &&
+           "Can only generate dummy functions returning void!");
+    llvm::Function *DummyFunc = llvm::Function::Create(
+        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
+
+    llvm::BasicBlock *DummyBlock =
+        llvm::BasicBlock::Create(Context, "", DummyFunc);
+    CGBuilderTy FuncBuilder(CGM, Context);
+    FuncBuilder.SetInsertPoint(DummyBlock);
+    FuncBuilder.CreateRetVoid();
+
+    return DummyFunc;
+  }
 
   void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
 
@@ -93,7 +113,8 @@
 
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
-      TheModule(CGM.getModule()) {
+      TheModule(CGM.getModule()),
+      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
 
@@ -161,6 +182,10 @@
   CGF.EmitBlock(EndBlock);
 }
 
+llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
+  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
+}
+
 /// Creates a function that sets up state on the host side for CUDA objects that
 /// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
@@ -181,8 +206,8 @@
     return nullptr;
 
   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
-      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
+      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
+      "__cuda_register_globals", &TheModule);
   llvm::BasicBlock *EntryBB =
       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
   CGBuilderTy Builder(CGM, Context);
@@ -257,8 +282,29 @@
   if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
     return nullptr;
 
+  llvm::FunctionType *RegisterGlobalsFnTy;
+  llvm::FunctionType *RegisterLinkedBinaryFnTy;
+  llvm::Function *DummyCallback;
+  if (RelocatableDeviceCode) {
+    RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
+
+    auto CallbackFnTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+    DummyCallback = makeDummyFunction(CallbackFnTy);
+
+    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void *, void (*)(void **))
+    llvm::Type *Params[] = {RegisterGlobalsFnTy, VoidPtrTy, VoidPtrTy,
+                            CallbackFnTy};
+    RegisterLinkedBinaryFnTy = llvm::FunctionType::get(VoidTy, Params, false);
+  }
+
   // void __cuda_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
+  // We always need a function to pass in as callback. Create a dummy
+  // implementation if we don't need to register anything.
+  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
+    RegisterGlobalsFunc = makeDummyFunction(RegisterGlobalsFnTy);
+
   // void ** __cudaRegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@@ -291,11 +337,18 @@
       continue;
     }
 
-    const char *FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+    const char *FatbinConstantName;
+    if (RelocatableDeviceCode)
+      // TODO: Figure out how this is called on mac OS!
+      FatbinConstantName = "__nv_relfatbin";
+    else
+      FatbinConstantName =
+          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
     // NVIDIA's cuobjdump looks for fatbins in this section.
     const char *FatbinSectionName =
         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+    // TODO: Figure out how this is called on mac OS!
+    const char *NVModuleIDSectionName = "__nv_module_id";
 
     // Create initialized wrapper structure that points to the loaded GPU binary
     ConstantInitBuilder Builder(CGM);
@@ -315,22 +368,52 @@
         /*constant*/ true);
     FatbinWrapper->setSection(FatbinSectionName);
 
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
-    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
-        RegisterFatbinFunc,
-        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
-    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
-        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
-    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
-                                   CGM.getPointerAlign());
-
-    // Call __cuda_register_globals(GpuBinaryHandle);
-    if (RegisterGlobalsFunc)
-      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
-
-    // Save GpuBinaryHandle so we can unregister it in destructor.
-    GpuBinaryHandles.push_back(GpuBinaryHandle);
+    llvm::Constant *NVModuleIDConstant;
+    SmallString<64> NVModuleID;
+    if (RelocatableDeviceCode) {
+      // Generate a unique module ID.
+      llvm::raw_svector_ostream OS(NVModuleID);
+      OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
+      NVModuleIDConstant =
+          makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
+
+      // Create an alias for the FatbinWrapper that nvcc will look for.
+      llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
+                                Twine("__fatbinwrap") + NVModuleID,
+                                FatbinWrapper);
+    }
+
+    // Register binary with CUDA runtime. This is substantially different in
+    // default mode vs. separate compilation!
+    if (!RelocatableDeviceCode) {
+      // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+          RegisterFatbinFunc,
+          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+      llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
+          TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+          llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+      CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+                                     CGM.getPointerAlign());
+
+      // Call __cuda_register_globals(GpuBinaryHandle);
+      if (RegisterGlobalsFunc)
+        CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
+
+      // Save GpuBinaryHandle so we can unregister it in destructor.
+      GpuBinaryHandles.push_back(GpuBinaryHandle);
+    } else {
+      SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
+      RegisterLinkedBinaryName += NVModuleID;
+      llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
+          RegisterLinkedBinaryFnTy, RegisterLinkedBinaryName);
+
+      assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
+      assert(DummyCallback && "Expecting dummy function for second callback!");
+      llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper,
+                             NVModuleIDConstant, DummyCallback};
+      CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
+    }
   }
 
   CtorBuilder.CreateRetVoid();