Index: lib/CodeGen/CodeGenModule.h
===================================================================
--- lib/CodeGen/CodeGenModule.h
+++ lib/CodeGen/CodeGenModule.h
@@ -301,6 +301,11 @@
   /// yet.
   std::map<StringRef, GlobalDecl> DeferredDecls;
 
+  /// Contains all GlobalDecls for function definitions that do not
+  /// match current compilation mode. We'll emit these as-needed if
+  /// there's no suitable mode-matching function.
+  std::map<StringRef, GlobalDecl> DeferredCudaDecls;
+
   /// This is a list of deferred decls which we have seen that *are* actually
   /// referenced. These get code generated when the module is done.
   struct DeferredGlobal {
@@ -1179,6 +1184,9 @@
   /// Emit any needed decls for which code generation was deferred.
   void EmitDeferred();
 
+  /// Emit any needed CUDA decls for which code generation was deferred.
+  void EmitDeferredCudaDecls();
+
   /// Call replaceAllUsesWith on all pairs in Replacements.
   void applyReplacements();
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -381,6 +381,8 @@
   if (CoverageMapping)
     CoverageMapping->emit();
   emitLLVMUsed();
+  if (LangOpts.CUDA)
+    EmitDeferredCudaDecls();
 
   if (CodeGenOpts.Autolink &&
       (Context.getLangOpts().Modules || !LinkerOptionsMetadata.empty())) {
@@ -1143,6 +1145,34 @@
       LinkerOptionsMetadata));
 }
 
+void CodeGenModule::EmitDeferredCudaDecls() {
+  assert(DeferredDeclsToEmit.empty() && "Unexpected deferred decls.");
+
+  // Check whether any of deferred CUDA decls are referred to by the
+  // code in current TU, move them to the list of deferred decls to
+  // emit and call EmitDeferred() to emit them. The decls we emit may
+  // create more unresolved references, so we continue the process
+  // until there are no more references we can resolve.
+  bool NeedToEmit;
+  do {
+    NeedToEmit = false;
+    for (auto I = DeferredCudaDecls.begin(), E = DeferredCudaDecls.end();
+         I != E;)
+      if (llvm::GlobalValue *DGV = GetGlobalValue(I->first)) {
+        if (DGV->isDeclaration()) {
+          addDeferredDeclToEmit(DGV, I->second);
+          NeedToEmit = true;
+        }
+        I = DeferredCudaDecls.erase(I);
+        continue;
+      } else
+        ++I;
+
+    if (NeedToEmit)
+      EmitDeferred();
+  } while (NeedToEmit);
+}
+
 void CodeGenModule::EmitDeferred() {
   // Emit code for any potentially referenced deferred decls. Since a
   // previously unused static decl may become used during the generation of code
@@ -1417,18 +1447,28 @@
 
   // If this is CUDA, be selective about which declarations we emit.
   if (LangOpts.CUDA) {
+    bool GlobalMatchesCudaMode = true;
     if (LangOpts.CUDAIsDevice) {
       if (!Global->hasAttr<CUDADeviceAttr>() &&
           !Global->hasAttr<CUDAGlobalAttr>() &&
           !Global->hasAttr<CUDAConstantAttr>() &&
           !Global->hasAttr<CUDASharedAttr>())
-        return;
+        GlobalMatchesCudaMode = false;
     } else {
       if (!Global->hasAttr<CUDAHostAttr>() && (
           Global->hasAttr<CUDADeviceAttr>() ||
           Global->hasAttr<CUDAConstantAttr>() ||
           Global->hasAttr<CUDASharedAttr>()))
-        return;
+        GlobalMatchesCudaMode = false;
+    }
+
+    if (!GlobalMatchesCudaMode) {
+      if (getLangOpts().CUDADisableTargetCallChecks) {
+        const auto *FD = dyn_cast<FunctionDecl>(Global);
+        if (FD && FD->doesThisDeclarationHaveABody())
+          DeferredCudaDecls[getMangledName(GD)] = GD;
+      }
+      return;
     }
   }
Index: test/CodeGenCUDA/cross-call.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/cross-call.cu
@@ -0,0 +1,48 @@
+// Check handling of code generation for calls crossing host/device boundary.
+// Calls crossing host/device boundary must prefer overload variant that
+// matches current compilation mode. Make sure that we do emit intermediary
+// functions, whether they are host or device.
+
+// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -fcuda-disable-target-call-checks -fcuda-target-overloads -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
+//
+// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -emit-llvm \
+// RUN:   -fcuda-disable-target-call-checks -fcuda-target-overloads \
+// RUN:   -fcuda-is-device -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+
+#include "Inputs/cuda.h"
+
+static __device__ int cross_leaf() { return 41; };
+static __host__ int cross_leaf() { return 42; };
+
+// Here we should pick leaf function that matches compilation mode.
+static __host__ int cross_gate() { return cross_leaf(); }
+
+// two levels of host/device cross-calls to verify that we can deal
+// with cross-calls in both directions in all compilation modes.
+static __host__ int cross_hbridge() { return cross_gate(); }
+static __device__ int cross_dbridge() { return cross_gate(); }
+static __host__ int cross_hbridge2() { return cross_dbridge(); }
+static __device__ int cross_dbridge2() { return cross_hbridge(); }
+
+__host__ int cross_host() { return cross_dbridge2(); }
+__device__ int cross_device() { return cross_hbridge2(); }
+
+// Make sure we only emit globals for current compilation mode.
+// CHECK-HOST: define i32 @_Z10cross_hostv
+// CHECK-DEVICE-NOT: define i32 @_Z10cross_hostv
+// CHECK-DEVICE: define i32 @_Z12cross_devicev
+// CHECK-HOST-NOT: define i32 @_Z12cross_devicev
+
+// .. but allow non-matching ones if they are used.
+// CHECK-HOST-DAG: define internal i32 @_ZL13cross_hbridgev()
+// CHECK-HOST-DAG: define internal i32 @_ZL14cross_dbridge2v()
+// CHECK-DEVICE-DAG: define internal i32 @_ZL13cross_dbridgev()
+// CHECK-DEVICE-DAG: define internal i32 @_ZL14cross_hbridge2v()
+
+// .. and that we pick the leaf function that matches compilation mode.
+// CHECK-BOTH: define internal i32 @_ZL10cross_leafv()
+// CHECK-DEVICE: ret i32 41
+// CHECK-HOST: ret i32 42
Index: test/CodeGenCUDA/host-device-calls-host.cu
===================================================================
--- test/CodeGenCUDA/host-device-calls-host.cu
+++ test/CodeGenCUDA/host-device-calls-host.cu
@@ -1,4 +1,15 @@
-// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-allow-host-calls-from-host-device -fcuda-is-device -Wno-cuda-compat -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device \
+// RUN:   -fcuda-allow-host-calls-from-host-device \
+// RUN:   -Wno-cuda-compat -emit-llvm -o - \
+// RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-NORMAL
+
+// Enabling target overloads and disabling target call checks allows
+// cross-calling between host/device. We expect to emit IR for used
+// host functions in this case.
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device \
+// RUN:   -fcuda-target-overloads -fcuda-disable-target-call-checks \
+// RUN:   -Wno-cuda-compat -emit-llvm -o - \
+// RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HDMIX
 
 #include "Inputs/cuda.h"
 
@@ -12,7 +23,8 @@
   host_function();
 }
 
-// CHECK: declare void @host_function
+// CHECK-NORMAL-LABEL: declare void @host_function
+// CHECK-HDMIX-LABEL: define void @host_function
 
 // CHECK-LABEL: define void @hd_function_b
 extern "C"