Index: include/clang/Sema/Sema.h
===================================================================
--- include/clang/Sema/Sema.h
+++ include/clang/Sema/Sema.h
@@ -8794,10 +8794,14 @@
 
   enum CUDAFunctionPreference {
     CFP_Never,      // Invalid caller/callee combination.
-    CFP_LastResort, // Lowest priority. Only in effect if
+    CFP_WrongSide,  // Calls from host-device to host or device
+                    // function that do not match current compilation
+                    // mode. Only in effect if
                     // LangOpts.CUDADisableTargetCallChecks is true.
-    CFP_Fallback,   // Low priority caller/callee combination
-    CFP_Best,       // Preferred caller/callee combination
+    CFP_SameSide,   // Calls from host-device to host or device
+                    // function matching current compilation mode.
+    CFP_HostDevice, // Any calls to host/device functions.
+    CFP_Native,     // host-to-host or device-to-device calls.
   };
 
   /// Identifies relative preference of a given Caller/Callee
Index: lib/Sema/SemaCUDA.cpp
===================================================================
--- lib/Sema/SemaCUDA.cpp
+++ lib/Sema/SemaCUDA.cpp
@@ -68,26 +68,26 @@
 // Ph - preference in host mode
 // Pd - preference in device mode
 // H  - handled in (x)
-// Preferences: b-best, f-fallback, l-last resort, n-never.
+// Preferences: '+'-native, h-host-device, s-same side, w-wrong side, '-'-never.
 //
 // | F  | T  | Ph | Pd |  H  |
 // |----+----+----+----+-----+
-// | d  | d  | b  | b  | (b) |
-// | d  | g  | n  | n  | (a) |
-// | d  | h  | l  | l  | (e) |
-// | d  | hd | f  | f  | (c) |
-// | g  | d  | b  | b  | (b) |
-// | g  | g  | n  | n  | (a) |
-// | g  | h  | l  | l  | (e) |
-// | g  | hd | f  | f  | (c) |
-// | h  | d  | l  | l  | (e) |
-// | h  | g  | b  | b  | (b) |
-// | h  | h  | b  | b  | (b) |
-// | h  | hd | f  | f  | (c) |
-// | hd | d  | l  | f  | (d) |
-// | hd | g  | f  | n  |(d/a)|
-// | hd | h  | f  | l  | (d) |
-// | hd | hd | b  | b  | (b) |
+// | d  | d  | +  | +  | (c) |
+// | d  | g  | -  | -  | (a) |
+// | d  | h  | -  | -  | (e) |
+// | d  | hd | h  | h  | (b) |
+// | g  | d  | +  | +  | (c) |
+// | g  | g  | -  | -  | (a) |
+// | g  | h  | -  | -  | (e) |
+// | g  | hd | h  | h  | (b) |
+// | h  | d  | -  | -  | (e) |
+// | h  | g  | +  | +  | (c) |
+// | h  | h  | +  | +  | (c) |
+// | h  | hd | h  | h  | (b) |
+// | hd | d  | w  | s  | (d) |
+// | hd | g  | s  | -  |(d/a)|
+// | hd | h  | s  | w  | (d) |
+// | hd | hd | h  | h  | (b) |
 
 Sema::CUDAFunctionPreference
 Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
@@ -112,31 +112,31 @@
        (CallerTarget == CFT_HostDevice && getLangOpts().CUDAIsDevice)))
     return CFP_Never;
 
-  // (b) Best case scenarios
+  // (b) Calling HostDevice is OK as a fallback that works for everyone.
+  if (CalleeTarget == CFT_HostDevice)
+    return CFP_HostDevice;
+
+  // (c) Best case scenarios
   if (CalleeTarget == CallerTarget ||
       (CallerTarget == CFT_Host && CalleeTarget == CFT_Global) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
-    return CFP_Best;
-
-  // (c) Calling HostDevice is OK as a fallback that works for everyone.
-  if (CalleeTarget == CFT_HostDevice)
-    return CFP_Fallback;
+    return CFP_Native;
 
   // Figure out what should be returned 'last resort' cases. Normally
   // those would not be allowed, but we'll consider them if
   // CUDADisableTargetCallChecks is true.
   CUDAFunctionPreference QuestionableResult =
-      getLangOpts().CUDADisableTargetCallChecks ? CFP_LastResort : CFP_Never;
+      getLangOpts().CUDADisableTargetCallChecks ? CFP_WrongSide : CFP_Never;
 
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
     // Calling a function that matches compilation mode is OK.
     // Calling a function from the other side is frowned upon.
     if (getLangOpts().CUDAIsDevice)
-      return CalleeTarget == CFT_Device ? CFP_Fallback : QuestionableResult;
+      return CalleeTarget == CFT_Device ? CFP_SameSide : QuestionableResult;
     else
       return (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)
-                 ? CFP_Fallback
+                 ? CFP_SameSide
                  : QuestionableResult;
   }
 
@@ -144,7 +144,7 @@
   if ((CallerTarget == CFT_Host && CalleeTarget == CFT_Device) ||
       (CallerTarget == CFT_Device && CalleeTarget == CFT_Host) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Host))
-    return QuestionableResult;
+    return CFP_Never;
 
   llvm_unreachable("All cases should've been handled by now.");
 }
Index: lib/Sema/SemaOverload.cpp
===================================================================
--- lib/Sema/SemaOverload.cpp
+++ lib/Sema/SemaOverload.cpp
@@ -8527,6 +8527,27 @@
   else if (!Cand1.Viable)
     return false;
 
+  // [CUDA] If HD function calls a function which has host-only and
+  // device-only variants, nvcc sees only host function during host
+  // compilation and device function only during device-side
+  // compilation. It appears to be a side effect of nvcc's splitting
+  // of host and device code into separate TUs. Alas we need to be
+  // compatible with existing code that relies on this. If we see such
+  // a case, return better variant right away.
+  if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads &&
+      Cand1.Function && Cand2.Function) {
+    const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    const Sema::CUDAFunctionPreference CFP1 =
+        S.IdentifyCUDAPreference(Caller, Cand1.Function);
+    const Sema::CUDAFunctionPreference CFP2 =
+        S.IdentifyCUDAPreference(Caller, Cand2.Function);
+    if (((CFP1 == Sema::CFP_SameSide || CFP1 == Sema::CFP_Native) &&
+         (CFP2 <= Sema::CFP_WrongSide)) ||
+        ((CFP1 <= Sema::CFP_WrongSide) &&
+         (CFP2 == Sema::CFP_SameSide || CFP2 == Sema::CFP_Native)))
+      return CFP1 > CFP2;
+  }
+
   // C++ [over.match.best]p1:
   //
   //   -- if F is a static member function, ICS1(F) is defined such
Index: test/CodeGenCUDA/function-overload.cu
===================================================================
--- test/CodeGenCUDA/function-overload.cu
+++ test/CodeGenCUDA/function-overload.cu
@@ -7,7 +7,8 @@
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \
 // RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
-// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \
+// RUN:       -check-prefix=CHECK-DEVICE-STRICT %s
 
 // Check target overloads handling with disabled call target checks.
 // RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
@@ -77,6 +78,91 @@
 extern "C" __host__ __device__ int chd(void) {return 14;}
 // CHECK-BOTH:     ret i32 14
 
+// NOTE: this is an artefact of split-mode CUDA compilation that we
+// need to mimic. HD functions are sometimes allowed to call H or D
+// functions. Due to split compilation mode device-side compilation
+// will not see host-only function and thus they will not be
+// considered at all. For clang both H and D variants will become
+// function overloads. Normally target attribute is considered only if
+// C++ rules can not determine which function is better. However in
+// this case we need to discard functions that would not be present
+// during current compilation phase before we apply normal overload
+// resolution rules.
+
+// Large enough difference in calling preferences should have
+// precedence over standard C++ overloading rules.
+template <typename T> T template_vs_function(T arg) { return 15; }
+__device__ float template_vs_function(float arg) { return 16; }
+
+// In this case during host compilation we expect to cal function
+// template even if __device__ function may be available and allowed
+// by -fcuda-disable-target-call-checks and, according to C++ overload
+// resolution rules, would be prefered over function template.
+// CHECK-BOTH-LABEL: define void @_Z5hd_tfv()
+__host__ __device__ void hd_tf(void) {
+  template_vs_function(1.0f);
+  // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+  template_vs_function(2.0);
+  // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+}
+
+// Calls from __host__ and __device__ functions should always call
+// overloaded function that matches their mode.
+// CHECK-HOST-LABEL: define void @_Z4h_tfv()
+__host__ void h_tf() {
+  template_vs_function(1.0f);
+  // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float
+  template_vs_function(2.0);
+  // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z4d_tfv()
+__device__ void d_tf() {
+  template_vs_function(1.0f);
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+  template_vs_function(2.0);
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+}
+
+// In case of smaller difference between calling preferences
+// (HD->{HD,H} call), C++ rules take precedence. So, when we need to pick
+// between (host or device) function template and HD function, C++
+// rules will have precedence.
+
+template <typename T> T template_vs_hd_function(T arg) { return 15; }
+__host__ __device__ float template_vs_hd_function(float arg) { return 16; }
+
+// CHECK-BOTH-LABEL: define void @_Z7hd_thdfv()
+__host__ __device__ void hd_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1.0);
+  // CHECK-HOST: call double @_Z23template_vs_hd_functionIdET_S0_(double
+}
+
+// CHECK-HOST-LABEL: define void @_Z6h_thdfv()
+__host__ void h_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1.0);
+  // CHECK-HOST: call double @_Z23template_vs_hd_functionIdET_S0_(double
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z6d_thdfv()
+__device__ void d_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1.0);
+  // Host-only function template is not callable with strict call checks,
+  // so for device side HD function will be the only choice.
+  // CHECK-DEVICE-STRICT: call float @_Z23template_vs_hd_functionf(float
+
+  // With target checks disabled we'll attempt to use host function template.
+  // CHECK-DEVICE-NC: call double @_Z23template_vs_hd_functionIdET_S0_(double
+}
+
 // CHECK-HOST-LABEL: define void @_Z5hostfv()
 __host__ void hostf(void) {
 #if defined (NOCHECKS)