Index: include/clang/Sema/Sema.h
===================================================================
--- include/clang/Sema/Sema.h
+++ include/clang/Sema/Sema.h
@@ -8792,12 +8792,18 @@
 
   CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D);
 
+  // CUDA function call preference. Must be ordered numerically from
+  // worst to best.
   enum CUDAFunctionPreference {
     CFP_Never,      // Invalid caller/callee combination.
-    CFP_LastResort, // Lowest priority. Only in effect if
+    CFP_WrongSide,  // Calls from host-device to host or device
+                    // function that do not match current compilation
+                    // mode. Only in effect if
                     // LangOpts.CUDADisableTargetCallChecks is true.
-    CFP_Fallback,   // Low priority caller/callee combination
-    CFP_Best,       // Preferred caller/callee combination
+    CFP_SameSide,   // Calls from host-device to host or device
+                    // function matching current compilation mode.
+    CFP_HostDevice, // Any calls to host/device functions.
+    CFP_Native,     // host-to-host or device-to-device calls.
   };
 
   /// Identifies relative preference of a given Caller/Callee
Index: lib/Sema/SemaCUDA.cpp
===================================================================
--- lib/Sema/SemaCUDA.cpp
+++ lib/Sema/SemaCUDA.cpp
@@ -68,26 +68,26 @@
 // Ph - preference in host mode
 // Pd - preference in device mode
 // H  - handled in (x)
-// Preferences: b-best, f-fallback, l-last resort, n-never.
+// Preferences: +:native, *:host-device, o:same side, .:wrong side, -:never.
 //
 // | F  | T  | Ph | Pd |  H  |
 // |----+----+----+----+-----+
-// | d  | d  | b  | b  | (b) |
-// | d  | g  | n  | n  | (a) |
-// | d  | h  | l  | l  | (e) |
-// | d  | hd | f  | f  | (c) |
-// | g  | d  | b  | b  | (b) |
-// | g  | g  | n  | n  | (a) |
-// | g  | h  | l  | l  | (e) |
-// | g  | hd | f  | f  | (c) |
-// | h  | d  | l  | l  | (e) |
-// | h  | g  | b  | b  | (b) |
-// | h  | h  | b  | b  | (b) |
-// | h  | hd | f  | f  | (c) |
-// | hd | d  | l  | f  | (d) |
-// | hd | g  | f  | n  |(d/a)|
-// | hd | h  | f  | l  | (d) |
-// | hd | hd | b  | b  | (b) |
+// | d  | d  | +  | +  | (c) |
+// | d  | g  | -  | -  | (a) |
+// | d  | h  | -  | -  | (e) |
+// | d  | hd | *  | *  | (b) |
+// | g  | d  | +  | +  | (c) |
+// | g  | g  | -  | -  | (a) |
+// | g  | h  | -  | -  | (e) |
+// | g  | hd | *  | *  | (b) |
+// | h  | d  | -  | -  | (e) |
+// | h  | g  | +  | +  | (c) |
+// | h  | h  | +  | +  | (c) |
+// | h  | hd | *  | *  | (b) |
+// | hd | d  | .  | o  | (d) |
+// | hd | g  | o  | -  |(d/a)|
+// | hd | h  | o  | .  | (d) |
+// | hd | hd | *  | *  | (b) |
 
 Sema::CUDAFunctionPreference
 Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
@@ -112,39 +112,38 @@
        (CallerTarget == CFT_HostDevice && getLangOpts().CUDAIsDevice)))
     return CFP_Never;
 
-  // (b) Best case scenarios
+  // (b) Calling HostDevice is OK as a fallback that works for everyone.
+  if (CalleeTarget == CFT_HostDevice)
+    return CFP_HostDevice;
+
+  // (c) Best case scenarios
   if (CalleeTarget == CallerTarget ||
       (CallerTarget == CFT_Host && CalleeTarget == CFT_Global) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
-    return CFP_Best;
-
-  // (c) Calling HostDevice is OK as a fallback that works for everyone.
-  if (CalleeTarget == CFT_HostDevice)
-    return CFP_Fallback;
-
-  // Figure out what should be returned 'last resort' cases. Normally
-  // those would not be allowed, but we'll consider them if
-  // CUDADisableTargetCallChecks is true.
-  CUDAFunctionPreference QuestionableResult =
-      getLangOpts().CUDADisableTargetCallChecks ? CFP_LastResort : CFP_Never;
+    return CFP_Native;
 
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
-    // Calling a function that matches compilation mode is OK.
-    // Calling a function from the other side is frowned upon.
-    if (getLangOpts().CUDAIsDevice)
-      return CalleeTarget == CFT_Device ? CFP_Fallback : QuestionableResult;
-    else
-      return (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)
-                 ? CFP_Fallback
-                 : QuestionableResult;
+    // It's OK to call mode-matching function from HD one.
+    if ((getLangOpts().CUDAIsDevice && CalleeTarget == CFT_Device) ||
+        (!getLangOpts().CUDAIsDevice &&
+         (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global)))
+      return CFP_SameSide;
+
+    // We'll allow calls to non-mode-matching functions if target call
+    // checks are disabled. This is needed to avoid complaining about
+    // HD->H calls when we compile for device side and vice versa.
+    if (getLangOpts().CUDADisableTargetCallChecks)
+      return CFP_WrongSide;
+
+    return CFP_Never;
   }
 
   // (e) Calling across device/host boundary is not something you should do.
   if ((CallerTarget == CFT_Host && CalleeTarget == CFT_Device) ||
       (CallerTarget == CFT_Device && CalleeTarget == CFT_Host) ||
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Host))
-    return QuestionableResult;
+    return CFP_Never;
 
   llvm_unreachable("All cases should've been handled by now.");
 }
Index: lib/Sema/SemaOverload.cpp
===================================================================
--- lib/Sema/SemaOverload.cpp
+++ lib/Sema/SemaOverload.cpp
@@ -8527,6 +8527,27 @@
   else if (!Cand1.Viable)
     return false;
 
+  // [CUDA] If HD function calls a function which has host-only and
+  // device-only variants, nvcc sees only host function during host
+  // compilation and device function only during device-side
+  // compilation. It appears to be a side effect of nvcc's splitting
+  // of host and device code into separate TUs. Alas we need to be
+  // compatible with existing code that relies on this. If we see such
+  // a case, return better variant right away.
+  if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads &&
+      Cand1.Function && Cand2.Function) {
+    const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    const Sema::CUDAFunctionPreference CFP1 =
+        S.IdentifyCUDAPreference(Caller, Cand1.Function);
+    const Sema::CUDAFunctionPreference CFP2 =
+        S.IdentifyCUDAPreference(Caller, Cand2.Function);
+    if (((CFP1 == Sema::CFP_SameSide || CFP1 == Sema::CFP_Native) &&
+         (CFP2 <= Sema::CFP_WrongSide)) ||
+        ((CFP1 <= Sema::CFP_WrongSide) &&
+         (CFP2 == Sema::CFP_SameSide || CFP2 == Sema::CFP_Native)))
+      return CFP1 > CFP2;
+  }
+
   // C++ [over.match.best]p1:
   //
   //   -- if F is a static member function, ICS1(F) is defined such
Index: test/CodeGenCUDA/function-overload.cu
===================================================================
--- test/CodeGenCUDA/function-overload.cu
+++ test/CodeGenCUDA/function-overload.cu
@@ -77,12 +77,90 @@
 extern "C" __host__ __device__ int chd(void) {return 14;}
 // CHECK-BOTH:     ret i32 14
 
+// NOTE: this is an artefact of split-mode CUDA compilation that we
+// need to mimic. HD functions are sometimes allowed to call H or D
+// functions. Due to split compilation mode device-side compilation
+// will not see host-only function and thus they will not be
+// considered at all. For clang both H and D variants will become
+// function overloads. Normally target attribute is considered only if
+// C++ rules can not determine which function is better. However in
+// this case we need to discard functions that would not be present
+// during current compilation phase before we apply normal overload
+// resolution rules.
+
+// Large enough difference in calling preferences should have
+// precedence over standard C++ overloading rules.
+template <typename T> T template_vs_function(T arg) { return 15; }
+__device__ float template_vs_function(float arg) { return 16; }
+
+// In this case during host compilation we expect to cal function
+// template even if __device__ function may be available and allowed
+// by -fcuda-disable-target-call-checks and, according to C++ overload
+// resolution rules, would be prefered over function template.
+// CHECK-BOTH-LABEL: define void @_Z5hd_tfv()
+__host__ __device__ void hd_tf(void) {
+  template_vs_function(1.0f);
+  // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+  template_vs_function(2.0);
+  // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+}
+
+// Calls from __host__ and __device__ functions should always call
+// overloaded function that matches their mode.
+// CHECK-HOST-LABEL: define void @_Z4h_tfv()
+__host__ void h_tf() {
+  template_vs_function(1.0f);
+  // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float
+  template_vs_function(2.0);
+  // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z4d_tfv()
+__device__ void d_tf() {
+  template_vs_function(1.0f);
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+  template_vs_function(2.0);
+  // CHECK-DEVICE: call float @_Z20template_vs_functionf(float
+}
+
+// In case of smaller difference between calling preferences
+// (HD->{HD,H} call), C++ rules take precedence. So, when we need to pick
+// between (host or device) function template and HD function, C++
+// rules will have precedence.
+
+template <typename T> T template_vs_hd_function(T arg) { return 15; }
+__host__ __device__ float template_vs_hd_function(float arg) { return 16; }
+
+// CHECK-BOTH-LABEL: define void @_Z7hd_thdfv()
+__host__ __device__ void hd_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1.0);
+  // CHECK-HOST: call double @_Z23template_vs_hd_functionIdET_S0_(double
+}
+
+// CHECK-HOST-LABEL: define void @_Z6h_thdfv()
+__host__ void h_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1.0);
+  // CHECK-HOST: call double @_Z23template_vs_hd_functionIdET_S0_(double
+}
+
+// CHECK-DEVICE-LABEL: define void @_Z6d_thdfv()
+__device__ void d_thdf() {
+  template_vs_hd_function(1.0f);
+  // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float
+  template_vs_hd_function(1.0);
+  // Host-only function template is not callable with strict call checks,
+  // so for device side HD function will be the only choice.
+  // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float
+}
+
 // CHECK-HOST-LABEL: define void @_Z5hostfv()
 __host__ void hostf(void) {
-#if defined (NOCHECKS)
-  fp_t dp = d;   // CHECK-HOST-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-HOST-NC: store {{.*}} @cd, {{.*}} %cdp,
-#endif
   fp_t hp = h; // CHECK-HOST: store {{.*}} @_Z1hv, {{.*}} %hp,
   fp_t chp = ch; // CHECK-HOST: store {{.*}} @ch, {{.*}} %chp,
   fp_t dhp = dh; // CHECK-HOST: store {{.*}} @_Z2dhv, {{.*}} %dhp,
@@ -91,10 +169,6 @@
   fp_t chdp = chd; // CHECK-HOST: store {{.*}} @chd, {{.*}} %chdp,
   gp_t gp = g; // CHECK-HOST: store {{.*}} @_Z1gv, {{.*}} %gp,
 
-#if defined (NOCHECKS)
-  d();     // CHECK-HOST-NC: call i32 @_Z1dv()
-  cd();    // CHECK-HOST-NC: call i32 @cd()
-#endif
   h();     // CHECK-HOST: call i32 @_Z1hv()
   ch();    // CHECK-HOST: call i32 @ch()
   dh();    // CHECK-HOST: call i32 @_Z2dhv()
@@ -106,10 +180,6 @@
 __device__ void devicef(void) {
   fp_t dp = d;   // CHECK-DEVICE: store {{.*}} @_Z1dv, {{.*}} %dp,
   fp_t cdp = cd; // CHECK-DEVICE: store {{.*}} @cd, {{.*}} %cdp,
-#if defined (NOCHECKS)
-  fp_t hp = h; // CHECK-DEVICE-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-DEVICE-NC: store {{.*}} @ch, {{.*}} %chp,
-#endif
   fp_t dhp = dh; // CHECK-DEVICE: store {{.*}} @_Z2dhv, {{.*}} %dhp,
   fp_t cdhp = cdh; // CHECK-DEVICE: store {{.*}} @cdh, {{.*}} %cdhp,
   fp_t hdp = hd; // CHECK-DEVICE: store {{.*}} @_Z2hdv, {{.*}} %hdp,
@@ -117,10 +187,6 @@
 
   d();     // CHECK-DEVICE: call i32 @_Z1dv()
   cd();    // CHECK-DEVICE: call i32 @cd()
-#if defined (NOCHECKS)
-  h();     // CHECK-DEVICE-NC: call i32 @_Z1hv()
-  ch();    // CHECK-DEVICE-NC: call i32 @ch()
-#endif
   dh();    // CHECK-DEVICE: call i32 @_Z2dhv()
   cdh();   // CHECK-DEVICE: call i32 @cdh()
 }
Index: test/SemaCUDA/function-overload.cu
===================================================================
--- test/SemaCUDA/function-overload.cu
+++ test/SemaCUDA/function-overload.cu
@@ -70,13 +70,11 @@
 
 __host__ void hostf(void) {
   fp_t dp = d;
-  fp_t cdp = cd;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __device__ function 'd' in __host__ function}}
+  // expected-error@-1 {{reference to __device__ function 'd' in __host__ function}}
   // expected-note@65 {{'d' declared here}}
-  // expected-error@-4 {{reference to __device__ function 'cd' in __host__ function}}
+  fp_t cdp = cd;
+  // expected-error@-1 {{reference to __device__ function 'cd' in __host__ function}}
   // expected-note@68 {{'cd' declared here}}
-#endif
   fp_t hp = h;
   fp_t chp = ch;
   fp_t dhp = dh;
@@ -84,13 +82,11 @@
   gp_t gp = g;
 
   d();
-  cd();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'd'}}
+  // expected-error@-1 {{no matching function for call to 'd'}}
   // expected-note@65 {{candidate function not viable: call to __device__ function from __host__ function}}
-  // expected-error@-4 {{no matching function for call to 'cd'}}
+  cd();
+  // expected-error@-1 {{no matching function for call to 'cd'}}
   // expected-note@68 {{candidate function not viable: call to __device__ function from __host__ function}}
-#endif
   h();
   ch();
   dh();
@@ -104,13 +100,11 @@
   fp_t dp = d;
   fp_t cdp = cd;
   fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __device__ function}}
+  // expected-error@-1 {{reference to __host__ function 'h' in __device__ function}}
   // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __device__ function}}
+  fp_t chp = ch;
+  // expected-error@-1 {{reference to __host__ function 'ch' in __device__ function}}
   // expected-note@69 {{'ch' declared here}}
-#endif
   fp_t dhp = dh;
   fp_t cdhp = cdh;
   gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __device__ function}}
@@ -118,14 +112,10 @@
 
   d();
   cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
+  h(); // expected-error {{no matching function for call to 'h'}}
   // expected-note@66 {{candidate function not viable: call to __host__ function from __device__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
+  ch(); // expected-error {{no matching function for call to 'ch'}}
   // expected-note@69 {{candidate function not viable: call to __host__ function from __device__ function}}
-#endif
   dh();
   cdh();
   g(); // expected-error {{no matching function for call to 'g'}}
@@ -138,28 +128,25 @@
   fp_t dp = d;
   fp_t cdp = cd;
   fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __global__ function}}
+  // expected-error@-1 {{reference to __host__ function 'h' in __global__ function}}
   // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __global__ function}}
+  fp_t chp = ch;
+  // expected-error@-1 {{reference to __host__ function 'ch' in __global__ function}}
   // expected-note@69 {{'ch' declared here}}
-#endif
   fp_t dhp = dh;
   fp_t cdhp = cdh;
-  gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __global__ function}}
-               // expected-note@67 {{'g' declared here}}
+  gp_t gp = g; 
+  // expected-error@-1 {{reference to __global__ function 'g' in __global__ function}}
+  // expected-note@67 {{'g' declared here}}
 
   d();
   cd();
   h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
+  // expected-error@-1 {{no matching function for call to 'h'}}
   // expected-note@66 {{candidate function not viable: call to __host__ function from __global__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
+  ch();
+  // expected-error@-1 {{no matching function for call to 'ch'}}
   // expected-note@69 {{candidate function not viable: call to __host__ function from __global__ function}}
-#endif
   dh();
   cdh();
   g(); // expected-error {{no matching function for call to 'g'}}