Index: clang/include/clang/Sema/Sema.h
===================================================================
--- clang/include/clang/Sema/Sema.h
+++ clang/include/clang/Sema/Sema.h
@@ -11667,6 +11667,8 @@
     return IdentifyCUDATarget(dyn_cast<FunctionDecl>(CurContext));
   }
 
+  bool IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D);
+
   // CUDA function call preference. Must be ordered numerically from
   // worst to best.
   enum CUDAFunctionPreference {
Index: clang/lib/Sema/SemaCUDA.cpp
===================================================================
--- clang/lib/Sema/SemaCUDA.cpp
+++ clang/lib/Sema/SemaCUDA.cpp
@@ -211,6 +211,21 @@
   llvm_unreachable("All cases should've been handled by now.");
 }
 
+template <typename AttrT> static bool hasImplicitAttr(const FunctionDecl *D) {
+  if (!D)
+    return false;
+  if (auto *A = D->getAttr<AttrT>())
+    if (A->isImplicit())
+      return true;
+  return D->isImplicit();
+}
+
+bool Sema::IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D) {
+  bool IsImplicitDevAttr = hasImplicitAttr<CUDADeviceAttr>(D);
+  bool IsImplicitHostAttr = hasImplicitAttr<CUDAHostAttr>(D);
+  return IsImplicitDevAttr && IsImplicitHostAttr;
+}
+
 void Sema::EraseUnwantedCUDAMatches(
     const FunctionDecl *Caller,
     SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches) {
Index: clang/lib/Sema/SemaOverload.cpp
===================================================================
--- clang/lib/Sema/SemaOverload.cpp
+++ clang/lib/Sema/SemaOverload.cpp
@@ -9517,11 +9517,27 @@
   // in global variable initializers once proper context is added.
   if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
     if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) {
+      bool IsCallerImplicitHD = S.IsCUDAImplicitHostDeviceFunction(Caller);
+      bool IsCand1ImplicitHD =
+          S.IsCUDAImplicitHostDeviceFunction(Cand1.Function);
+      bool IsCand2ImplicitHD =
+          S.IsCUDAImplicitHostDeviceFunction(Cand2.Function);
       auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function);
       auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function);
       assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never);
-      auto Cand1Emittable = P1 > Sema::CFP_WrongSide;
-      auto Cand2Emittable = P2 > Sema::CFP_WrongSide;
+      // The implicit HD function may be a function in a system header which
+      // is forced by pragma. In device compilation, if we prefer HD candidates
+      // over wrong-sided candidates, overloading resolution may change, which
+      // may result in non-deferrable diagnostics. As a workaround, we let
+      // implicit HD candidates take equal preference as wrong-sided candidates.
+      // This will preserve the overloading resolution.
+      auto EmitThreshold =
+          (S.getLangOpts().CUDAIsDevice && IsCallerImplicitHD &&
+           (IsCand1ImplicitHD || IsCand2ImplicitHD))
+              ? Sema::CFP_HostDevice
+              : Sema::CFP_WrongSide;
+      auto Cand1Emittable = P1 > EmitThreshold;
+      auto Cand2Emittable = P2 > EmitThreshold;
       if (Cand1Emittable && !Cand2Emittable)
         return true;
       if (!Cand1Emittable && Cand2Emittable)
Index: clang/test/SemaCUDA/function-overload.cu
===================================================================
--- clang/test/SemaCUDA/function-overload.cu
+++ clang/test/SemaCUDA/function-overload.cu
@@ -1,8 +1,8 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
-// RUN: %clang_cc1 -std=c++11 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
 
 #include "Inputs/cuda.h"
 
@@ -463,3 +463,72 @@
 void foo() {
   __test<int>();
 }
+
+// Test resolving implicit host device candidate vs wrong-sided candidate.
+// In device compilation, implicit host device caller choose implicit host
+// device candidate and wrong-sided candidate with equal preference.
+namespace ImplicitHostDeviceVsWrongSided {
+inline double callee(double x);
+#pragma clang force_cuda_host_device begin
+inline void callee(int x);
+inline double implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving implicit host device candidate vs wrong-sided candidate.
+// In host compilation, implicit host device caller choose implicit host
+// device candidate and same-sided candidate with equal preference.
+namespace ImplicitHostDeviceVsSameSide {
+inline void callee(int x);
+#pragma clang force_cuda_host_device begin
+inline double callee(double x);
+inline double implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// In the implicit host device function 'caller', the second 'callee' should be
+// since it has better match, even though it is an implicit host device function
+// whereas the first 'callee' is a host function. A diagnostic will be emitted
+// if the first 'callee' is chosen since deduced return type cannot be used
+// before it is defined.
+namespace ImplicitHostDeviceByConstExpr {
+template <class a> a b;
+auto callee(...);
+template <class d> constexpr auto callee(d) -> decltype(0);
+struct e {
+  template <class ad, class... f> static auto g(ad, f...) {
+    return h<e, decltype(b<f>)...>;
+  }
+  struct i {
+    template <class, class... f> static constexpr auto caller(f... k) {
+      return callee(k...);
+    }
+  };
+  template <class, class... f> static auto h() {
+    return i::caller<int, f...>;
+  }
+};
+class l {
+  l() {
+    e::g([] {}, this);
+  }
+};
+}
+
+// Test resolving explicit host device candidate vs. wrong-sided candidate.
+// Explicit host device caller favors host device candidate against wrong-sided
+// candidate.
+namespace ExplicitHostDeviceVsWrongSided {
+inline double callee(double x);
+inline __host__ __device__ void callee(int x);
+inline __host__ __device__ double explicit_hd_caller() {
+  return callee(1.0);
+#if __CUDA_ARCH__
+  // expected-error@-2 {{cannot initialize return object of type 'double' with an rvalue of type 'void'}}
+#endif
+}
+}