diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11686,6 +11686,8 @@
     return IdentifyCUDATarget(dyn_cast<FunctionDecl>(CurContext));
   }
 
+  static bool IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D);
+
   // CUDA function call preference. Must be ordered numerically from
   // worst to best.
   enum CUDAFunctionPreference {
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -211,6 +211,20 @@
   llvm_unreachable("All cases should've been handled by now.");
 }
 
+template <typename AttrT> static bool hasImplicitAttr(const FunctionDecl *D) {
+  if (!D)
+    return false;
+  if (auto *A = D->getAttr<AttrT>())
+    return A->isImplicit();
+  return D->isImplicit();
+}
+
+bool Sema::IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D) {
+  bool IsImplicitDevAttr = hasImplicitAttr<CUDADeviceAttr>(D);
+  bool IsImplicitHostAttr = hasImplicitAttr<CUDAHostAttr>(D);
+  return IsImplicitDevAttr && IsImplicitHostAttr;
+}
+
 void Sema::EraseUnwantedCUDAMatches(
     const FunctionDecl *Caller,
     SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches) {
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -9387,16 +9387,22 @@
   return Comparison::Equal;
 }
 
-static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1,
-                                          const OverloadCandidate &Cand2) {
+static Comparison
+isBetterMultiversionCandidate(const OverloadCandidate &Cand1,
+                              const OverloadCandidate &Cand2) {
   if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function ||
       !Cand2.Function->isMultiVersion())
-    return false;
+    return Comparison::Equal;
 
-  // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this
-  // is obviously better.
-  if (Cand1.Function->isInvalidDecl()) return false;
-  if (Cand2.Function->isInvalidDecl()) return true;
+  // If both are invalid, they are equal. If one of them is invalid, the other
+  // is better.
+  if (Cand1.Function->isInvalidDecl()) {
+    if (Cand2.Function->isInvalidDecl())
+      return Comparison::Equal;
+    return Comparison::Worse;
+  }
+  if (Cand2.Function->isInvalidDecl())
+    return Comparison::Better;
 
   // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer
   // cpu_dispatch, else arbitrarily based on the identifiers.
@@ -9406,16 +9412,18 @@
   const auto *Cand2CPUSpec = Cand2.Function->getAttr<CPUSpecificAttr>();
 
   if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec)
-    return false;
+    return Comparison::Equal;
 
   if (Cand1CPUDisp && !Cand2CPUDisp)
-    return true;
+    return Comparison::Better;
   if (Cand2CPUDisp && !Cand1CPUDisp)
-    return false;
+    return Comparison::Worse;
 
   if (Cand1CPUSpec && Cand2CPUSpec) {
     if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size())
-      return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size();
+      return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size()
+                 ? Comparison::Better
+                 : Comparison::Worse;
 
     std::pair<CPUSpecificAttr::cpus_iterator, CPUSpecificAttr::cpus_iterator>
         FirstDiff = std::mismatch(
@@ -9428,7 +9436,9 @@
     assert(FirstDiff.first != Cand1CPUSpec->cpus_end() &&
            "Two different cpu-specific versions should not have the same "
            "identifier list, otherwise they'd be the same decl!");
-    return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName();
+    return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName()
+               ? Comparison::Better
+               : Comparison::Worse;
   }
   llvm_unreachable("No way to get here unless both had cpu_dispatch");
 }
@@ -9488,6 +9498,66 @@
   else if (!Cand1.Viable)
     return false;
 
+  // [CUDA] A function with 'never' preference is marked not viable, therefore
+  // is never shown up here. The worst preference shown up here is 'wrong side',
+  // e.g. a host function called by a device host function in device
+  // compilation. This is valid AST as long as the host device function is not
+  // emitted, e.g. it is an inline function which is called only by a host
+  // function. A deferred diagnostic will be triggered if it is emitted.
+  // However a wrong-sided function is still a viable candidate here.
+  //
+  // If Cand1 can be emitted and Cand2 cannot be emitted in the current
+  // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2
+  // can be emitted, Cand1 is not better than Cand2. This rule should have
+  // precedence over other rules.
+  //
+  // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then
+  // other rules should be used to determine which is better. This is because
+  // host/device based overloading resolution is mostly for determining
+  // viability of a function. If two functions are both viable, other factors
+  // should take precedence in preference, e.g. the standard-defined preferences
+  // like argument conversion ranks or enable_if partial-ordering. The
+  // preference for pass-object-size parameters is probably most similar to a
+  // type-based-overloading decision and so should take priority.
+  //
+  // If other rules cannot determine which is better, CUDA preference will be
+  // used again to determine which is better.
+  //
+  // TODO: Currently IdentifyCUDAPreference does not return correct values
+  // for functions called in global variable initializers due to missing
+  // correct context about device/host. Therefore we can only enforce this
+  // rule when there is a caller. We should enforce this rule for functions
+  // in global variable initializers once proper context is added.
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
+    if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) {
+      bool IsCallerImplicitHD = Sema::IsCUDAImplicitHostDeviceFunction(Caller);
+      bool IsCand1ImplicitHD =
+          Sema::IsCUDAImplicitHostDeviceFunction(Cand1.Function);
+      bool IsCand2ImplicitHD =
+          Sema::IsCUDAImplicitHostDeviceFunction(Cand2.Function);
+      auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function);
+      auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function);
+      assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never);
+      // The implicit HD function may be a function in a system header which
+      // is forced by pragma. In device compilation, if we prefer HD candidates
+      // over wrong-sided candidates, overloading resolution may change, which
+      // may result in non-deferrable diagnostics. As a workaround, we let
+      // implicit HD candidates take equal preference as wrong-sided candidates.
+      // This will preserve the overloading resolution.
+      auto EmitThreshold =
+          (S.getLangOpts().CUDAIsDevice && IsCallerImplicitHD &&
+           (IsCand1ImplicitHD || IsCand2ImplicitHD))
+              ? Sema::CFP_Never
+              : Sema::CFP_WrongSide;
+      auto Cand1Emittable = P1 > EmitThreshold;
+      auto Cand2Emittable = P2 > EmitThreshold;
+      if (Cand1Emittable && !Cand2Emittable)
+        return true;
+      if (!Cand1Emittable && Cand2Emittable)
+        return false;
+    }
+  }
+
   // C++ [over.match.best]p1:
   //
   //   -- if F is a static member function, ICS1(F) is defined such
@@ -9722,12 +9792,6 @@
       return Cmp == Comparison::Better;
   }
 
-  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
-    FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
-    return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
-           S.IdentifyCUDAPreference(Caller, Cand2.Function);
-  }
-
   bool HasPS1 = Cand1.Function != nullptr &&
                 functionHasPassObjectSizeParams(Cand1.Function);
   bool HasPS2 = Cand2.Function != nullptr &&
@@ -9735,7 +9799,21 @@
   if (HasPS1 != HasPS2 && HasPS1)
     return true;
 
-  return isBetterMultiversionCandidate(Cand1, Cand2);
+  auto MV = isBetterMultiversionCandidate(Cand1, Cand2);
+  if (MV == Comparison::Better)
+    return true;
+  if (MV == Comparison::Worse)
+    return false;
+
+  // If other rules cannot determine which is better, CUDA preference is used
+  // to determine which is better.
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
+    FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
+           S.IdentifyCUDAPreference(Caller, Cand2.Function);
+  }
+
+  return false;
 }
 
 /// Determine whether two declarations are "equivalent" for the purposes of
@@ -9821,33 +9899,6 @@
   std::transform(begin(), end(), std::back_inserter(Candidates),
                  [](OverloadCandidate &Cand) { return &Cand; });
 
-  // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but
-  // are accepted by both clang and NVCC. However, during a particular
-  // compilation mode only one call variant is viable. We need to
-  // exclude non-viable overload candidates from consideration based
-  // only on their host/device attributes. Specifically, if one
-  // candidate call is WrongSide and the other is SameSide, we ignore
-  // the WrongSide candidate.
-  if (S.getLangOpts().CUDA) {
-    const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
-    bool ContainsSameSideCandidate =
-        llvm::any_of(Candidates, [&](OverloadCandidate *Cand) {
-          // Check viable function only.
-          return Cand->Viable && Cand->Function &&
-                 S.IdentifyCUDAPreference(Caller, Cand->Function) ==
-                     Sema::CFP_SameSide;
-        });
-    if (ContainsSameSideCandidate) {
-      auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) {
-        // Check viable function only to avoid unnecessary data copying/moving.
-        return Cand->Viable && Cand->Function &&
-               S.IdentifyCUDAPreference(Caller, Cand->Function) ==
-                   Sema::CFP_WrongSide;
-      };
-      llvm::erase_if(Candidates, IsWrongSideCandidate);
-    }
-  }
-
   // Find the best viable function.
   Best = end();
   for (auto *Cand : Candidates) {
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -1,8 +1,8 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
 
 #include "Inputs/cuda.h"
 
@@ -14,6 +14,13 @@
 struct HostDeviceReturnTy {};
 struct TemplateReturnTy {};
 
+struct CorrectOverloadRetTy{};
+#if __CUDA_ARCH__
+// expected-note@-2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'IncorrectOverloadRetTy' to 'const CorrectOverloadRetTy &' for 1st argument}}
+// expected-note@-3 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'IncorrectOverloadRetTy' to 'CorrectOverloadRetTy &&' for 1st argument}}
+#endif
+struct IncorrectOverloadRetTy{};
+
 typedef HostReturnTy (*HostFnPtr)();
 typedef DeviceReturnTy (*DeviceFnPtr)();
 typedef HostDeviceReturnTy (*HostDeviceFnPtr)();
@@ -331,9 +338,6 @@
 // If we have a mix of HD and H-only or D-only candidates in the overload set,
 // normal C++ overload resolution rules apply first.
 template <typename T> TemplateReturnTy template_vs_hd_function(T arg)
-#ifdef __CUDA_ARCH__
-//expected-note@-2 {{declared here}}
-#endif
 {
   return TemplateReturnTy();
 }
@@ -342,11 +346,13 @@
 }
 
 __host__ __device__ void test_host_device_calls_hd_template() {
-  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
-  TemplateReturnTy ret2 = template_vs_hd_function(1);
 #ifdef __CUDA_ARCH__
-  // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
+  typedef HostDeviceReturnTy ExpectedReturnTy;
+#else
+  typedef TemplateReturnTy ExpectedReturnTy;
 #endif
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  ExpectedReturnTy ret2 = template_vs_hd_function(1);
 }
 
 __host__ void test_host_calls_hd_template() {
@@ -367,14 +373,14 @@
 __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
 __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
 #ifndef __CUDA_ARCH__
-  // expected-note@-3 {{'device_only_function' declared here}}
-  // expected-note@-3 {{'device_only_function' declared here}}
+  // expected-note@-3 2{{'device_only_function' declared here}}
+  // expected-note@-3 2{{'device_only_function' declared here}}
 #endif
 __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
 __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
 #ifdef __CUDA_ARCH__
-  // expected-note@-3 {{'host_only_function' declared here}}
-  // expected-note@-3 {{'host_only_function' declared here}}
+  // expected-note@-3 2{{'host_only_function' declared here}}
+  // expected-note@-3 2{{'host_only_function' declared here}}
 #endif
 
 __host__ __device__ void test_host_device_single_side_overloading() {
@@ -392,6 +398,37 @@
 #endif
 }
 
+// wrong-sided overloading should not cause diagnostic unless it is emitted.
+// This inline function is not emitted.
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+}
+
+// wrong-sided overloading should cause diagnostic if it is emitted.
+// This inline function is emitted since it is called by an emitted function.
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+#ifndef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+#endif
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+#ifdef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+#endif
+}
+
+__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() {
+  test_host_device_wrong_side_overloading_inline_diag();
+  // expected-note@-1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}}
+}
+
 // Verify that we allow overloading function templates.
 template <typename T> __host__ T template_overload(const T &a) { return a; };
 template <typename T> __device__ T template_overload(const T &a) { return a; };
@@ -419,3 +456,136 @@
 int test_constexpr_overload(C2 &x, C2 &y) {
   return constexpr_overload(x, y);
 }
+
+// Verify no ambiguity for new operator.
+void *a = new int;
+__device__ void *b = new int;
+// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+// Verify no ambiguity for new operator.
+template<typename _Tp> _Tp&& f();
+template<typename _Tp, typename = decltype(new _Tp(f<_Tp>()))>
+void __test();
+
+void foo() {
+  __test<int>();
+}
+
+// Test resolving implicit host device candidate vs wrong-sided candidate.
+// In device compilation, implicit host device caller choose implicit host
+// device candidate and wrong-sided candidate with equal preference.
+// Resolution result should not change with/without pragma.
+namespace ImplicitHostDeviceVsWrongSided {
+CorrectOverloadRetTy callee(double x);
+#pragma clang force_cuda_host_device begin
+IncorrectOverloadRetTy callee(int x);
+inline CorrectOverloadRetTy implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving implicit host device candidate vs same-sided candidate.
+// In host compilation, implicit host device caller choose implicit host
+// device candidate and same-sided candidate with equal preference.
+// Resolution result should not change with/without pragma.
+namespace ImplicitHostDeviceVsSameSide {
+IncorrectOverloadRetTy callee(int x);
+#pragma clang force_cuda_host_device begin
+CorrectOverloadRetTy callee(double x);
+inline CorrectOverloadRetTy implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving explicit host device candidate vs. wrong-sided candidate.
+// Explicit host device caller favors host device candidate against wrong-sided
+// candidate.
+namespace ExplicitHostDeviceVsWrongSided {
+CorrectOverloadRetTy callee(double x);
+__host__ __device__ IncorrectOverloadRetTy callee(int x);
+inline __host__ __device__ CorrectOverloadRetTy explicit_hd_caller() {
+  return callee(1.0);
+#if __CUDA_ARCH__
+  // expected-error@-2 {{no viable conversion from returned value of type 'IncorrectOverloadRetTy' to function return type 'CorrectOverloadRetTy'}}
+#endif
+}
+}
+
+// In the implicit host device function 'caller', the second 'callee' should be
+// chosen since it has better match, even though it is an implicit host device
+// function whereas the first 'callee' is a host function. A diagnostic will be
+// emitted if the first 'callee' is chosen since deduced return type cannot be
+// used before it is defined.
+namespace ImplicitHostDeviceByConstExpr {
+template <class a> a b;
+auto callee(...);
+template <class d> constexpr auto callee(d) -> decltype(0);
+struct e {
+  template <class ad, class... f> static auto g(ad, f...) {
+    return h<e, decltype(b<f>)...>;
+  }
+  struct i {
+    template <class, class... f> static constexpr auto caller(f... k) {
+      return callee(k...);
+    }
+  };
+  template <class, class... f> static auto h() {
+    return i::caller<int, f...>;
+  }
+};
+class l {
+  l() {
+    e::g([] {}, this);
+  }
+};
+}
+
+// Implicit HD candidate competes with device candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithD {
+  struct a {
+    __device__ a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// Implicit HD candidate competes with host candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithH {
+  struct a {
+    a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// Implicit HD candidate comptes with HD candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithHD {
+  struct a {
+    __host__ __device__ a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}