Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -242,6 +242,7 @@
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
 LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
+LANGOPT(FixOverloadResolution, 1, 0, "fix host/device related overloading resolution for CUDA/HIP")
 
 LANGOPT(SYCL              , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -712,6 +712,9 @@
 defm gpu_defer_diag : OptInFFlag<"gpu-defer-diag",
   "Defer", "Don't defer", " host/device related diagnostic messages"
   " for CUDA/HIP">;
+defm fix_overload_resolution : OptInFFlag<"fix-overload-resolution",
+  "Fix", "Don't fix", " host/device related overloading resolution"
+  " for CUDA/HIP">;
 def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
   Flags<[CC1Option]>,
   HelpText<"Default max threads per block for kernel launch bounds for HIP">;
Index: clang/include/clang/Sema/Overload.h
===================================================================
--- clang/include/clang/Sema/Overload.h
+++ clang/include/clang/Sema/Overload.h
@@ -1051,6 +1051,9 @@
 
     void destroyCandidates();
 
+    /// Whether diagnostics should be deferred.
+    bool shouldDeferDiags(Sema &S, ArrayRef<Expr *> Args, SourceLocation OpLoc);
+
   public:
     OverloadCandidateSet(SourceLocation Loc, CandidateSetKind CSK,
                          OperatorRewriteInfo RewriteInfo = {})
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -5593,6 +5593,11 @@
     if (Args.hasFlag(options::OPT_fgpu_defer_diag,
                      options::OPT_fno_gpu_defer_diag, false))
       CmdArgs.push_back("-fgpu-defer-diag");
+    if (Args.hasFlag(options::OPT_ffix_overload_resolution,
+                     options::OPT_fno_fix_overload_resolution, false)) {
+      CmdArgs.push_back("-ffix-overload-resolution");
+      CmdArgs.push_back("-fgpu-defer-diag");
+    }
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2684,6 +2684,9 @@
   if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
     Opts.CUDAHostDeviceConstexpr = 0;
 
+  if (Args.hasArg(OPT_ffix_overload_resolution))
+    Opts.FixOverloadResolution = 1;
+
   if (Args.hasArg(OPT_fgpu_defer_diag))
     Opts.GPUDeferDiag = 1;
 
Index: clang/lib/Sema/SemaOverload.cpp
===================================================================
--- clang/lib/Sema/SemaOverload.cpp
+++ clang/lib/Sema/SemaOverload.cpp
@@ -9616,6 +9616,75 @@
   else if (!Cand1.Viable)
     return false;
 
+  // [CUDA] A function with 'never' preference is marked not viable, therefore
+  // is never shown up here. The worst preference shown up here is 'wrong side',
+  // e.g. a host function called by a device host function in device
+  // compilation. This is valid AST as long as the host device function is not
+  // emitted, e.g. it is an inline function which is called only by a host
+  // function. A deferred diagnostic will be triggered if it is emitted.
+  // However a wrong-sided function is still a viable candidate here.
+  //
+  // If Cand1 can be emitted and Cand2 cannot be emitted in the current
+  // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2
+  // can be emitted, Cand1 is not better than Cand2. This rule should have
+  // precedence over other rules.
+  //
+  // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then
+  // other rules should be used to determine which is better. This is because
+  // host/device based overloading resolution is mostly for determining
+  // viability of a function. If two functions are both viable, other factors
+  // should take precedence in preference, e.g. the standard-defined preferences
+  // like argument conversion ranks or enable_if partial-ordering. The
+  // preference for pass-object-size parameters is probably most similar to a
+  // type-based-overloading decision and so should take priority.
+  //
+  // If other rules cannot determine which is better, CUDA preference will be
+  // used again to determine which is better.
+  //
+  // TODO: Currently IdentifyCUDAPreference does not return correct values
+  // for functions called in global variable initializers due to missing
+  // correct context about device/host. Therefore we can only enforce this
+  // rule when there is a caller. We should enforce this rule for functions
+  // in global variable initializers once proper context is added.
+  //
+  // TODO: We can only enable the hostness based overloading resolution when
+  // -ffix-overload-resolution is on since this requires deferring overloading
+  // resolution diagnostics.
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function &&
+      S.getLangOpts().FixOverloadResolution) {
+    if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) {
+      bool IsCallerImplicitHD = Sema::isCUDAImplicitHostDeviceFunction(Caller);
+      bool IsCand1ImplicitHD =
+          Sema::isCUDAImplicitHostDeviceFunction(Cand1.Function);
+      bool IsCand2ImplicitHD =
+          Sema::isCUDAImplicitHostDeviceFunction(Cand2.Function);
+      auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function);
+      auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function);
+      assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never);
+      // The implicit HD function may be a function in a system header which
+      // is forced by pragma. In device compilation, if we prefer HD candidates
+      // over wrong-sided candidates, overloading resolution may change, which
+      // may result in non-deferrable diagnostics. As a workaround, we let
+      // implicit HD candidates take equal preference as wrong-sided candidates.
+      // This will preserve the overloading resolution.
+      // TODO: We still need special handling of implicit HD functions since
+      // they may incur other diagnostics to be deferred. We should make all
+      // host/device related diagnostics deferrable and remove special handling
+      // of implicit HD functions.
+      auto EmitThreshold =
+          (S.getLangOpts().CUDAIsDevice && IsCallerImplicitHD &&
+           (IsCand1ImplicitHD || IsCand2ImplicitHD))
+              ? Sema::CFP_Never
+              : Sema::CFP_WrongSide;
+      auto Cand1Emittable = P1 > EmitThreshold;
+      auto Cand2Emittable = P2 > EmitThreshold;
+      if (Cand1Emittable && !Cand2Emittable)
+        return true;
+      if (!Cand1Emittable && Cand2Emittable)
+        return false;
+    }
+  }
+
   // C++ [over.match.best]p1:
   //
   //   -- if F is a static member function, ICS1(F) is defined such
@@ -9850,12 +9919,6 @@
       return Cmp == Comparison::Better;
   }
 
-  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
-    FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
-    return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
-           S.IdentifyCUDAPreference(Caller, Cand2.Function);
-  }
-
   bool HasPS1 = Cand1.Function != nullptr &&
                 functionHasPassObjectSizeParams(Cand1.Function);
   bool HasPS2 = Cand2.Function != nullptr &&
@@ -9863,8 +9926,21 @@
   if (HasPS1 != HasPS2 && HasPS1)
     return true;
 
-  Comparison MV = isBetterMultiversionCandidate(Cand1, Cand2);
-  return MV == Comparison::Better;
+  auto MV = isBetterMultiversionCandidate(Cand1, Cand2);
+  if (MV == Comparison::Better)
+    return true;
+  if (MV == Comparison::Worse)
+    return false;
+
+  // If other rules cannot determine which is better, CUDA preference is used
+  // to determine which is better.
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
+    FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
+           S.IdentifyCUDAPreference(Caller, Cand2.Function);
+  }
+
+  return false;
 }
 
 /// Determine whether two declarations are "equivalent" for the purposes of
@@ -9957,7 +10033,10 @@
   // only on their host/device attributes. Specifically, if one
   // candidate call is WrongSide and the other is SameSide, we ignore
   // the WrongSide candidate.
-  if (S.getLangOpts().CUDA) {
+  // We only need to remove wrong-sided candidates here if
+  // -ffix-overload-resolution is off. When -ffix-overload-resolution is on,
+  // all candidates are compared uniformly in isBetterOverloadCandidate.
+  if (S.getLangOpts().CUDA && !S.getLangOpts().FixOverloadResolution) {
     const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
     bool ContainsSameSideCandidate =
         llvm::any_of(Candidates, [&](OverloadCandidate *Cand) {
@@ -11620,26 +11699,34 @@
   return Cands;
 }
 
-/// When overload resolution fails, prints diagnostic messages containing the
-/// candidates in the candidate set.
-void OverloadCandidateSet::NoteCandidates(PartialDiagnosticAt PD,
-    Sema &S, OverloadCandidateDisplayKind OCD, ArrayRef<Expr *> Args,
-    StringRef Opc, SourceLocation OpLoc,
-    llvm::function_ref<bool(OverloadCandidate &)> Filter) {
-
+bool OverloadCandidateSet::shouldDeferDiags(Sema &S, ArrayRef<Expr *> Args,
+                                            SourceLocation OpLoc) {
   bool DeferHint = false;
   if (S.getLangOpts().CUDA && S.getLangOpts().GPUDeferDiag) {
-    // Defer diagnostic for CUDA/HIP if there are wrong-sided candidates.
+    // Defer diagnostic for CUDA/HIP if there are wrong-sided candidates or
+    // host device candidates.
     auto WrongSidedCands =
         CompleteCandidates(S, OCD_AllCandidates, Args, OpLoc, [](auto &Cand) {
-          return Cand.Viable == false &&
-                 Cand.FailureKind == ovl_fail_bad_target;
+          return (Cand.Viable == false &&
+                  Cand.FailureKind == ovl_fail_bad_target) ||
+                 (Cand.Function->template hasAttr<CUDAHostAttr>() &&
+                  Cand.Function->template hasAttr<CUDADeviceAttr>());
         });
     DeferHint = WrongSidedCands.size();
   }
+  return DeferHint;
+}
+
+/// When overload resolution fails, prints diagnostic messages containing the
+/// candidates in the candidate set.
+void OverloadCandidateSet::NoteCandidates(
+    PartialDiagnosticAt PD, Sema &S, OverloadCandidateDisplayKind OCD,
+    ArrayRef<Expr *> Args, StringRef Opc, SourceLocation OpLoc,
+    llvm::function_ref<bool(OverloadCandidate &)> Filter) {
+
   auto Cands = CompleteCandidates(S, OCD, Args, OpLoc, Filter);
 
-  S.Diag(PD.first, PD.second, DeferHint);
+  S.Diag(PD.first, PD.second, shouldDeferDiags(S, Args, OpLoc));
 
   NoteCandidates(S, Args, Cands, Opc, OpLoc);
 
@@ -11691,7 +11778,9 @@
   }
 
   if (I != E)
-    S.Diag(OpLoc, diag::note_ovl_too_many_candidates) << int(E - I);
+    S.Diag(OpLoc, diag::note_ovl_too_many_candidates,
+           shouldDeferDiags(S, Args, OpLoc))
+        << int(E - I);
 }
 
 static SourceLocation
Index: clang/test/Driver/hip-options.hip
===================================================================
--- clang/test/Driver/hip-options.hip
+++ clang/test/Driver/hip-options.hip
@@ -35,3 +35,8 @@
 // RUN: %clang -### -nogpuinc -nogpulib -munsafe-fp-atomics \
 // RUN:   --cuda-gpu-arch=gfx906  %s 2>&1 | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
 // UNSAFE-FP-ATOMICS: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-munsafe-fp-atomics"
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib -ffix-overload-resolution \
+// RUN:   --cuda-gpu-arch=gfx906  %s 2>&1 | FileCheck -check-prefix=FIX-OVERLOAD %s
+// FIX-OVERLOAD: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-ffix-overload-resolution" "-fgpu-defer-diag"
+// FIX-OVERLOAD: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-ffix-overload-resolution" "-fgpu-defer-diag"
Index: clang/test/SemaCUDA/deferred-oeverload.cu
===================================================================
--- clang/test/SemaCUDA/deferred-oeverload.cu
+++ clang/test/SemaCUDA/deferred-oeverload.cu
@@ -54,7 +54,7 @@
 // This fails to substitue for A but no diagnostic
 // should be emitted.
 template<typename T, typename T::foo* = nullptr>
-__host__ __device__ void sfinae(T t) { // com-note {{candidate template ignored: substitution failure [with T = B]}}
+__host__ __device__ void sfinae(T t) { // host-note {{candidate template ignored: substitution failure [with T = B]}}
   t.x = 1;
 }
 
@@ -64,13 +64,13 @@
 // file scope.
 
 template<typename T, typename T::isA* = nullptr>
-__host__ __device__ void sfinae(T t) { // com-note {{candidate template ignored: substitution failure [with T = B]}}
+__host__ __device__ void sfinae(T t) { // host-note {{candidate template ignored: substitution failure [with T = B]}}
   t.x = 1;
 }
 
 void test_sfinae() {
   sfinae(A());
-  sfinae(B()); // com-error{{no matching function for call to 'sfinae'}}
+  sfinae(B()); // host-error{{no matching function for call to 'sfinae'}}
 }
 
 // Make sure throw is diagnosed in OpenMP parallel region in host function.
Index: clang/test/SemaCUDA/function-overload.cu
===================================================================
--- clang/test/SemaCUDA/function-overload.cu
+++ clang/test/SemaCUDA/function-overload.cu
@@ -1,8 +1,16 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify=host,expected %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify=dev,expected %s
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only \
+// RUN:   -verify=host,hostdefer,devdefer,expected %s
+// RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only \
+// RUN:   -fcuda-is-device -verify=dev,devnodeferonly,hostdefer,devdefer,expected %s
+// RUN: %clang_cc1 -ffix-overload-resolution -fgpu-defer-diag -DDEFER=1 \
+// RUN:    -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only \
+// RUN:    -verify=host,hostdefer,expected %s
+// RUN: %clang_cc1 -ffix-overload-resolution -fgpu-defer-diag -DDEFER=1 \
+// RUN:    -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device \
+// RUN:    -verify=dev,devdeferonly,devdefer,expected %s
 
 #include "Inputs/cuda.h"
 
@@ -76,37 +84,37 @@
 // Helper functions to verify calling restrictions.
 __device__ DeviceReturnTy d() { return DeviceReturnTy(); }
 // host-note@-1 1+ {{'d' declared here}}
-// expected-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// hostdefer-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
 
 __host__ HostReturnTy h() { return HostReturnTy(); }
 // dev-note@-1 1+ {{'h' declared here}}
-// expected-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// devdefer-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-// expected-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+// devdefer-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
 __global__ void g() {}
 // dev-note@-1 1+ {{'g' declared here}}
-// expected-note@-2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
+// devdefer-note@-2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-// expected-note@-4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
+// devdefer-note@-4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
 
 extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
 // host-note@-1 1+ {{'cd' declared here}}
-// expected-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// hostdefer-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
 
 extern "C" __host__ HostReturnTy ch() { return HostReturnTy(); }
 // dev-note@-1 1+ {{'ch' declared here}}
-// expected-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// devdefer-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
 // expected-note@-3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-// expected-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+// devdefer-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
 __host__ void hostf() {
   DeviceFnPtr fp_d = d;         // host-error {{reference to __device__ function 'd' in __host__ function}}
-  DeviceReturnTy ret_d = d();   // expected-error {{no matching function for call to 'd'}}
+  DeviceReturnTy ret_d = d();   // hostdefer-error {{no matching function for call to 'd'}}
   DeviceFnPtr fp_cd = cd;       // host-error {{reference to __device__ function 'cd' in __host__ function}}
-  DeviceReturnTy ret_cd = cd(); // expected-error {{no matching function for call to 'cd'}}
+  DeviceReturnTy ret_cd = cd(); // hostdefer-error {{no matching function for call to 'cd'}}
 
   HostFnPtr fp_h = h;
   HostReturnTy ret_h = h();
@@ -130,9 +138,9 @@
   DeviceReturnTy ret_cd = cd();
 
   HostFnPtr fp_h = h;         // dev-error {{reference to __host__ function 'h' in __device__ function}}
-  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostReturnTy ret_h = h();   // devdefer-error {{no matching function for call to 'h'}}
   HostFnPtr fp_ch = ch;       // dev-error {{reference to __host__ function 'ch' in __device__ function}}
-  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+  HostReturnTy ret_ch = ch(); // devdefer-error {{no matching function for call to 'ch'}}
 
   DeviceFnPtr fp_dh = dh;
   DeviceReturnTy ret_dh = dh();
@@ -140,7 +148,7 @@
   DeviceReturnTy ret_cdh = cdh();
 
   GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __device__ function}}
-  g(); // expected-error {{no matching function for call to 'g'}}
+  g(); // devdefer-error {{no matching function for call to 'g'}}
   g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __device__ function}}
 }
 
@@ -151,9 +159,9 @@
   DeviceReturnTy ret_cd = cd();
 
   HostFnPtr fp_h = h;         // dev-error {{reference to __host__ function 'h' in __global__ function}}
-  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostReturnTy ret_h = h();   // devdefer-error {{no matching function for call to 'h'}}
   HostFnPtr fp_ch = ch;       // dev-error {{reference to __host__ function 'ch' in __global__ function}}
-  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+  HostReturnTy ret_ch = ch(); // devdefer-error {{no matching function for call to 'ch'}}
 
   DeviceFnPtr fp_dh = dh;
   DeviceReturnTy ret_dh = dh();
@@ -161,7 +169,7 @@
   DeviceReturnTy ret_cdh = cdh();
 
   GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __global__ function}}
-  g(); // expected-error {{no matching function for call to 'g'}}
+  g(); // devdefer-error {{no matching function for call to 'g'}}
   g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __global__ function}}
 }
 
@@ -184,7 +192,7 @@
 #if defined(__CUDA_ARCH__)
   // expected-error@-5 {{reference to __host__ function 'h' in __host__ __device__ function}}
   // expected-error@-5 {{reference to __host__ function 'h' in __host__ __device__ function}}
-  // expected-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
+  // devdefer-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
   // expected-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
 #endif
 
@@ -331,9 +339,7 @@
 // If we have a mix of HD and H-only or D-only candidates in the overload set,
 // normal C++ overload resolution rules apply first.
 template <typename T> TemplateReturnTy template_vs_hd_function(T arg)
-#ifdef __CUDA_ARCH__
-//expected-note@-2 {{declared here}}
-#endif
+// devnodeferonly-note@-1{{'template_vs_hd_function<int>' declared here}}
 {
   return TemplateReturnTy();
 }
@@ -342,11 +348,14 @@
 }
 
 __host__ __device__ void test_host_device_calls_hd_template() {
-  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
-  TemplateReturnTy ret2 = template_vs_hd_function(1);
-#ifdef __CUDA_ARCH__
-  // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
+#if __CUDA_ARCH__ && DEFER
+  typedef HostDeviceReturnTy ExpectedReturnTy;
+#else
+  typedef TemplateReturnTy ExpectedReturnTy;
 #endif
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  ExpectedReturnTy ret2 = template_vs_hd_function(1);
+  // devnodeferonly-error@-1{{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
 }
 
 __host__ void test_host_calls_hd_template() {
@@ -367,14 +376,14 @@
 __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
 __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
 #ifndef __CUDA_ARCH__
-  // expected-note@-3 {{'device_only_function' declared here}}
-  // expected-note@-3 {{'device_only_function' declared here}}
+  // expected-note@-3 2{{'device_only_function' declared here}}
+  // expected-note@-3 2{{'device_only_function' declared here}}
 #endif
 __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
 __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
 #ifdef __CUDA_ARCH__
-  // expected-note@-3 {{'host_only_function' declared here}}
-  // expected-note@-3 {{'host_only_function' declared here}}
+  // expected-note@-3 2{{'host_only_function' declared here}}
+  // expected-note@-3 2{{'host_only_function' declared here}}
 #endif
 
 __host__ __device__ void test_host_device_single_side_overloading() {
@@ -392,6 +401,37 @@
 #endif
 }
 
+// wrong-sided overloading should not cause diagnostic unless it is emitted.
+// This inline function is not emitted.
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+}
+
+// wrong-sided overloading should cause diagnostic if it is emitted.
+// This inline function is emitted since it is called by an emitted function.
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+#ifndef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+#endif
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+#ifdef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+#endif
+}
+
+__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() {
+  test_host_device_wrong_side_overloading_inline_diag();
+  // expected-note@-1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}}
+}
+
 // Verify that we allow overloading function templates.
 template <typename T> __host__ T template_overload(const T &a) { return a; };
 template <typename T> __device__ T template_overload(const T &a) { return a; };
@@ -419,3 +459,255 @@
 int test_constexpr_overload(C2 &x, C2 &y) {
   return constexpr_overload(x, y);
 }
+
+// Verify no ambiguity for new operator.
+void *a = new int;
+__device__ void *b = new int;
+// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+// Verify no ambiguity for new operator.
+template<typename _Tp> _Tp&& f();
+template<typename _Tp, typename = decltype(new _Tp(f<_Tp>()))>
+void __test();
+
+void foo() {
+  __test<int>();
+}
+
+// Test resolving implicit host device candidate vs wrong-sided candidate.
+// In device compilation, implicit host device caller choose implicit host
+// device candidate and wrong-sided candidate with equal preference.
+// Resolution result should not change with/without pragma.
+namespace ImplicitHostDeviceVsWrongSided {
+HostReturnTy callee(double x);
+#pragma clang force_cuda_host_device begin
+HostDeviceReturnTy callee(int x);
+inline HostReturnTy implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving implicit host device candidate vs same-sided candidate.
+// In host compilation, implicit host device caller choose implicit host
+// device candidate and same-sided candidate with equal preference.
+// Resolution result should not change with/without pragma.
+namespace ImplicitHostDeviceVsSameSide {
+HostReturnTy callee(int x);
+#pragma clang force_cuda_host_device begin
+HostDeviceReturnTy callee(double x);
+inline HostDeviceReturnTy implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving explicit host device candidate vs. wrong-sided candidate.
+// When -fgpu-defer-diag is off, wrong-sided candidate is not excluded, therefore
+// the first callee is chosen.
+// When -fgpu-defer-diag is on, wrong-sided candidate is excluded, therefore
+// the second callee is chosen.
+namespace ExplicitHostDeviceVsWrongSided {
+HostReturnTy callee(double x);
+__host__ __device__ HostDeviceReturnTy callee(int x);
+#if __CUDA_ARCH__ && DEFER
+typedef HostDeviceReturnTy ExpectedRetTy;
+#else
+typedef HostReturnTy ExpectedRetTy;
+#endif
+inline __host__ __device__ ExpectedRetTy explicit_hd_caller() {
+  return callee(1.0);
+}
+}
+
+// In the implicit host device function 'caller', the second 'callee' should be
+// chosen since it has better match, even though it is an implicit host device
+// function whereas the first 'callee' is a host function. A diagnostic will be
+// emitted if the first 'callee' is chosen since deduced return type cannot be
+// used before it is defined.
+namespace ImplicitHostDeviceByConstExpr {
+template <class a> a b;
+auto callee(...);
+template <class d> constexpr auto callee(d) -> decltype(0);
+struct e {
+  template <class ad, class... f> static auto g(ad, f...) {
+    return h<e, decltype(b<f>)...>;
+  }
+  struct i {
+    template <class, class... f> static constexpr auto caller(f... k) {
+      return callee(k...);
+    }
+  };
+  template <class, class... f> static auto h() {
+    return i::caller<int, f...>;
+  }
+};
+class l {
+  l() {
+    e::g([] {}, this);
+  }
+};
+}
+
+// Implicit HD candidate competes with device candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithD {
+  struct a {
+    __device__ a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// Implicit HD candidate competes with host candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithH {
+  struct a {
+    a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// Implicit HD candidate competes with HD candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithHD {
+  struct a {
+    __host__ __device__ a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// HD candidate competes with H candidate.
+// HD has type mismatch whereas H has type match.
+// In device compilation, H wins when -fgpu-defer-diag is off and HD wins
+// when -fgpu-defer-diags is on. In both cases the diagnostic should be
+// deferred.
+namespace TestDeferNoMatchingFuncNotEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d);
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>) {
+    ae e;
+    ag(e);
+  }
+  void f() { (void)ag<b::c>; }
+}
+
+namespace TestDeferNoMatchingFuncEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d);
+    // devnodeferonly-note@-1{{'ag<TestDeferNoMatchingFuncEmitted::b::c>' declared here}}
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>) {
+    ae e;
+    ag(e);
+    // devnodeferonly-error@-1{{reference to __host__ function 'ag<TestDeferNoMatchingFuncEmitted::b::c>' in __host__ __device__ function}}
+    // devdeferonly-error@-2{{no matching function for call to 'ag'}}
+    // devdeferonly-note@-3{{called by 'ag<TestDeferNoMatchingFuncEmitted::b::c>'}}
+  }
+  __host__ __device__ void f() { (void)ag<b::c>; }
+  // devnodeferonly-note@-1{{called by 'f'}}
+  // devdeferonly-note@-2{{called by 'f'}}
+}
+
+// Two HD candidates compete with H candidate.
+// HDs have type mismatch whereas H has type match.
+// In device compilation, H wins when -fgpu-defer-diag is off and two HD win
+// when -fgpu-defer-diags is on. In both cases the diagnostic should be
+// deferred.
+namespace TestDeferAmbiguityNotEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d, int);
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, float) {
+    ae e;
+    ag(e, 1);
+  }
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, double) {
+  }
+  void f() {
+    b::c x;
+    ag(x, 1);
+  }
+}
+
+namespace TestDeferAmbiguityEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d, int);
+    // devnodeferonly-note@-1{{'ag<TestDeferAmbiguityEmitted::b::c>' declared here}}
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, float) {
+    // devdeferonly-note@-1{{candidate function [with ae = int]}}
+    ae e;
+    ag(e, 1);
+  }
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, double) {
+    // devdeferonly-note@-1{{candidate function [with ae = int]}}
+  }
+  __host__ __device__ void f() {
+    b::c x;
+    ag(x, 1);
+    // devnodeferonly-error@-1{{reference to __host__ function 'ag<TestDeferAmbiguityEmitted::b::c>' in __host__ __device__ function}}
+    // devdeferonly-error@-2{{call to 'ag' is ambiguous}}
+  }
+}
+
+// Implicit HD functions compute with H function and D function.
+// In host compilation, foo(0.0, 2) should resolve to X::foo<double, int>.
+// In device compilation, foo(0.0, 2) should resolve to foo(double, int).
+// In either case there should be no ambiguity.
+namespace TestImplicitHDWithHAndD {
+  namespace X {
+    inline double foo(double, double) { return 0;}
+    inline constexpr float foo(float, float) { return 1;}
+    inline constexpr long double foo(long double, long double) { return 2;}
+    template<typename _Tp, typename _Up> inline constexpr double foo(_Tp, _Up) { return 3;}
+  };
+  using X::foo;
+  inline __device__ double foo(double, double) { return 4;}
+  inline __device__ float foo(float, int) { return 5;}
+  inline __device__ float foo(int, int) { return 6;}
+  inline __device__ double foo(double, int) { return 7;}
+  inline __device__ float foo(float, float) { return 9;}
+  template<typename _Tp, typename _Up> inline __device__ double foo(_Tp, _Up) { return 10;}
+
+  int g() {
+    return [](){
+    return foo(0.0, 2);
+    }();
+  }
+}