diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -123,7 +123,8 @@
     return CFT_Device;
   } else if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr)) {
     return CFT_Host;
-  } else if (D->isImplicit() && !IgnoreImplicitHDAttr) {
+  } else if ((D->isImplicit() || !D->isUserProvided()) &&
+             !IgnoreImplicitHDAttr) {
     // Some implicit declarations (like intrinsic functions) are not marked.
     // Set the most lenient target on them for maximal flexibility.
     return CFT_HostDevice;
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1527,9 +1527,24 @@
 bool Sema::isUsualDeallocationFunction(const CXXMethodDecl *Method) {
   // [CUDA] Ignore this function, if we can't call it.
   const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
-  if (getLangOpts().CUDA &&
-      IdentifyCUDAPreference(Caller, Method) <= CFP_WrongSide)
-    return false;
+  if (getLangOpts().CUDA) {
+    auto CallPreference = IdentifyCUDAPreference(Caller, Method);
+    // If it's not callable at all, it's not the right function.
+    if (CallPreference < CFP_WrongSide)
+      return false;
+    if (CallPreference == CFP_WrongSide) {
+      // Maybe. We have to check if there are better alternatives.
+      DeclContext::lookup_result R =
+          Method->getDeclContext()->lookup(Method->getDeclName());
+      for (const auto *D : R) {
+        if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
+          if (IdentifyCUDAPreference(Caller, FD) > CFP_WrongSide)
+            return false;
+        }
+      }
+      // We've found no better variants.
+    }
+  }
 
   SmallVector<const FunctionDecl*, 4> PreventedBy;
   bool Result = Method->isUsualDeallocationFunction(PreventedBy);
diff --git a/clang/test/CodeGenCUDA/usual-deallocators.cu b/clang/test/CodeGenCUDA/usual-deallocators.cu
--- a/clang/test/CodeGenCUDA/usual-deallocators.cu
+++ b/clang/test/CodeGenCUDA/usual-deallocators.cu
@@ -12,6 +12,19 @@
 extern "C" __device__ void dev_fn();
 extern "C" __host__ __device__ void hd_fn();
 
+// Destructors are handled a bit differently, compared to regular functions.
+// Make sure we do trigger kernel generation on the GPU side even if it's only
+// referenced by the destructor.
+template <typename T> __global__ void f(T) {}
+template <typename T> struct A {
+  ~A() { f<<<1, 1>>>(T()); }
+};
+
+// HOST-LABEL: @a
+A<int> a;
+// HOST-LABEL: define linkonce_odr void @_ZN1AIiED1Ev
+// search further down for the device-side checks for @_Z1fIiEvT_
+
 struct H1D1 {
   __host__ void operator delete(void *) { host_fn(); };
   __device__ void operator delete(void *) { dev_fn(); };
@@ -95,6 +108,9 @@
   test_hd<H1H2D1D2>(t);
 }
 
+// Make sure that we've generated the kernel used by A::~A.
+// DEVICE-LABEL: define dso_local void @_Z1fIiEvT_
+
 // Make sure we've picked deallocator for the correct side of compilation.
 
 // COMMON-LABEL: define linkonce_odr void @_ZN4H1D1dlEPv(i8* %0)
@@ -131,3 +147,5 @@
 // COMMON-LABEL: define linkonce_odr void @_ZN8H1H2D1D2dlEPv(i8* %0)
 // DEVICE: call void @dev_fn()
 // HOST: call void @host_fn()
+
+// DEVICE: !0 = !{void (i32)* @_Z1fIiEvT_, !"kernel", i32 1}
diff --git a/clang/test/SemaCUDA/usual-deallocators.cu b/clang/test/SemaCUDA/usual-deallocators.cu
--- a/clang/test/SemaCUDA/usual-deallocators.cu
+++ b/clang/test/SemaCUDA/usual-deallocators.cu
@@ -93,3 +93,12 @@
   test_hd<H1H2D2>(t);
   test_hd<H1H2D1D2>(t);
 }
+
+// This should produce no errors. Defaulted destructor should be treated as HD,
+// which allows referencing host-only `operator delete` with a deferred
+// diagnostic that would fire if we ever attempt to codegen it on device.
+struct H {
+  virtual ~H() = default;
+  static void operator delete(void *) {}
+};
+H h;