diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11388,9 +11388,14 @@ /// /// Use this rather than examining the function's attributes yourself -- you /// will get it wrong. Returns CFT_Host if D is null. + CUDAFunctionTarget IdentifyCUDATarget(const ParsedAttributesView &Attrs); CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D, bool IgnoreImplicitHDAttr = false); - CUDAFunctionTarget IdentifyCUDATarget(const ParsedAttributesView &Attrs); + CUDAFunctionTarget IdentifyCUDATarget(const VarDecl *D, + bool IgnoreImplicitHDAttr = false); + // This routine is the top level dispatcher to more specific variants above. + CUDAFunctionTarget IdentifyCUDATarget(const Decl *D, + bool IgnoreImplicitHDAttr = false); /// Gets the CUDA target for the current context. CUDAFunctionTarget CurrentCUDATarget() { @@ -11410,24 +11415,53 @@ CFP_Native, // host-to-host or device-to-device calls. }; - /// Identifies relative preference of a given Caller/Callee + /// Identifies relative preference of a given callee and that call context /// combination, based on their host/device attributes. - /// \param Caller function which needs address of \p Callee. - /// nullptr in case of global context. - /// \param Callee target function + /// \param CallContextDecl The context decl which needs address of \p Callee. + /// Null in case of the global context. + /// \param Callee Target function. /// /// \returns preference value for particular Caller/Callee combination. - CUDAFunctionPreference IdentifyCUDAPreference(const FunctionDecl *Caller, + CUDAFunctionPreference IdentifyCUDAPreference(const Decl *CallContextDecl, const FunctionDecl *Callee); + SmallVector<const Decl *, 4> CUDANonLocalVariableStack; + + void pushCUDANonLocalVariable(const Decl *D); + void popCUDANonLocalVariable(const Decl *D); + + const Decl *getCUDACurrentNonLocalVariable() const { + return CUDANonLocalVariableStack.empty() ? 
nullptr
 + : CUDANonLocalVariableStack.back(); + } + + const Decl *getCUDAContextDecl() const { + // Skip where no real call could be performed. + // TODO: There are cases where proper checking is required, such as the + // default member initializer. + if (!CurContext->isFileContext() && !CurContext->isFunctionOrMethod()) + return nullptr; + const Decl *ContextDecl = dyn_cast<FunctionDecl>(CurContext); + if (!ContextDecl) + ContextDecl = getCUDACurrentNonLocalVariable(); + return ContextDecl; + } + /// Determines whether Caller may invoke Callee, based on their CUDA /// host/device attributes. Returns false if the call is not allowed. /// /// Note: Will return true for CFP_WrongSide calls. These may appear in /// semantically correct CUDA programs, but only if they're never codegen'ed. - bool IsAllowedCUDACall(const FunctionDecl *Caller, - const FunctionDecl *Callee) { - return IdentifyCUDAPreference(Caller, Callee) != CFP_Never; + enum SkipCallerKind_t { SkipNoneCaller, SkipImplicitCaller }; + bool isCUDACallAllowed(const FunctionDecl *Callee, + SkipCallerKind_t Kind = SkipNoneCaller) { + // Skip contexts where no real call could be performed. + if (!CurContext->isFileContext() && !CurContext->isFunctionOrMethod()) + return true; + if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext)) + if (Kind == SkipImplicitCaller && Caller->isImplicit()) + return true; + return IdentifyCUDAPreference(getCUDAContextDecl(), Callee) != CFP_Never; } /// May add implicit CUDAHostAttr and CUDADeviceAttr attributes to FD, @@ -11461,10 +11495,9 @@ void CUDASetLambdaAttrs(CXXMethodDecl *Method); /// Finds a function in \p Matches with highest calling priority - /// from \p Caller context and erases all functions with lower + /// from the current context and erases all functions with lower /// calling priority. 
void EraseUnwantedCUDAMatches( - const FunctionDecl *Caller, SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches); /// Given a implicit special member, infer its CUDA target from the diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2342,6 +2342,8 @@ } } + Actions.pushCUDANonLocalVariable(ThisDecl); + // Parse declarator '=' initializer. // If a '==' or '+=' is found, suggest a fixit to '='. if (isTokenEqualOrEqualTypo()) { @@ -2474,6 +2476,8 @@ Actions.ActOnUninitializedDecl(ThisDecl); } + Actions.popCUDANonLocalVariable(ThisDecl); + Actions.FinalizeDeclaration(ThisDecl); return ThisDecl; diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -95,7 +95,7 @@ } template <typename A> -static bool hasAttr(const FunctionDecl *D, bool IgnoreImplicitAttr) { +static bool hasAttr(const Decl *D, bool IgnoreImplicitAttr) { return D->hasAttrs() && llvm::any_of(D->getAttrs(), [&](Attr *Attribute) { return isa<A>(Attribute) && !(IgnoreImplicitAttr && Attribute->isImplicit()); @@ -130,6 +130,41 @@ return CFT_Host; } +Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const VarDecl *D, + bool IgnoreImplicitHDAttr) { + if (D == nullptr) + return CFT_Host; + + assert(D->hasGlobalStorage() && "Only non-local variable needs identifying."); + + if (D->hasAttr<CUDAInvalidTargetAttr>()) + return CFT_InvalidTarget; + + if (hasAttr<HIPPinnedShadowAttr>(D, IgnoreImplicitHDAttr)) + return CFT_Host; + + if (hasAttr<CUDAConstantAttr>(D, IgnoreImplicitHDAttr) || + hasAttr<CUDADeviceAttr>(D, IgnoreImplicitHDAttr) || + hasAttr<CUDASharedAttr>(D, IgnoreImplicitHDAttr)) + return CFT_Device; + + return CFT_Host; +} + +Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const Decl *D, + bool IgnoreImplicitHDAttr) { + if (D == nullptr) + return CFT_Host; + + if (auto FD = dyn_cast<FunctionDecl>(D)) + return IdentifyCUDATarget(FD, IgnoreImplicitHDAttr); + + if (auto VD = dyn_cast<VarDecl>(D)) + return IdentifyCUDATarget(VD, IgnoreImplicitHDAttr); + + 
llvm_unreachable("Unexpected decl for CUDA target identification."); +} + // * CUDA Call preference table // // F - from, @@ -159,10 +194,10 @@ // | hd | hd | HD | HD | (b) | Sema::CUDAFunctionPreference -Sema::IdentifyCUDAPreference(const FunctionDecl *Caller, +Sema::IdentifyCUDAPreference(const Decl *ContextDecl, const FunctionDecl *Callee) { assert(Callee && "Callee must be valid."); - CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller); + CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(ContextDecl); CUDAFunctionTarget CalleeTarget = IdentifyCUDATarget(Callee); // If one of the targets is invalid, the check always fails, no matter what @@ -211,16 +246,17 @@ } void Sema::EraseUnwantedCUDAMatches( - const FunctionDecl *Caller, SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches) { if (Matches.size() <= 1) return; using Pair = std::pair<DeclAccessPair, FunctionDecl *>; - // Gets the CUDA function preference for a call from Caller to Match. + const Decl *ContextDecl = getCUDAContextDecl(); + + // Gets the CUDA function preference for a call from call context to Match. auto GetCFP = [&](const Pair &Match) { - return IdentifyCUDAPreference(Caller, Match.second); + return IdentifyCUDAPreference(ContextDecl, Match.second); }; // Find the best call preference among the functions in Matches. diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -16691,6 +16691,20 @@ return false; } +void Sema::pushCUDANonLocalVariable(const Decl *D) { + if (!D || D->isInvalidDecl() || !isNonlocalVariable(D)) + return; + CUDANonLocalVariableStack.push_back(D); +} + +void Sema::popCUDANonLocalVariable(const Decl *D) { + if (!D || D->isInvalidDecl() || !isNonlocalVariable(D)) + return; + assert(!CUDANonLocalVariableStack.empty() && + CUDANonLocalVariableStack.back() == D); + CUDANonLocalVariableStack.pop_back(); +} + /// Invoked when we are about to parse an initializer for the declaration /// 'Dcl'. 
/// diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1479,9 +1479,9 @@ bool Sema::isUsualDeallocationFunction(const CXXMethodDecl *Method) { // [CUDA] Ignore this function, if we can't call it. - const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext); + const Decl *ContextDecl = getCUDAContextDecl(); if (getLangOpts().CUDA && - IdentifyCUDAPreference(Caller, Method) <= CFP_WrongSide) + IdentifyCUDAPreference(ContextDecl, Method) <= CFP_WrongSide) return false; SmallVector<const FunctionDecl *, 4> PreventedBy; @@ -1495,7 +1495,7 @@ return llvm::none_of(PreventedBy, [&](const FunctionDecl *FD) { assert(FD->getNumParams() == 1 && "Only single-operand functions should be in PreventedBy"); - return IdentifyCUDAPreference(Caller, FD) >= CFP_HostDevice; + return IdentifyCUDAPreference(ContextDecl, FD) >= CFP_HostDevice; }); } @@ -1558,8 +1558,7 @@ // In CUDA, determine how much we'd like / dislike to call this. if (S.getLangOpts().CUDA) - if (auto *Caller = dyn_cast<FunctionDecl>(S.CurContext)) - CUDAPref = S.IdentifyCUDAPreference(Caller, FD); + CUDAPref = S.IdentifyCUDAPreference(S.getCUDAContextDecl(), FD); } explicit operator bool() const { return FD; } @@ -2600,7 +2599,7 @@ } if (getLangOpts().CUDA) - EraseUnwantedCUDAMatches(dyn_cast<FunctionDecl>(CurContext), Matches); + EraseUnwantedCUDAMatches(Matches); } else { // C++1y [expr.new]p22: // For a non-placement allocation function, the normal deallocation diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -6300,17 +6300,12 @@ } // (CUDA B.1): Check for invalid calls between targets. 
- if (getLangOpts().CUDA - if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext)) - // Skip the check for callers that are implicit members, because in this - // case we may not yet know what the member's target is; the target is - // inferred for the member automatically, based on the bases and fields of - // the class. - if (!Caller->isImplicit() && !IsAllowedCUDACall(Caller, Function)) { - Candidate.Viable = false; - Candidate.FailureKind = ovl_fail_bad_target; - return; - } + if (getLangOpts().CUDA && + !isCUDACallAllowed(Function, Sema::SkipImplicitCaller)) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_bad_target; + return; + } if (Function->getTrailingRequiresClause()) { ConstraintSatisfaction Satisfaction; @@ -6821,13 +6816,11 @@ } // (CUDA B.1): Check for invalid calls between targets. - if (getLangOpts().CUDA) - if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext)) - if (!IsAllowedCUDACall(Caller, Method)) { - Candidate.Viable = false; - Candidate.FailureKind = ovl_fail_bad_target; - return; - } + if (getLangOpts().CUDA && !isCUDACallAllowed(Method)) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_bad_target; + return; + } if (Method->getTrailingRequiresClause()) { ConstraintSatisfaction Satisfaction; @@ -9665,9 +9658,9 @@ } if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { - FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext); - return S.IdentifyCUDAPreference(Caller, Cand1.Function) > - S.IdentifyCUDAPreference(Caller, Cand2.Function); + const Decl *ContextDecl = S.getCUDAContextDecl(); + return S.IdentifyCUDAPreference(ContextDecl, Cand1.Function) > + S.IdentifyCUDAPreference(ContextDecl, Cand2.Function); } bool HasPS1 = Cand1.Function != nullptr && @@ -9771,19 +9764,19 @@ // candidate call is WrongSide and the other is SameSide, we ignore // the WrongSide candidate. 
if (S.getLangOpts().CUDA) { - const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext); + const Decl *ContextDecl = S.getCUDAContextDecl(); bool ContainsSameSideCandidate = llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { // Check viable function only. return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == + S.IdentifyCUDAPreference(ContextDecl, Cand->Function) == Sema::CFP_SameSide; }); if (ContainsSameSideCandidate) { auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { // Check viable function only to avoid unnecessary data copying/moving. return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == + S.IdentifyCUDAPreference(ContextDecl, Cand->Function) == Sema::CFP_WrongSide; }; llvm::erase_if(Candidates, IsWrongSideCandidate); @@ -10770,10 +10763,10 @@ /// CUDA: diagnose an invalid call across targets. static void DiagnoseBadTarget(Sema &S, OverloadCandidate *Cand) { - FunctionDecl *Caller = cast<FunctionDecl>(S.CurContext); + const Decl *ContextDecl = S.getCUDAContextDecl(); FunctionDecl *Callee = Cand->Function; - Sema::CUDAFunctionTarget CallerTarget = S.IdentifyCUDATarget(Caller), + Sema::CUDAFunctionTarget CallerTarget = S.IdentifyCUDATarget(ContextDecl), CalleeTarget = S.IdentifyCUDATarget(Callee); std::string FnDesc; @@ -11809,10 +11802,9 @@ return false; if (FunctionDecl *FunDecl = dyn_cast<FunctionDecl>(Fn)) { - if (S.getLangOpts().CUDA) - if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) - if (!Caller->isImplicit() && !S.IsAllowedCUDACall(Caller, FunDecl)) - return false; + if (S.getLangOpts().CUDA && + !S.isCUDACallAllowed(FunDecl, Sema::SkipImplicitCaller)) + return false; if (FunDecl->isMultiVersion()) { const auto *TA = FunDecl->getAttr<TargetAttr>(); if (TA && !TA->isDefaultVersion()) @@ -11927,7 +11919,7 @@ } void EliminateSuboptimalCudaMatches() { - S.EraseUnwantedCUDAMatches(dyn_cast<FunctionDecl>(S.CurContext), Matches); + S.EraseUnwantedCUDAMatches(Matches); } public: diff --git 
a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu --- a/clang/test/SemaCUDA/function-overload.cu +++ b/clang/test/SemaCUDA/function-overload.cu @@ -214,8 +214,10 @@ // Test for address of overloaded function resolution in the global context. HostFnPtr fp_h = h; HostFnPtr fp_ch = ch; +#if !defined(__CUDA_ARCH__) CurrentFnPtr fp_dh = dh; CurrentFnPtr fp_cdh = cdh; +#endif GlobalFnPtr fp_g = g; @@ -419,3 +421,28 @@ int test_constexpr_overload(C2 &x, C2 &y) { return constexpr_overload(x, y); } + +__device__ float fn(int); +__host__ float fn(float); + +// Overload resolution in the global initialization should follow the same rule +// as the one in other places. That is, we prefer a callable function over a +// non-callable function with a better signature match. In this test case, even +// though the device function has exactly matching with the integer argument, +// it can't be executed. +float gvar1 = fn(1); + +__device__ float dev_only_fn(int); +// expected-note@-1 {{candidate function not viable: call to __device__ function from __host__ function}} + +float gvar2 = dev_only_fn(1); // expected-error {{no matching function for call to 'dev_only_fn'}} + +#ifdef __CUDA_ARCH__ +__device__ DeviceReturnTy gvar3 = template_vs_function(1.f); +// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}} +__device__ int gvar4 = template_overload(1); +// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}} +#else +TemplateReturnTy gvar3 = template_vs_function(2.f); +int gvar4 = template_overload(1); +#endif diff --git a/clang/test/SemaCUDA/global-initializers-host.cu b/clang/test/SemaCUDA/global-initializers-host.cu --- a/clang/test/SemaCUDA/global-initializers-host.cu +++ b/clang/test/SemaCUDA/global-initializers-host.cu @@ -6,12 +6,14 @@ // module initializer. 
struct S { + // expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} + // expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} __device__ S() {} - // expected-note@-1 {{'S' declared here}} + // expected-note@-1 {{candidate constructor not viable: call to __device__ function from __host__ function}} }; S s; -// expected-error@-1 {{reference to __device__ function 'S' in global initializer}} +// expected-error@-1 {{no matching constructor for initialization of 'S'}} struct T { __host__ __device__ T() {} @@ -19,14 +21,17 @@ T t; // No error, this is OK. struct U { + // expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const U' for 1st argument}} + // expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'U' for 1st argument}} __host__ U() {} + // expected-note@-1 {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} __device__ U(int) {} - // expected-note@-1 {{'U' declared here}} + // expected-note@-1 {{candidate constructor not viable: call to __device__ function from __host__ function}} }; U u(42); -// expected-error@-1 {{reference to __device__ function 'U' in global initializer}} +// expected-error@-1 {{no matching constructor for initialization of 'U'}} __device__ int device_fn() { return 42; } -// expected-note@-1 {{'device_fn' declared here}} +// expected-note@-1 {{candidate function not viable: call to __device__ function from __host__ function}} int n = device_fn(); -// expected-error@-1 {{reference to __device__ function 'device_fn' in global initializer}} +// expected-error@-1 {{no matching function for call to 'device_fn'}} diff --git a/clang/test/SemaCUDA/hip-pinned-shadow.cu b/clang/test/SemaCUDA/hip-pinned-shadow.cu --- 
a/clang/test/SemaCUDA/hip-pinned-shadow.cu +++ b/clang/test/SemaCUDA/hip-pinned-shadow.cu @@ -13,13 +13,19 @@ template <class T, int texType, int hipTextureReadMode> struct texture : public textureReference { +// expected-note@-1{{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} +// expected-note@-2{{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} +// expected-note@-3{{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} +// expected-note@-4{{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} texture() { a = 1; } +// expected-note@-1{{candidate constructor not viable: call to __host__ function from __device__ function}} +// expected-note@-2{{candidate constructor not viable: call to __host__ function from __device__ function}} }; __hip_pinned_shadow__ texture<float, 2, 1> tex; __device__ __hip_pinned_shadow__ texture<float, 2, 1> tex2; // expected-error{{'hip_pinned_shadow' and 'device' attributes are not compatible}} - // expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables}} - // expected-note@-2{{conflicting attribute is here}} + // expected-note@-1{{conflicting attribute is here}} + // expected-error@-2{{no matching constructor for initialization of 'texture<float, 2, 1>'}} __constant__ __hip_pinned_shadow__ texture<float, 2, 1> tex3; // expected-error{{'hip_pinned_shadow' and 'constant' attributes are not compatible}} - // expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables}} - // expected-note@-2{{conflicting attribute is here}} + // expected-note@-1{{conflicting attribute is here}} + // expected-error@-2{{no matching constructor for initialization of 'texture<float, 2, 1>'}}