diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1138,6 +1138,9 @@
 // Warning about unknown CUDA SDK version.
 def CudaUnknownVersion: DiagGroup<"unknown-cuda-version">;
 
+// Warning about a potential bad target reference.
+def CudaBadTargetRef: DiagGroup<"cuda-bad-target-ref">;
+
 // A warning group for warnings about features supported by HIP but
 // ignored by CUDA.
 def HIPOnly : DiagGroup<"hip-only">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7944,10 +7944,16 @@
   "call to global function %0 not configured">;
 def err_ref_bad_target : Error<
   "reference to %select{__device__|__global__|__host__|__host__ __device__}0 "
-  "function %1 in %select{__device__|__global__|__host__|__host__ __device__}2 function">;
+  "%select{function|variable}1 %2 in "
+  "%select{__device__|__global__|__host__|__host__ __device__}3 function">;
 def err_ref_bad_target_global_initializer : Error<
   "reference to %select{__device__|__global__|__host__|__host__ __device__}0 "
   "function %1 in global initializer">;
+def warn_ref_bad_target_default_argument : Warning<
+  "reference to %select{__device__|__global__|__host__|__host__ __device__}0 "
+  "variable %1 as default argument in "
+  "%select{__device__|__global__|__host__|__host__ __device__}2 function">,
+  InGroup<CudaBadTargetRef>;
 def warn_kern_is_method : Extension<
   "kernel function %0 is a member function; this may not be accepted by nvcc">,
   InGroup<CudaCompat>;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11655,9 +11655,14 @@
   ///
   /// Use this rather than examining the function's attributes yourself -- you
   /// will get it wrong.  Returns CFT_Host if D is null.
-  CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *D,
-                                        bool IgnoreImplicitHDAttr = false);
   CUDAFunctionTarget IdentifyCUDATarget(const ParsedAttributesView &Attrs);
+  CUDAFunctionTarget IdentifyCUDATarget(const FunctionDecl *FD,
+                                        bool IgnoreImplicitHDAttr = false);
+  CUDAFunctionTarget IdentifyCUDATarget(const VarDecl *VD,
+                                        bool IgnoreImplicitHDAttr = false);
+  // This routine is the top level dispatcher to more specific variants above.
+  CUDAFunctionTarget IdentifyCUDATarget(const Decl *D,
+                                        bool IgnoreImplicitHDAttr = false);
 
   /// Gets the CUDA target for the current context.
   CUDAFunctionTarget CurrentCUDATarget() {
@@ -11686,6 +11691,15 @@
   /// \returns preference value for particular Caller/Callee combination.
   CUDAFunctionPreference IdentifyCUDAPreference(const FunctionDecl *Caller,
                                                 const FunctionDecl *Callee);
+  /// Identifies relative preference of a given non-local VD within a Caller,
+  /// based on their host/device attributes.
+  /// \param Caller function which needs address of \p Callee.
+  ///               nullptr in case of global context.
+  /// \param VD     the non-local variable.
+  ///
+  /// \returns preference value for that VD within Caller.
+  CUDAFunctionPreference IdentifyCUDAPreference(const FunctionDecl *Caller,
+                                                const VarDecl *VD);
 
   /// Determines whether Caller may invoke Callee, based on their CUDA
   /// host/device attributes.  Returns false if the call is not allowed.
@@ -11718,6 +11732,26 @@
   ///
   /// - Otherwise, returns true without emitting any diagnostics.
   bool CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee);
+  /// Check whether we're allowed to access VD, a non-local varilable, from the
+  /// given Caller.
+  ///
+  /// - If the accesss is never allowed in a semantically-correct program
+  ///   (CFP_Never), emits an error and returns false.
+  ///
+  /// - If the access is allowed in semantically-correct programs, but only if
+  ///   it's never codegen'ed (CFP_WrongSide), creates a deferred diagnostic to
+  ///   be emitted if and when the caller is codegen'ed, and returns true.
+  ///
+  ///   Will only create deferred diagnostics for a given SourceLocation once,
+  ///   so you can safely call this multiple times without generating duplicate
+  ///   deferred errors.
+  ///
+  /// - Otherwise, returns true without emitting any diagnostics.
+  ///
+  /// TODO: A shadow variable on the host side should be treated specially as
+  /// it is only allowed to be accessed through the runtime interface. It
+  /// cannot be accessed as a regular variable.
+  bool CheckCUDAAccess(SourceLocation Loc, FunctionDecl *Caller, VarDecl *VD);
 
   /// Set __device__ or __host__ __device__ attributes on the given lambda
   /// operator() method.
@@ -11766,6 +11800,19 @@
   // for __constant__ and __device__ variables.
   void checkAllowedCUDAInitializer(VarDecl *VD);
 
+  // \brief Check that default arguments potentially violate CUDA restrictions
+  // in a function declaration. Only warning is issued as it is bound at the
+  // point of declaration.
+  //
+  // \details __device__ variables are accessible from all the threads within
+  // the grid and from the host through the runtime interfaces (see B.2.1).
+  bool checkCUDAParamWithInvalidDefaultArg(SourceLocation Loc, FunctionDecl *FD,
+                                           ParmVarDecl *PVD);
+  // \brief Check that default arguments potentially violate CUDA restrictions
+  // in a function declaration. An error is generated if there is any violance.
+  bool checkCUDAInvalidDefaultArgument(SourceLocation Loc, FunctionDecl *FD,
+                                       Expr *E);
+
   /// Check whether NewFD is a valid overload for CUDA. Emits
   /// diagnostics and invalidates NewFD if not.
   void checkCUDATargetOverload(FunctionDecl *NewFD,
diff --git a/clang/include/clang/Sema/SemaInternal.h b/clang/include/clang/Sema/SemaInternal.h
--- a/clang/include/clang/Sema/SemaInternal.h
+++ b/clang/include/clang/Sema/SemaInternal.h
@@ -327,6 +327,13 @@
   return *this;
 }
 
+/// Determine whether the given declaration is a global variable or static data
+/// member.
+inline bool isNonLocalVariable(const Decl *D) {
+  const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
+  return VD && VD->hasGlobalStorage();
+}
+
 } // end namespace clang
 
 #endif
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -13,6 +13,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/ExprCXX.h"
+#include "clang/AST/StmtVisitor.h"
 #include "clang/Basic/Cuda.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Lex/Preprocessor.h"
@@ -96,33 +97,34 @@
 }
 
 template <typename A>
-static bool hasAttr(const FunctionDecl *D, bool IgnoreImplicitAttr) {
+static bool hasAttr(const Decl *D, bool IgnoreImplicitAttr) {
   return D->hasAttrs() && llvm::any_of(D->getAttrs(), [&](Attr *Attribute) {
            return isa<A>(Attribute) &&
                   !(IgnoreImplicitAttr && Attribute->isImplicit());
          });
 }
 
-/// IdentifyCUDATarget - Determine the CUDA compilation target for this function
-Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D,
+/// IdentifyCUDATarget - Determine the CUDA compilation target for this
+/// function.
+Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *FD,
                                                   bool IgnoreImplicitHDAttr) {
   // Code that lives outside a function is run on the host.
-  if (D == nullptr)
+  if (FD == nullptr)
     return CFT_Host;
 
-  if (D->hasAttr<CUDAInvalidTargetAttr>())
+  if (FD->hasAttr<CUDAInvalidTargetAttr>())
     return CFT_InvalidTarget;
 
-  if (D->hasAttr<CUDAGlobalAttr>())
+  if (FD->hasAttr<CUDAGlobalAttr>())
     return CFT_Global;
 
-  if (hasAttr<CUDADeviceAttr>(D, IgnoreImplicitHDAttr)) {
-    if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr))
+  if (hasAttr<CUDADeviceAttr>(FD, IgnoreImplicitHDAttr)) {
+    if (hasAttr<CUDAHostAttr>(FD, IgnoreImplicitHDAttr))
       return CFT_HostDevice;
     return CFT_Device;
-  } else if (hasAttr<CUDAHostAttr>(D, IgnoreImplicitHDAttr)) {
+  } else if (hasAttr<CUDAHostAttr>(FD, IgnoreImplicitHDAttr)) {
     return CFT_Host;
-  } else if (D->isImplicit() && !IgnoreImplicitHDAttr) {
+  } else if (FD->isImplicit() && !IgnoreImplicitHDAttr) {
     // Some implicit declarations (like intrinsic functions) are not marked.
     // Set the most lenient target on them for maximal flexibility.
     return CFT_HostDevice;
@@ -131,6 +133,48 @@
   return CFT_Host;
 }
 
+/// IdentifyCUDATarget - Determine the CUDA compilation target for this
+/// variable.
+Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const VarDecl *VD,
+                                                  bool IgnoreImplicitHDAttr) {
+  // Code that lives outside a function is run on the host.
+  if (VD == nullptr)
+    return CFT_Host;
+
+  assert(VD->hasGlobalStorage() &&
+         "Only non-local variable needs identifying.");
+
+  if (VD->hasAttr<CUDAInvalidTargetAttr>())
+    return CFT_InvalidTarget;
+
+  if (hasAttr<CUDAConstantAttr>(VD, IgnoreImplicitHDAttr) ||
+      hasAttr<CUDADeviceAttr>(VD, IgnoreImplicitHDAttr) ||
+      hasAttr<CUDASharedAttr>(VD, IgnoreImplicitHDAttr))
+    return CFT_Device;
+
+  if (VD->getType()->isCUDADeviceBuiltinSurfaceType() ||
+      VD->getType()->isCUDADeviceBuiltinTextureType())
+    return CFT_HostDevice;
+
+  return CFT_Host;
+}
+
+/// IdentifyCUDATarget - Determine the CUDA compilation target for a given
+/// declaration.
+Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const Decl *D,
+                                                  bool IgnoreImplicitHDAttr) {
+  if (D == nullptr)
+    return CFT_Host;
+
+  if (const auto *FD = dyn_cast<FunctionDecl>(D))
+    return IdentifyCUDATarget(FD, IgnoreImplicitHDAttr);
+
+  if (const auto *VD = dyn_cast<VarDecl>(D))
+    return IdentifyCUDATarget(VD, IgnoreImplicitHDAttr);
+
+  llvm_unreachable("Unexpected decl for CUDA target identification.");
+}
+
 // * CUDA Call preference table
 //
 // F - from,
@@ -211,6 +255,91 @@
   llvm_unreachable("All cases should've been handled by now.");
 }
 
+// * CUDA variable reference preference table
+//
+// F - from,
+// T - to
+// Ph - preference in host mode
+// Pd - preference in device mode
+// H  - handled in (x)
+// Preferences: N:native, SS:same side, HD:host-device, WS:wrong side, --:never.
+//
+// | F  | T  | Ph  | Pd  |  H  |
+// |----+----+-----+-----+-----+
+// | d  | d  | N   | N   | (b) |
+// | d  | h  | --  | --  | (e) |
+// | d  | hd | HD  | HD  | (a) |
+// | g  | d  | N   | N   | (b) |
+// | g  | h  | --  | --  | (e) |
+// | g  | hd | HD  | HD  | (a) |
+// | h  | d  | HD* | HD* | (d) |
+// | h  | h  | N   | N   | (b) |
+// | h  | hd | HD  | HD  | (a) |
+// | hd | d  | HD* | SS  | (c) |
+// | hd | h  | SS  | WS  | (c) |
+// | hd | hd | HD  | HD  | (a) |
+//
+// * As the shadow variable is always generated on the host side for each
+//   device variable, the host-side code could always access its shadow copy.
+
+Sema::CUDAFunctionPreference
+Sema::IdentifyCUDAPreference(const FunctionDecl *Caller, const VarDecl *VD) {
+  assert(VD && isNonLocalVariable(VD) && "Variable must be a non-local one.");
+  CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller);
+  CUDAFunctionTarget CalleeTarget = IdentifyCUDATarget(VD);
+
+  // If one of the targets is invalid, the check always fails, no matter what
+  // the other target is.
+  if (CallerTarget == CFT_InvalidTarget || CalleeTarget == CFT_InvalidTarget)
+    return CFP_Never;
+
+  // (a) Accessing HostDevice is OK for everyone.
+  if (CalleeTarget == CFT_HostDevice)
+    return CFP_HostDevice;
+
+  // (b) Best case scenarios
+  if (CalleeTarget == CallerTarget ||
+      (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
+    return CFP_Native;
+
+  // (c) HostDevice behavior depends on compilation mode.
+  if (CallerTarget == CFT_HostDevice) {
+    // It's OK to call a compilation-mode matching function from an HD one.
+    if ((getLangOpts().CUDAIsDevice && CalleeTarget == CFT_Device) ||
+        (!getLangOpts().CUDAIsDevice && CalleeTarget == CFT_Host))
+      return CFP_SameSide;
+
+    // Device variables always have their shadow copies on the host side. Even
+    // though the access to them should be made through the runtime API, they
+    // are basically allowed to be accessed in the host code. It's too costy to
+    // examine whether their accesses in the host code is valid, extra tools
+    // such as clang-tidy may need enhancing to report those improper uses.
+    if (CalleeTarget == CFT_Device)
+      return CFP_HostDevice;
+
+    // Calls from HD to non-mode-matching functions (i.e., to host functions
+    // when compiling in device mode or to device functions when compiling in
+    // host mode) are allowed at the sema level, but eventually rejected if
+    // they're ever codegened.  TODO: Reject said calls earlier.
+    return CFP_WrongSide;
+  }
+
+  // (d) Device variables always have their shadow copies on the host side.
+  // Even though the access to them should be made through the runtime API,
+  // they are basically allowed to be accessed in the host code. It's too costy
+  // to examine whether their accesses in the host code is valid, extra tools
+  // such as clang-tidy may need enhancing to report those improper uses.
+  if (CallerTarget == CFT_Host && CalleeTarget == CFT_Device)
+    return CFP_HostDevice;
+
+  // (e) Calling across device/host boundary is not something you should do.
+  if ((CallerTarget == CFT_Device && CalleeTarget == CFT_Host) ||
+      (CallerTarget == CFT_Global && CalleeTarget == CFT_Host))
+    return CFP_Never;
+
+  llvm_unreachable("All cases should've been handled by now.");
+}
+
 void Sema::EraseUnwantedCUDAMatches(
     const FunctionDecl *Caller,
     SmallVectorImpl<std::pair<DeclAccessPair, FunctionDecl *>> &Matches) {
@@ -542,6 +671,61 @@
   }
 }
 
+namespace {
+class CheckDefaultArgumentVisitor
+    : public StmtVisitor<CheckDefaultArgumentVisitor, bool> {
+  Sema &S;
+  SourceLocation Loc;
+  FunctionDecl *FD;
+  ParmVarDecl *PVD;
+
+public:
+  CheckDefaultArgumentVisitor(Sema &S, SourceLocation L, FunctionDecl *F,
+                              ParmVarDecl *P = nullptr)
+      : S(S), Loc(L), FD(F), PVD(P) {}
+
+  bool VisitStmt(Stmt *S) {
+    bool Invalid = false;
+    for (auto *Child : S->children())
+      Invalid |= Child && Visit(Child);
+    return Invalid;
+  }
+
+  bool VisitDeclRefExpr(DeclRefExpr *DRE) {
+    auto VD = dyn_cast<VarDecl>(DRE->getDecl());
+    if (!VD || !isNonLocalVariable(VD))
+      return false;
+    if (PVD) {
+      switch (S.IdentifyCUDAPreference(FD, VD)) {
+      default:
+        return false;
+      case Sema::CFP_Never:
+      case Sema::CFP_WrongSide:
+        break;
+      }
+      S.Diag(Loc, diag::warn_ref_bad_target_default_argument)
+          << S.IdentifyCUDATarget(VD) << VD << S.IdentifyCUDATarget(FD);
+      S.Diag(VD->getLocation(), diag::note_previous_decl) << VD;
+      return true;
+    }
+    return S.CheckCUDAAccess(Loc, FD, VD);
+  }
+};
+} // End anonymous namespace
+
+bool Sema::checkCUDAParamWithInvalidDefaultArg(SourceLocation Loc,
+                                               FunctionDecl *FD,
+                                               ParmVarDecl *PVD) {
+  CheckDefaultArgumentVisitor Checker(*this, Loc, FD, PVD);
+  return Checker.Visit(PVD->getDefaultArg());
+}
+
+bool Sema::checkCUDAInvalidDefaultArgument(SourceLocation Loc, FunctionDecl *FD,
+                                           Expr *E) {
+  CheckDefaultArgumentVisitor Checker(*this, Loc, FD);
+  return Checker.Visit(E);
+}
+
 // With -fcuda-host-device-constexpr, an unattributed constexpr function is
 // treated as implicitly __host__ __device__, unless:
 //  * it is a variadic function (device-side variadic functions are not
@@ -703,7 +887,8 @@
     return true;
 
   DeviceDiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
-      << IdentifyCUDATarget(Callee) << Callee << IdentifyCUDATarget(Caller);
+      << IdentifyCUDATarget(Callee) << /*function*/ 0 << Callee
+      << IdentifyCUDATarget(Caller);
   DeviceDiagBuilder(DiagKind, Callee->getLocation(), diag::note_previous_decl,
                     Caller, *this)
       << Callee;
@@ -711,6 +896,56 @@
          DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack;
 }
 
+bool Sema::CheckCUDAAccess(SourceLocation Loc, FunctionDecl *Caller,
+                           VarDecl *VD) {
+  assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
+  assert(VD && isNonLocalVariable(VD) && "Variable must be a non-local one.");
+
+  // FIXME: Is bailing out early correct here?  Should we instead assume that
+  // the caller is a global initializer?
+  if (!Caller)
+    return true;
+
+  // If the caller is known-emitted, mark the callee as known-emitted.
+  // Otherwise, mark the call in our call graph so we can traverse it later.
+  bool CallerKnownEmitted =
+      getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted;
+  DeviceDiagBuilder::Kind DiagKind = [this, Caller, VD, CallerKnownEmitted] {
+    switch (IdentifyCUDAPreference(Caller, VD)) {
+    case CFP_Never:
+      return DeviceDiagBuilder::K_Immediate;
+    case CFP_WrongSide:
+      assert(Caller && "WrongSide calls require a non-null caller");
+      // If we know the caller will be emitted, we know this wrong-side call
+      // will be emitted, so it's an immediate error.  Otherwise, defer the
+      // error until we know the caller is emitted.
+      return CallerKnownEmitted ? DeviceDiagBuilder::K_ImmediateWithCallStack
+                                : DeviceDiagBuilder::K_Deferred;
+    default:
+      return DeviceDiagBuilder::K_Nop;
+    }
+  }();
+
+  if (DiagKind == DeviceDiagBuilder::K_Nop)
+    return true;
+
+  // Avoid emitting this error twice for the same location.  Using a hashtable
+  // like this is unfortunate, but because we must continue parsing as normal
+  // after encountering a deferred error, it's otherwise very tricky for us to
+  // ensure that we only emit this deferred error once.
+  if (!LocsWithCUDACallDiags.insert({Caller, Loc}).second)
+    return true;
+
+  DeviceDiagBuilder(DiagKind, Loc, diag::err_ref_bad_target, Caller, *this)
+      << IdentifyCUDATarget(VD) << /*variable*/ 1 << VD
+      << IdentifyCUDATarget(Caller);
+  DeviceDiagBuilder(DiagKind, VD->getLocation(), diag::note_previous_decl,
+                    Caller, *this)
+      << VD;
+  return DiagKind != DeviceDiagBuilder::K_Immediate &&
+         DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack;
+}
+
 void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   if (Method->hasAttr<CUDAHostAttr>() || Method->hasAttr<CUDADeviceAttr>())
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -1546,6 +1546,10 @@
   unsigned LastMissingDefaultArg = 0;
   for (; p < NumParams; ++p) {
     ParmVarDecl *Param = FD->getParamDecl(p);
+    if (getLangOpts().CUDA && Param->hasDefaultArg() &&
+        (FD->hasAttr<CUDADeviceAttr>() || FD->hasAttr<CUDAGlobalAttr>())) {
+      checkCUDAParamWithInvalidDefaultArg(Param->getLocation(), FD, Param);
+    }
     if (!Param->hasDefaultArg() && !Param->isParameterPack()) {
       if (Param->isInvalidDecl())
         /* We already complained about this parameter. */;
@@ -16912,15 +16916,6 @@
     Diag(D->getLocation(), diag::err_illegal_initializer);
 }
 
-/// Determine whether the given declaration is a global variable or
-/// static data member.
-static bool isNonlocalVariable(const Decl *D) {
-  if (const VarDecl *Var = dyn_cast_or_null<VarDecl>(D))
-    return Var->hasGlobalStorage();
-
-  return false;
-}
-
 /// Invoked when we are about to parse an initializer for the declaration
 /// 'Dcl'.
 ///
@@ -16943,7 +16938,7 @@
   // If we are parsing the initializer for a static data member, push a
   // new expression evaluation context that is associated with this static
   // data member.
-  if (isNonlocalVariable(D))
+  if (isNonLocalVariable(D))
     PushExpressionEvaluationContext(
         ExpressionEvaluationContext::PotentiallyEvaluated, D);
 }
@@ -16954,7 +16949,7 @@
   if (!D || D->isInvalidDecl())
     return;
 
-  if (isNonlocalVariable(D))
+  if (isNonLocalVariable(D))
     PopExpressionEvaluationContext();
 
   if (S && D->isOutOfLine())
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -345,6 +345,11 @@
     return true;
   }
 
+  if (LangOpts.CUDA && isNonLocalVariable(D) &&
+      !CheckCUDAAccess(Loc, dyn_cast<FunctionDecl>(CurContext),
+                       cast<VarDecl>(D)))
+    return true;
+
   DiagnoseAvailabilityOfDecl(D, Locs, UnknownObjCClass, ObjCPropertyAccess,
                              AvoidPartialAvailabilityChecks, ClassReceiver);
 
@@ -5480,6 +5485,13 @@
            "default argument expression has capturing blocks?");
   }
 
+  // TODO: Add CUDA check on the default argument and issue warning if any
+  // invalid target reference from the function.
+  if (getLangOpts().CUDA &&
+      checkCUDAInvalidDefaultArgument(
+          CallLoc, dyn_cast<FunctionDecl>(CurContext), Param->getDefaultArg()))
+    return true;
+
   // We already type-checked the argument, so we know it works.
   // Just mark all of the declarations in this potentially-evaluated expression
   // as being "referenced".
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -976,8 +976,6 @@
       startLambdaDefinition(Class, Intro.Range, MethodTyInfo, EndLoc, Params,
                             ParamInfo.getDeclSpec().getConstexprSpecifier(),
                             ParamInfo.getTrailingRequiresClause());
-  if (ExplicitParams)
-    CheckCXXDefaultArguments(Method);
 
   // This represents the function body for the lambda function, check if we
   // have to apply optnone due to a pragma.
@@ -995,6 +993,10 @@
   if (getLangOpts().CUDA)
     CUDASetLambdaAttrs(Method);
 
+  // Check parameters with default arguments.
+  if (ExplicitParams)
+    CheckCXXDefaultArguments(Method);
+
   // Number the lambda for linkage purposes if necessary.
   handleLambdaNumbering(Class, Method);
 
diff --git a/clang/test/CodeGenCUDA/function-overload.cu b/clang/test/CodeGenCUDA/function-overload.cu
--- a/clang/test/CodeGenCUDA/function-overload.cu
+++ b/clang/test/CodeGenCUDA/function-overload.cu
@@ -12,13 +12,15 @@
 #include "Inputs/cuda.h"
 
 // Check constructors/destructors for D/H functions
-int x;
+__device__ int x;
 struct s_cd_dh {
+  // TODO: Need to generate warning on direct accesses on shadow variables.
   __host__ s_cd_dh() { x = 11; }
   __device__ s_cd_dh() { x = 12; }
 };
 
 struct s_cd_hd {
+  // TODO: Need to generate warning on direct accesses on shadow variables.
   __host__ __device__ s_cd_hd() { x = 31; }
   __host__ __device__ ~s_cd_hd() { x = 32; }
 };
diff --git a/clang/test/SemaCUDA/variable-target.cu b/clang/test/SemaCUDA/variable-target.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCUDA/variable-target.cu
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+static int gvar;
+// expected-note@-1{{'gvar' declared here}}
+// expected-note@-2{{'gvar' declared here}}
+// expected-note@-3{{'gvar' declared here}}
+// expected-note@-4{{'gvar' declared here}}
+// expected-note@-5{{'gvar' declared here}}
+// expected-note@-6{{'gvar' declared here}}
+
+__device__ int d0() {
+  // expected-error@+1{{reference to __host__ variable 'gvar' in __device__ function}}
+  return gvar;
+}
+__device__ int d1() {
+  // expected-error@+1{{reference to __host__ variable 'gvar' in __device__ function}}
+  return []() -> int { return gvar; }();
+}
+
+// expected-warning@+1{{reference to __host__ variable 'gvar' as default argument in __device__ function}}
+__device__ int d2(int arg = gvar) {
+  return arg;
+}
+__device__ int d3() {
+  // expected-error@+1{{reference to __host__ variable 'gvar' in __device__ function}}
+  return d2();
+}
+
+template <typename F>
+__global__ void g0(F f) {
+  // expected-error@+1{{reference to __host__ variable 'gvar' in __global__ function}}
+  f();
+}
+int h0() {
+  // expected-warning@+1{{reference to __host__ variable 'gvar' as default argument in __device__ function}}
+  g0<<<1, 1>>>([] __device__(int arg = gvar) -> int { return arg; });
+  // expected-note-re@-1{{in instantiation of function template specialization 'g0<(lambda at {{.*}})>' requested here}}
+  return 0;
+}