diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -7496,6 +7496,12 @@
   "cannot overload %select{__device__|__global__|__host__|__host__ __device__}2 function %3">;
 def note_cuda_ovl_candidate_target_mismatch : Note<
     "candidate template ignored: target attributes do not match">;
+def warn_decltype_ambiguous_return_type : Warning<
+    "return type of %0 in 'decltype' is ambiguous and may not be expected">;
+def note_decltype_ambiguous_function_chosen : Note<
+    "use this definition of %0">;
+def note_decltype_ambiguous_function_other : Note<
+    "other definition of %0">;
 
 def warn_non_pod_vararg_with_format_string : Warning<
   "cannot pass %select{non-POD|non-trivial}0 object of type %1 to variadic "
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -8102,6 +8102,14 @@
     return ExprEvalContexts.back().isUnevaluated();
   }
 
+  /// Returns true if any entry of ExprEvalContexts is a `decltype` context.
+  bool underDecltypeContext() const {
+    for (const ExpressionEvaluationContextRecord &Rec : ExprEvalContexts)
+      if (Rec.ExprContext == ExpressionEvaluationContextRecord::EK_Decltype)
+        return true;
+    return false;
+  }
+
   /// RAII class used to determine whether SFINAE has
   /// trapped any errors that occur during template argument
   /// deduction.
@@ -10953,14 +10961,18 @@
   CUDAFunctionPreference IdentifyCUDAPreference(const FunctionDecl *Caller,
                                                 const FunctionDecl *Callee);
 
-  /// Determines whether Caller may invoke Callee, based on their CUDA
-  /// host/device attributes.  Returns false if the call is not allowed.
+  /// Determines whether code in the current context (CurContext) may invoke
+  /// Callee, based on CUDA host/device attributes. Returns false if the call
+  /// is not allowed. Inside `decltype` every call is considered allowed.
   ///
   /// Note: Will return true for CFP_WrongSide calls.  These may appear in
   /// semantically correct CUDA programs, but only if they're never codegen'ed.
-  bool IsAllowedCUDACall(const FunctionDecl *Caller,
-                         const FunctionDecl *Callee) {
-    return IdentifyCUDAPreference(Caller, Callee) != CFP_Never;
+  bool isCUDACallAllowed(const FunctionDecl *Callee) {
+    // Operands of `decltype` are unevaluated, so the target check is relaxed.
+    if (underDecltypeContext())
+      return true;
+    const auto *Caller = dyn_cast<FunctionDecl>(CurContext);
+    return IdentifyCUDAPreference(Caller, Callee) != CFP_Never;
   }
 
   /// May add implicit CUDAHostAttr and CUDADeviceAttr attributes to FD,
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6237,7 +6237,7 @@
       // case we may not yet know what the member's target is; the target is
       // inferred for the member automatically, based on the bases and fields of
       // the class.
-      if (!Caller->isImplicit() && !IsAllowedCUDACall(Caller, Function)) {
+      if (!Caller->isImplicit() && !isCUDACallAllowed(Function)) {
         Candidate.Viable = false;
         Candidate.FailureKind = ovl_fail_bad_target;
         return;
@@ -6753,7 +6753,7 @@
   // (CUDA B.1): Check for invalid calls between targets.
   if (getLangOpts().CUDA)
     if (const FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext))
-      if (!IsAllowedCUDACall(Caller, Method)) {
+      if (!isCUDACallAllowed(Method)) {
         Candidate.Viable = false;
         Candidate.FailureKind = ovl_fail_bad_target;
         return;
@@ -9673,6 +9673,28 @@
   if (Best->Function && Best->Function->isDeleted())
     return OR_Deleted;
 
+  // In CUDA, warn if viable candidates under `decltype` differ in return type.
+  if (S.getLangOpts().CUDA && Best->Function && S.underDecltypeContext()) {
+    SmallVector<const OverloadCandidate *, 16> AmbiSet;
+    QualType BestRetTy = Best->Function->getReturnType();
+    for (const OverloadCandidate &Cand : this->Candidates) {
+      // Compare canonical types so sugar alone never triggers the warning.
+      if (Cand.Viable && Cand.Function &&
+          !S.Context.hasSameType(BestRetTy, Cand.Function->getReturnType()))
+        AmbiSet.push_back(&Cand);
+    }
+    if (!AmbiSet.empty()) {
+      S.Diag(Loc, diag::warn_decltype_ambiguous_return_type) << Best->Function;
+      S.Diag(Best->Function->getLocation(),
+             diag::note_decltype_ambiguous_function_chosen)
+          << Best->Function;
+      for (const OverloadCandidate *Other : AmbiSet)
+        S.Diag(Other->Function->getLocation(),
+               diag::note_decltype_ambiguous_function_other)
+            << Other->Function;
+    }
+  }
+
   if (!EquivalentCands.empty())
     S.diagnoseEquivalentInternalLinkageDeclarations(Loc, Best->Function,
                                                     EquivalentCands);
@@ -11491,7 +11513,7 @@
     if (FunctionDecl *FunDecl = dyn_cast<FunctionDecl>(Fn)) {
       if (S.getLangOpts().CUDA)
         if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext))
-          if (!Caller->isImplicit() && !S.IsAllowedCUDACall(Caller, FunDecl))
+          if (!Caller->isImplicit() && !S.isCUDACallAllowed(FunDecl))
             return false;
       if (FunDecl->isMultiVersion()) {
         const auto *TA = FunDecl->getAttr<TargetAttr>();
diff --git a/clang/test/CodeGenCUDA/function-overload.cu b/clang/test/CodeGenCUDA/function-overload.cu
--- a/clang/test/CodeGenCUDA/function-overload.cu
+++ b/clang/test/CodeGenCUDA/function-overload.cu
@@ -8,6 +8,8 @@
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+// RUN: %clang_cc1 -std=c++11 -DCHECK_DECLTYPE -triple amdgcn -fcuda-is-device -emit-llvm -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-DECLTYPE %s
 
 #include "Inputs/cuda.h"
 
@@ -53,3 +55,14 @@
 // CHECK-BOTH: define linkonce_odr void @_ZN7s_cd_hdD2Ev(
 // CHECK-BOTH: store i32 32,
 // CHECK-BOTH: ret void
+
+#if defined(CHECK_DECLTYPE)
+int foo(float);
+// CHECK-DECLTYPE-LABEL: @_Z3barf
+// CHECK-DECLTYPE: fptosi
+// CHECK-DECLTYPE: sitofp
+__device__ float bar(float x) {
+  decltype(foo(x)) y = x;
+  return y + 3.f;
+}
+#endif
diff --git a/clang/test/Misc/warning-flags.c b/clang/test/Misc/warning-flags.c
--- a/clang/test/Misc/warning-flags.c
+++ b/clang/test/Misc/warning-flags.c
@@ -18,7 +18,7 @@
 
 The list of warnings below should NEVER grow.  It should gradually shrink to 0.
 
-CHECK: Warnings without flags (74):
+CHECK: Warnings without flags (75):
 CHECK-NEXT:   ext_excess_initializers
 CHECK-NEXT:   ext_excess_initializers_in_char_array_initializer
 CHECK-NEXT:   ext_expected_semi_decl_list
@@ -47,6 +47,7 @@
 CHECK-NEXT:   warn_conv_to_base_not_used
 CHECK-NEXT:   warn_conv_to_self_not_used
 CHECK-NEXT:   warn_conv_to_void_not_used
+CHECK-NEXT:   warn_decltype_ambiguous_return_type
 CHECK-NEXT:   warn_delete_array_type
 CHECK-NEXT:   warn_double_const_requires_fp64
 CHECK-NEXT:   warn_drv_assuming_mfloat_abi_is
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -3,6 +3,8 @@
 
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -std=c++11 -DCHECK_DECLTYPE -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -DCHECK_DECLTYPE -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
 
 #include "Inputs/cuda.h"
 
@@ -419,3 +421,30 @@
 int test_constexpr_overload(C2 &x, C2 &y) {
   return constexpr_overload(x, y);
 }
+
+#if defined(CHECK_DECLTYPE)
+#if defined(__CUDA_ARCH__)
+// expected-note@+6 {{other definition of 't0'}}
+// expected-note@+6 {{use this definition of 't0'}}
+#else
+// expected-note@+3 {{use this definition of 't0'}}
+// expected-note@+3 {{other definition of 't0'}}
+#endif
+__host__ float t0();
+__device__ int t0();
+
+__host__ __device__ void dt0() {
+  // expected-warning@+1 {{return type of 't0' in 'decltype' is ambiguous and may not be expected}}
+  decltype(t0()) ret;
+}
+
+__host__ float t1();
+
+__device__ void dt1() {
+  decltype(t1()) ret; // OK. `decltype` is relaxed.
+}
+
+__host__ __device__ void dt2() {
+  decltype(t1()) ret; // OK. `decltype` is relaxed.
+}
+#endif