Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -238,6 +238,7 @@
 LANGOPT(SYCLVersion       , 32, 0, "Version of the SYCL standard used")
 
 LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
+LANGOPT(HIPLambdaHostDevice, 1, 0, "Let non-reference-capturing lambda be host device for HIP")
 
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -625,6 +625,11 @@
 def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
   Flags<[CC1Option]>,
   HelpText<"Default max threads per block for kernel launch bounds for HIP">;
+def fhip_lambda_host_device : Flag<["-"], "fhip-lambda-host-device">,
+  Flags<[CC1Option]>,
+  HelpText<"Let a lambda function without host/device attributes be a host "
+  "device function if it does not capture by reference (HIP only)">;
+def fno_hip_lambda_host_device : Flag<["-"], "fno-hip-lambda-host-device">;
 def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
   HelpText<"Path to libomptarget-nvptx libraries">;
 def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
Index: clang/include/clang/Sema/Sema.h
===================================================================
--- clang/include/clang/Sema/Sema.h
+++ clang/include/clang/Sema/Sema.h
@@ -11677,7 +11677,7 @@
   /// CUDA lambdas declared inside __device__ or __global__ functions inherit
   /// the __device__ attribute.  Similarly, lambdas inside __host__ __device__
   /// functions become __host__ __device__ themselves.
-  void CUDASetLambdaAttrs(CXXMethodDecl *Method);
+  void CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI);
 
   /// Finds a function in \p Matches with highest calling priority
   /// from \p Caller context and erases all functions with lower
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -5404,6 +5404,10 @@
                    options::OPT_fno_hip_new_launch_api, false))
     CmdArgs.push_back("-fhip-new-launch-api");
 
+  if (Args.hasFlag(options::OPT_fhip_lambda_host_device,
+                   options::OPT_fhip_lambda_host_device, false))
+    CmdArgs.push_back("-fhip-lambda-host-device");
+
   if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {
     CmdArgs.push_back(
         Args.MakeArgString(Twine("-fcf-protection=") + A->getValue()));
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -2578,6 +2578,7 @@
           << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
   }
   Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
+  Opts.HIPLambdaHostDevice = Args.hasArg(OPT_fhip_lambda_host_device);
   if (Opts.HIP)
     Opts.GPUMaxThreadsPerBlock = getLastArgIntValue(
         Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock);
Index: clang/lib/Sema/SemaCUDA.cpp
===================================================================
--- clang/lib/Sema/SemaCUDA.cpp
+++ clang/lib/Sema/SemaCUDA.cpp
@@ -711,13 +711,30 @@
          DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack;
 }
 
-void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
+static bool LambdaHasRefCaptures(LambdaIntroducer &LI) {
+  if (LI.Default == LCD_ByRef)
+    return true;
+  for (auto &C : LI.Captures) {
+    if (C.Kind == LCK_ByRef || C.Kind == LCK_This)
+      return true;
+  }
+  return false;
+}
+
+void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   if (Method->hasAttr<CUDAHostAttr>() || Method->hasAttr<CUDADeviceAttr>())
     return;
   FunctionDecl *CurFn = dyn_cast<FunctionDecl>(CurContext);
   if (!CurFn)
     return;
+  if (getLangOpts().HIP &&
+      (Method->isConstexpr() ||
+       (getLangOpts().HIPLambdaHostDevice && !LambdaHasRefCaptures(LI)))) {
+    Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+    Method->addAttr(CUDAHostAttr::CreateImplicit(Context));
+    return;
+  }
   CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn);
   if (Target == CFT_Global || Target == CFT_Device) {
     Method->addAttr(CUDADeviceAttr::CreateImplicit(Context));
Index: clang/lib/Sema/SemaLambda.cpp
===================================================================
--- clang/lib/Sema/SemaLambda.cpp
+++ clang/lib/Sema/SemaLambda.cpp
@@ -996,7 +996,7 @@
   // CUDA lambdas get implicit attributes based on the scope in which they're
   // declared.
   if (getLangOpts().CUDA)
-    CUDASetLambdaAttrs(Method);
+    CUDASetLambdaAttrs(Method, Intro);
 
   // Number the lambda for linkage purposes if necessary.
   handleLambdaNumbering(Class, Method);
Index: clang/test/CodeGenCUDA/lambda.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/lambda.cu
@@ -0,0 +1,85 @@
+// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN:   -triple x86_64-linux-gnu -fhip-lambda-host-device \
+// RUN:   | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \
+// RUN:   -fhip-lambda-host-device -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// Device side kernel name.
+// HOST: @[[KERN_CAPTURE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_capturevEUlvE_EvT_\00"
+// HOST: @[[KERN_RESOLVE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_resolvevEUlvE_EvT_\00"
+
+// Check functions emitted for test_capture in host compilation.
+// Check lambda is not emitted in host compilation.
+// HOST-LABEL: define void @_Z12test_capturev
+// HOST:  call void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_
+// HOST-LABEL: define internal void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_
+// HOST:  call void @_Z16__device_stub__gIZ12test_capturevEUlvE_EvT_
+// HOST-NOT: define{{.*}}@_ZZ4mainENKUlvE_clEv
+
+// Check functions emitted for test_resolve in host compilation.
+// Check host version of template function 'overloaded' is emitted and called
+// by the lambda function.
+// HOST-LABEL: define void @_Z12test_resolvev
+// HOST:  call void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_()
+// HOST-LABEL: define internal void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_
+// HOST:  call void @_Z16__device_stub__gIZ12test_resolvevEUlvE_EvT_
+// HOST:  call void @_ZZ12test_resolvevENKUlvE_clEv
+// HOST-LABEL: define internal void @_ZZ12test_resolvevENKUlvE_clEv
+// HOST:  call i32 @_Z10overloadedIiET_v
+// HOST-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v
+// HOST:  ret i32 2
+
+// Check kernel is registered with correct device side kernel name.
+// HOST: @__hipRegisterFunction({{.*}}@[[KERN_CAPTURE]]
+// HOST: @__hipRegisterFunction({{.*}}@[[KERN_RESOLVE]]
+
+// DEV: @a = addrspace(1) externally_initialized global i32 0
+
+// Check functions emitted for test_capture in device compilation.
+// Check lambda is emitted in device compilation and accessing device variable.
+// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_capturevEUlvE_EvT_
+// DEV:  call void @_ZZ12test_capturevENKUlvE_clEv
+// DEV-LABEL: define internal void @_ZZ12test_capturevENKUlvE_clEv
+// DEV:  store i32 1, i32* addrspacecast (i32 addrspace(1)* @a to i32*)
+
+// Check functions emitted for test_resolve in device compilation.
+// Check device version of template function 'overloaded' is emitted and called
+// by the lambda function.
+// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_resolvevEUlvE_EvT_
+// DEV:  call void @_ZZ12test_resolvevENKUlvE_clEv
+// DEV-LABE: define internal void @_ZZ12test_resolvevENKUlvE_clEv
+// DEV:  call i32 @_Z10overloadedIiET_v
+// DEV-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v
+// DEV:  ret i32 1
+
+__device__ int a;
+
+template<class T>
+__device__ T overloaded() { return 1; }
+
+template<class T>
+__host__ T overloaded() { return 2; }
+
+template<class F>
+__global__ void g(F f) { f(); }
+
+template<class F>
+void test_capture_helper(F f) { g<<<1,1>>>(f); }
+
+template<class F>
+void test_resolve_helper(F f) { g<<<1,1>>>(f); f(); }
+
+// Test capture of device variable in lambda function.
+void test_capture(void) {
+  test_capture_helper([](){ a = 1;});
+}
+
+// Test resolving host/device function in lambda function.
+// Callee should resolve to correct host/device function based on where
+// the lambda function is called, not where it is defined.
+void test_resolve(void) {
+  test_resolve_helper([](){ overloaded<int>();});
+}
Index: clang/test/SemaCUDA/Inputs/cuda.h
===================================================================
--- clang/test/SemaCUDA/Inputs/cuda.h
+++ clang/test/SemaCUDA/Inputs/cuda.h
@@ -17,6 +17,19 @@
   __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {}
 };
 
+#ifdef __HIP__
+typedef struct hipStream *hipStream_t;
+typedef enum hipError {} hipError_t;
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     hipStream_t stream = 0);
+extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize,
+                                                 size_t sharedSize = 0,
+                                                 hipStream_t stream = 0);
+extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim,
+                                      dim3 blockDim, void **args,
+                                      size_t sharedMem,
+                                      hipStream_t stream);
+#else
 typedef struct cudaStream *cudaStream_t;
 typedef enum cudaError {} cudaError_t;
 
@@ -29,6 +42,7 @@
 extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
                                         dim3 blockDim, void **args,
                                         size_t sharedMem, cudaStream_t stream);
+#endif
 
 // Host- and device-side placement new overloads.
 void *operator new(__SIZE_TYPE__, void *p) { return p; }
Index: clang/test/SemaCUDA/lambda.cu
===================================================================
--- /dev/null
+++ clang/test/SemaCUDA/lambda.cu
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -std=c++17 -fsyntax-only -x hip -verify %s -fhip-lambda-host-device
+
+#include "Inputs/cuda.h"
+
+template<class F>
+__global__ void kernel(F f) { f(); }
+// expected-error@-1 3{{no matching function for call to object of type}}
+
+constexpr __host__ __device__ void hd();
+
+int main(void) {
+  auto lambda_kernel = [&]__global__(){};
+  // expected-error@-1 {{kernel function 'operator()' must be a free function or static member function}}
+
+  int b;
+
+  kernel<<<1,1>>>([](){ hd(); });
+
+  kernel<<<1,1>>>([=](){ hd(); });
+
+  kernel<<<1,1>>>([b](){ hd(); });
+
+  kernel<<<1,1>>>([&]()constexpr{ hd(); });
+
+  kernel<<<1,1>>>([&](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([=, &b](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  kernel<<<1,1>>>([&, b](){ hd(); });
+  // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}}
+  // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}}
+
+  return 0;
+}