Index: clang/lib/CodeGen/CGExpr.cpp =================================================================== --- clang/lib/CodeGen/CGExpr.cpp +++ clang/lib/CodeGen/CGExpr.cpp @@ -1522,6 +1522,29 @@ if (result.HasSideEffects) return ConstantEmission(); + // In CUDA/HIP device compilation, a lambda may capture a reference variable + // referencing a global host variable by copy. In this case the lambda should + // make a copy of the value of the global host variable. The DRE of the + // captured reference variable cannot be emitted as load from the host + // global variable as compile time constant, since the host variable is not + // accessible on device. The DRE of the captured reference variable has to be + // loaded from captures. + if (CGM.getLangOpts().CUDAIsDevice && + refExpr->refersToEnclosingVariableOrCapture()) { + auto *MD = dyn_cast_or_null(CurCodeDecl); + if (MD && MD->getParent()->isLambda() && + MD->getOverloadedOperator() == OO_Call) { + const APValue::LValueBase &base = result.Val.getLValueBase(); + if (const ValueDecl *D = base.dyn_cast()) { + if (const VarDecl *VD = dyn_cast(D)) { + if (!VD->hasAttr()) { + return ConstantEmission(); + } + } + } + } + } + // Emit as a constant. auto C = ConstantEmitter(*this).emitAbstract(refExpr->getLocation(), result.Val, resultType); Index: clang/lib/Sema/SemaExpr.cpp =================================================================== --- clang/lib/Sema/SemaExpr.cpp +++ clang/lib/Sema/SemaExpr.cpp @@ -1939,13 +1939,47 @@ if (isUnevaluatedContext()) return NOUR_Unevaluated; + // CUDA/HIP: Check whether a captured reference variable is referencing a + // host variable in a device or host device lambda. + auto IsCapturingReferenceToHostVarInCUDADeviceLambda = [&](VarDecl *VD) { + if (!getLangOpts().CUDA || !VD->hasInit()) + return false; + assert(VD->getType()->isReferenceType()); + + // Check whether the reference variable is referencing a host variable. + auto *DRE = dyn_cast(VD->getInit()); + if (!DRE) + return false; + auto *Referee = dyn_cast(DRE->getDecl()); + if (!Referee || !Referee->hasGlobalStorage() || + Referee->hasAttr()) + return false; + + // Check whether the current function is a device or host device lambda. + // Check whether the reference variable is a capture by getDeclContext() + // since refersToEnclosingVariableOrCapture() is not ready at this point. + auto *MD = dyn_cast_or_null(CurContext); + if (MD && MD->getParent()->isLambda() && + MD->getOverloadedOperator() == OO_Call && + MD->hasAttr() && VD->getDeclContext() != MD) + return true; + + return false; + }; // C++2a [basic.def.odr]p4: // A variable x whose name appears as a potentially-evaluated expression e // is odr-used by e unless [...] x is a reference that is usable in // constant expressions. + // CUDA/HIP: + // If a reference variable referencing a host variable is captured in a + // device or host device lambda, the value of the referee must be copied + // to the capture and the reference variable must be treated as odr-use + // since the value of the referee is not known at compile time and must + // be loaded from the captured. if (VarDecl *VD = dyn_cast(D)) { if (VD->getType()->isReferenceType() && !(getLangOpts().OpenMP && isOpenMPCapturedDecl(D)) && + !IsCapturingReferenceToHostVarInCUDADeviceLambda(VD) && VD->isUsableInConstantExpressions(Context)) return NOUR_Constant; } Index: clang/test/CodeGenCUDA/lambda-reference-var.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/lambda-reference-var.cu @@ -0,0 +1,126 @@ +// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \ +// RUN: -triple x86_64-linux-gnu \ +// RUN: | FileCheck -check-prefix=HOST %s +// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \ +// RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: | FileCheck -check-prefix=DEV %s + +#include "Inputs/cuda.h" + +// HOST: %[[T1:.*]] = type <{ i32*, i32, [4 x i8] }> +// HOST: %[[T2:.*]] = type { i32*, i32** } +// HOST: %[[T3:.*]] = type <{ i32*, i32, [4 x i8] }> +// DEV: %[[T1:.*]] = type { i32* } +// DEV: %[[T2:.*]] = type { i32** } +// DEV: %[[T3:.*]] = type <{ i32*, i32, [4 x i8] }> +int global_host_var; +__device__ int global_device_var; + +template +__global__ void kern(F f) { f(); } + +// DEV-LABEL: @_ZZ27dev_capture_dev_ref_by_copyPiENKUlvE_clEv( +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: store i32 %[[VAL]] +__device__ void dev_capture_dev_ref_by_copy(int *out) { + int &ref = global_device_var; + [=](){ *out = ref;}(); +} + +// DEV-LABEL: @_ZZ26dev_capture_dev_ref_by_refPiENKUlvE_clEv( +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1 +// DEV: store i32 %[[VAL2]], i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: store i32 %[[VAL]] +__device__ void dev_capture_dev_ref_by_ref(int *out) { + int &ref = global_device_var; + [&](){ ref++; *out = ref;}(); +} + +// DEV-LABEL: define void @_Z7dev_refPi( +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1 +// DEV: store i32 %[[VAL2]], i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: store i32 %[[VAL]] +__device__ void dev_ref(int *out) { + int &ref = global_device_var; + ref++; + *out = ref; +} + +// DEV-LABEL: @_ZZ14dev_lambda_refPiENKUlvE_clEv( +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1 +// DEV: store i32 %[[VAL2]], i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: %[[VAL:.*]] = load i32, i32* addrspacecast (i32 addrspace(1)* @global_device_var to i32*) +// DEV: store i32 %[[VAL]] +__device__ void dev_lambda_ref(int *out) { + [=](){ + int &ref = global_device_var; + ref++; + *out = ref; + }(); +} + +// HOST-LABEL: @_ZZ29host_capture_host_ref_by_copyPiENKUlvE_clEv( +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: store i32 %[[VAL]] +void host_capture_host_ref_by_copy(int *out) { + int &ref = global_host_var; + [=](){ *out = ref;}(); +} + +// HOST-LABEL: @_ZZ28host_capture_host_ref_by_refPiENKUlvE_clEv( +// HOST: %[[CAP:.*]] = getelementptr inbounds %[[T2]], %[[T2]]* %this1, i32 0, i32 0 +// HOST: %[[REF:.*]] = load i32*, i32** %[[CAP]] +// HOST: %[[VAL:.*]] = load i32, i32* %[[REF]] +// HOST: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1 +// HOST: store i32 %[[VAL2]], i32* %[[REF]] +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: store i32 %[[VAL]] +void host_capture_host_ref_by_ref(int *out) { + int &ref = global_host_var; + [&](){ ref++; *out = ref;}(); +} + +// HOST-LABEL: define void @_Z8host_refPi( +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1 +// HOST: store i32 %[[VAL2]], i32* @global_host_var +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: store i32 %[[VAL]] +void host_ref(int *out) { + int &ref = global_host_var; + ref++; + *out = ref; +} + +// HOST-LABEL: @_ZZ15host_lambda_refPiENKUlvE_clEv( +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: %[[VAL2:.*]] = add nsw i32 %[[VAL]], 1 +// HOST: store i32 %[[VAL2]], i32* @global_host_var +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: store i32 %[[VAL]] +void host_lambda_ref(int *out) { + [=](){ + int &ref = global_host_var; + ref++; + *out = ref; + }(); +} + +// HOST-LABEL: define void @_Z28dev_capture_host_ref_by_copyPi( +// HOST: %[[CAP:.*]] = getelementptr inbounds %[[T3]], %[[T3]]* %{{.*}}, i32 0, i32 1 +// HOST: %[[VAL:.*]] = load i32, i32* @global_host_var +// HOST: store i32 %[[VAL]], i32* %[[CAP]] +// DEV-LABEL: define internal void @_ZZ28dev_capture_host_ref_by_copyPiENKUlvE_clEv( +// DEV: %[[CAP:.*]] = getelementptr inbounds %[[T3]], %[[T3]]* %this1, i32 0, i32 1 +// DEV: %[[VAL:.*]] = load i32, i32* %[[CAP]] +// DEV: store i32 %[[VAL]] +void dev_capture_host_ref_by_copy(int *out) { + int &ref = global_host_var; + kern<<<1, 1>>>([=]__device__() { *out = ref;}); +} +