Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1624,7 +1624,8 @@ if (IRFunctionArgs.hasSRetArg()) { QualType Ret = FI.getReturnType(); llvm::Type *Ty = ConvertType(Ret); - unsigned AddressSpace = Context.getTargetAddressSpace(Ret); + unsigned AddressSpace = + Context.getTargetAddressSpace(CGM.getASTAllocaAddressSpace()); ArgTypes[IRFunctionArgs.getSRetArgNo()] = llvm::PointerType::get(Ty, AddressSpace); } @@ -4671,7 +4672,17 @@ } } if (IRFunctionArgs.hasSRetArg()) { - IRCallArgs[IRFunctionArgs.getSRetArgNo()] = SRetPtr.getPointer(); + IRCallArgs[IRFunctionArgs.getSRetArgNo()] = + getTargetHooks().performAddrSpaceCast( + *this, SRetPtr.getPointer(), LangAS::Default, + getASTAllocaAddressSpace(), + SRetPtr.getPointer() + ->getType() + ->getPointerElementType() + ->getPointerTo(getContext().getTargetAddressSpace( + getASTAllocaAddressSpace())), + /*non-null*/ true); + } else if (RetAI.isInAlloca()) { Address Addr = Builder.CreateStructGEP(ArgMemory, RetAI.getInAllocaFieldIndex()); Index: clang/lib/CodeGen/CGDecl.cpp =================================================================== --- clang/lib/CodeGen/CGDecl.cpp +++ clang/lib/CodeGen/CGDecl.cpp @@ -1604,8 +1604,11 @@ if (UsePointerValue) DebugAddr = ReturnValuePointer; - (void)DI->EmitDeclareOfAutoVariable(&D, DebugAddr.getPointer(), Builder, - UsePointerValue); + // Local variables are casted to default address space if the alloca address + // space is different. Need to strip casts to get the real variables. + (void)DI->EmitDeclareOfAutoVariable( + &D, DebugAddr.getPointer()->stripPointerCasts(), Builder, + UsePointerValue); } if (D.hasAttr() && HaveInsertPoint()) Index: clang/lib/CodeGen/CodeGenFunction.cpp =================================================================== --- clang/lib/CodeGen/CodeGenFunction.cpp +++ clang/lib/CodeGen/CodeGenFunction.cpp @@ -1084,6 +1084,22 @@ RetTy->isObjCRetainableType()) AutoreleaseResult = true; } + // Alloca address space may be different than default address space. Return + // value is expected to be in default address space. This is no-op if they + // are the same. + if (ReturnValue.isValid()) { + ReturnValue = + Address(getTargetHooks().performAddrSpaceCast( + *this, ReturnValue.getPointer(), LangAS::Default, + getASTAllocaAddressSpace(), + ReturnValue.getPointer() + ->getType() + ->getPointerElementType() + ->getPointerTo(getContext().getTargetAddressSpace( + LangAS::Default)), + /*non-null*/ true), + ReturnValue.getAlignment()); + } EmitStartEHSpec(CurCodeDecl); Index: clang/test/CodeGenCUDA/amdgpu-sret.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/amdgpu-sret.cu @@ -0,0 +1,101 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device \ +// RUN: -emit-llvm -o - -x hip %s -debug-info-kind=limited \ +// RUN: | FileCheck %s + +// Check no assertion with debug info. + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx906 \ +// RUN: -aux-triple x86_64-unknown-linux-gnu -fcuda-is-device \ +// RUN: -S -o %t.s -x hip %s \ +// RUN: -debug-info-kind=limited + +#include "Inputs/cuda.h" + +struct A { + int x[100]; + __device__ A(); +}; + +struct B { + int x[100]; +}; + +__device__ B b; + +__device__ void callee(A *a); + +// CHECK-LABEL: @_Z5func1v( +// CHECK-SAME: %struct.A addrspace(5)* noalias sret(%struct.A) align 4 %[[RET:.*]]) +// CHECK: %x = alloca [100 x i32], align 16, addrspace(5) +// CHECK: %x.ascast = addrspacecast [100 x i32] addrspace(5)* %x to [100 x i32]* +// CHECK: %p = alloca %struct.A*, align 8, addrspace(5) +// CHECK: %p.ascast = addrspacecast %struct.A* addrspace(5)* %p to %struct.A** +// CHECK: %[[RET_CAST:.*]] = addrspacecast %struct.A addrspace(5)* %[[RET]] to %struct.A* +// CHECK: call void @llvm.dbg.declare(metadata %struct.A addrspace(5)* %[[RET]] +// CHECK: call void @_ZN1AC1Ev(%struct.A* nonnull dereferenceable(400) %[[RET_CAST]]) +// CHECK: call void @llvm.dbg.declare(metadata [100 x i32] addrspace(5)* %x +// CHECK: call void @_Z6calleeP1A(%struct.A* %[[RET_CAST]]) +// CHECK: %[[RET_CAST2:.*]] = bitcast %struct.A* %[[RET_CAST]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[RET_CAST2]], i8* align 16 %{{.*}}, i64 400, i1 false) +// CHECK: call void @llvm.dbg.declare(metadata %struct.A* addrspace(5)* %p +// CHECK: store %struct.A* %[[RET_CAST]], %struct.A** %p.ascast +__device__ A func1() { + A a; + int x[100]; + callee(&a); + __builtin_memcpy(&a, x, 400); + A *p = &a; + return a; +} + +// CHECK-LABEL: @_Z6func1av(%struct.B addrspace(5)* noalias sret(%struct.B) align 4 +__device__ B func1a() { + B b; + return b; +} + +// Check returning the return value again. + +// CHECK-LABEL: @_Z5func2v( +// CHECK-SAME: %struct.A addrspace(5)* noalias sret(%struct.A) align 4 %[[RET:.*]]) +// CHECK: %[[CAST1:.*]] = addrspacecast %struct.A addrspace(5)* %[[RET]] to %struct.A* +// CHECK: %[[CAST2:.*]] = addrspacecast %struct.A* %[[CAST1]] to %struct.A addrspace(5)* +// CHECK: call void @_Z5func1v(%struct.A addrspace(5)* sret(%struct.A) align 4 %[[CAST2]]) +__device__ A func2() { + A a = func1(); + return a; +} + +// Check assigning the return value to a global variable. + +// CHECK-LABEL: @_Z5func3v( +// CHECK: %[[RET:.*]] = alloca %struct.B, align 4, addrspace(5) +// CHECK: %[[CAST1:.*]] = addrspacecast %struct.B addrspace(5)* %[[RET]] to %struct.B* +// CHECK: %[[CAST2:.*]] = addrspacecast %struct.B* %[[CAST1]] to %struct.B addrspace(5)* +// CHECK: call void @_Z6func1av(%struct.B addrspace(5)* sret(%struct.B) align 4 %[[CAST2]] +// CHECK: %[[CAST3:.*]] = bitcast %struct.B* %[[CAST1]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64{{.*}}@b{{.*}}%[[CAST3]] +__device__ void func3() { + b = func1a(); +} + +// Check assigning the return value to a temporary variable. + +// CHECK-LABEL: @_Z5func4v( +// CHECK: %[[TMP:.*]] = alloca %struct.A, align 4, addrspace(5) +// CHECK: %[[TMP_CAST1:.*]] = addrspacecast %struct.A addrspace(5)* %[[TMP]] to %struct.A* +// CHECK: %[[RET:.*]] = alloca %struct.A, align 4, addrspace(5) +// CHECK: %[[RET_CAST1:.*]] = addrspacecast %struct.A addrspace(5)* %[[RET]] to %struct.A* +// CHECK: call void @_ZN1AC1Ev(%struct.A* nonnull dereferenceable(400) %[[TMP_CAST1]]) +// CHECK: %[[RET_CAST2:.*]] = addrspacecast %struct.A* %[[RET_CAST1]] to %struct.A addrspace(5)* +// CHECK: call void @_Z5func1v(%struct.A addrspace(5)* sret(%struct.A) align 4 %[[RET_CAST2]] +// CHECK: %[[TMP_CAST2:.*]] = bitcast %struct.A* %[[TMP_CAST1]] to i8* +// CHECK: %[[RET_CAST3:.*]] = bitcast %struct.A* %[[RET_CAST1]] to i8* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64{{.*}}%[[TMP_CAST2]]{{.*}}%[[RET_CAST3]] +__device__ void func4() { + A a; + a = func1(); +}