Index: lib/CodeGen/CGCall.h =================================================================== --- lib/CodeGen/CGCall.h +++ lib/CodeGen/CGCall.h @@ -172,9 +172,10 @@ RValue RV; QualType Ty; bool NeedsCopy; - CallArg(RValue rv, QualType ty, bool needscopy) - : RV(rv), Ty(ty), NeedsCopy(needscopy) - { } + unsigned AS; /* Address space of indirect argument */ + CallArg(RValue rv, QualType ty, bool needscopy, + unsigned _AS = LangAS::Default) + : RV(rv), Ty(ty), NeedsCopy(needscopy), AS(_AS) {} }; /// CallArgList - Type for representing both the value and type of @@ -204,8 +205,9 @@ llvm::Instruction *IsActiveIP; }; - void add(RValue rvalue, QualType type, bool needscopy = false) { - push_back(CallArg(rvalue, type, needscopy)); + void add(RValue rvalue, QualType type, bool needscopy = false, + unsigned AS = LangAS::Default) { + push_back(CallArg(rvalue, type, needscopy, AS)); } /// Add all the arguments from another CallArgList to this one. After doing Index: lib/CodeGen/CGCall.cpp =================================================================== --- lib/CodeGen/CGCall.cpp +++ lib/CodeGen/CGCall.cpp @@ -3454,6 +3454,12 @@ void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, QualType type) { DisableDebugLocationUpdates Dis(*this, E); + unsigned AS = 0; + if (isa(E) && + cast(E)->getCastKind() == CK_LValueToRValue) { + AS = cast(E)->getSubExpr()->getType().getAddressSpace(); + } + if (const ObjCIndirectCopyRestoreExpr *CRE = dyn_cast(E)) { assert(getLangOpts().ObjCAutoRefCount); @@ -3465,7 +3471,7 @@ if (E->isGLValue()) { assert(E->getObjectKind() == OK_Ordinary); - return args.add(EmitReferenceBindingToExpr(E), type); + return args.add(EmitReferenceBindingToExpr(E), type, false, AS); } bool HasAggregateEvalKind = hasAggregateEvaluationKind(type); @@ -3492,7 +3498,7 @@ EmitAggExpr(E, Slot); RValue RV = Slot.asRValue(); - args.add(RV, type); + args.add(RV, type, false, AS); if (DestroyedInCallee) { // Create a no-op GEP between the placeholder and the cleanup so we can @@ -3512,18 +3518,18 @@ LValue L = EmitLValue(cast(E)->getSubExpr()); assert(L.isSimple()); if (L.getAlignment() >= getContext().getTypeAlignInChars(type)) { - args.add(L.asAggregateRValue(), type, /*NeedsCopy*/true); + args.add(L.asAggregateRValue(), type, /*NeedsCopy*/ true, AS); } else { // We can't represent a misaligned lvalue in the CallArgList, so copy // to an aligned temporary now. Address tmp = CreateMemTemp(type); EmitAggregateCopy(tmp, L.getAddress(), type, L.isVolatile()); - args.add(RValue::getAggregate(tmp), type); + args.add(RValue::getAggregate(tmp), type, false, AS); } return; } - args.add(EmitAnyExprToTemp(E), type); + args.add(EmitAnyExprToTemp(E), type, false, AS); } QualType CodeGenFunction::getVarArgType(const Expr *Arg) { @@ -3819,6 +3825,12 @@ } case ABIArgInfo::Indirect: { + auto CastToAllocaAddrSpace = [&](llvm::Value *V) { + auto *T = V->getType()->getPointerElementType()->getPointerTo( + CGM.getDataLayout().getAllocaAddrSpace()); + return getTargetHooks().performAddrSpaceCast( + *this, V, LangAS::Default, CGM.getASTAllocaAddressSpace(), T, true); + }; assert(NumIRArgs == 1); if (RV.isScalar() || RV.isComplex()) { // Make a temporary alloca to pass the argument. @@ -3836,21 +3848,21 @@ // 2. If the argument is byval, RV is not sufficiently aligned, and // we cannot force it to be sufficiently aligned. // 3. If the argument is byval, but RV is located in an address space - // different than that of the argument (0). + // different than that of the argument (alloca address space). Address Addr = RV.getAggregateAddress(); CharUnits Align = ArgInfo.getIndirectAlign(); const llvm::DataLayout *TD = &CGM.getDataLayout(); - const unsigned RVAddrSpace = Addr.getType()->getAddressSpace(); - const unsigned ArgAddrSpace = - (FirstIRArg < IRFuncTy->getNumParams() - ? IRFuncTy->getParamType(FirstIRArg)->getPointerAddressSpace() - : 0); + const unsigned RVAddrSpace = I->AS; + assert((FirstIRArg >= IRFuncTy->getNumParams() || + IRFuncTy->getParamType(FirstIRArg)->getPointerAddressSpace() == + TD->getAllocaAddrSpace()) && "indirect argument must be in alloca address space"); if ((!ArgInfo.getIndirectByVal() && I->NeedsCopy) || (ArgInfo.getIndirectByVal() && Addr.getAlignment() < Align && llvm::getOrEnforceKnownAlignment(Addr.getPointer(), - Align.getQuantity(), *TD) - < Align.getQuantity()) || - (ArgInfo.getIndirectByVal() && (RVAddrSpace != ArgAddrSpace))) { + Align.getQuantity(), + *TD) < Align.getQuantity()) || + (ArgInfo.getIndirectByVal() && RVAddrSpace != LangAS::Default && + RVAddrSpace != CGM.getASTAllocaAddressSpace())) { // Create an aligned temporary, and copy to it. Address AI = CreateMemTemp(I->Ty, ArgInfo.getIndirectAlign(), "byval-temp", false); @@ -3858,7 +3870,7 @@ EmitAggregateCopy(AI, Addr, I->Ty, RV.isVolatileQualified()); } else { // Skip the extra memcpy call. - IRCallArgs[FirstIRArg] = Addr.getPointer(); + IRCallArgs[FirstIRArg] = CastToAllocaAddrSpace(Addr.getPointer()); } } break; Index: lib/CodeGen/CGDecl.cpp =================================================================== --- lib/CodeGen/CGDecl.cpp +++ lib/CodeGen/CGDecl.cpp @@ -1821,6 +1821,23 @@ llvm::Type *IRTy = ConvertTypeForMem(Ty)->getPointerTo(AS); if (DeclPtr.getType() != IRTy) DeclPtr = Builder.CreateBitCast(DeclPtr, IRTy, D.getName()); + // Indirect argument is in alloca address space, which may be different + // from the default address space. + auto AllocaAS = CGM.getASTAllocaAddressSpace(); + auto *V = DeclPtr.getPointer(); + // TODO: After adding LangAS::opencl_private, change SrcLangAS and + // DestLangAS to opencl_private for OpenCL. + auto SrcLangAS = getLangOpts().OpenCL ? LangAS::Default : AllocaAS; + auto DestLangAS = LangAS::Default; + if (SrcLangAS != DestLangAS) { + assert(getContext().getTargetAddressSpace(SrcLangAS) == + CGM.getDataLayout().getAllocaAddrSpace()); + auto DestAS = getContext().getTargetAddressSpace(DestLangAS); + auto *T = V->getType()->getPointerElementType()->getPointerTo(DestAS); + DeclPtr = Address(getTargetHooks().performAddrSpaceCast( + *this, V, SrcLangAS, DestLangAS, T, true), + DeclPtr.getAlignment()); + } // Push a destructor cleanup for this parameter if the ABI requires it. // Don't push a cleanup in a thunk for a method that will also emit a Index: test/CodeGenCXX/amdgcn-func-arg.cpp =================================================================== --- /dev/null +++ test/CodeGenCXX/amdgcn-func-arg.cpp @@ -0,0 +1,94 @@ +// RUN: %clang_cc1 -O0 -triple amdgcn---amdgiz -emit-llvm %s -o - | FileCheck %s + +class A { +public: + int x; + A():x(0) {} + ~A() {} +}; + +class B { +int x[100]; +}; + +A g_a; +B g_b; + +void func_with_ref_arg(A &a); +void func_with_ref_arg(B &b); + +// CHECK-LABEL: define void @_Z22func_with_indirect_arg1A(%class.A addrspace(5)* %a) +// CHECK: %p = alloca %class.A*, align 8, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.A* addrspace(5)* %p to %class.A** +// CHECK: %[[r0:.+]] = addrspacecast %class.A addrspace(5)* %a to %class.A* +// CHECK: store %class.A* %[[r0]], %class.A** %[[r1]], align 8 +void func_with_indirect_arg(A a) { + A *p = &a; +} + +// CHECK-LABEL: define void @_Z22test_indirect_arg_autov() +// CHECK: %a = alloca %class.A, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.A addrspace(5)* %a to %class.A* +// CHECK: %agg.tmp = alloca %class.A, align 4, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.A addrspace(5)* %agg.tmp to %class.A* +// CHECK: call void @_ZN1AC1Ev(%class.A* %[[r0]]) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r4:.+]] = addrspacecast %class.A* %[[r1]] to %class.A addrspace(5)* +// CHECK: call void @_Z22func_with_indirect_arg1A(%class.A addrspace(5)* %[[r4]]) +// CHECK: call void @_ZN1AD1Ev(%class.A* %[[r1]]) +// CHECK: call void @_Z17func_with_ref_argR1A(%class.A* dereferenceable(4) %[[r0]]) +// CHECK: call void @_ZN1AD1Ev(%class.A* %[[r0]]) +void test_indirect_arg_auto() { + A a; + func_with_indirect_arg(a); + func_with_ref_arg(a); +} + +// CHECK: define void @_Z24test_indirect_arg_globalv() +// CHECK: %agg.tmp = alloca %class.A, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.A addrspace(5)* %agg.tmp to %class.A* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r2:.+]] = addrspacecast %class.A* %[[r0]] to %class.A addrspace(5)* +// CHECK: call void @_Z22func_with_indirect_arg1A(%class.A addrspace(5)* %[[r2]]) +// CHECK: call void @_ZN1AD1Ev(%class.A* %[[r0]]) +// CHECK: call void @_Z17func_with_ref_argR1A(%class.A* dereferenceable(4) addrspacecast (%class.A addrspace(1)* @g_a to %class.A*)) +void test_indirect_arg_global() { + func_with_indirect_arg(g_a); + func_with_ref_arg(g_a); +} + +// CHECK-LABEL: define void @_Z19func_with_byval_arg1B(%class.B addrspace(5)* byval align 4 %b) +// CHECK: %p = alloca %class.B*, align 8, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.B* addrspace(5)* %p to %class.B** +// CHECK: %[[r0:.+]] = addrspacecast %class.B addrspace(5)* %b to %class.B* +// CHECK: store %class.B* %[[r0]], %class.B** %[[r1]], align 8 +void func_with_byval_arg(B b) { + B *p = &b; +} + +// CHECK-LABEL: define void @_Z19test_byval_arg_autov() +// CHECK: %b = alloca %class.B, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.B addrspace(5)* %b to %class.B* +// CHECK: %agg.tmp = alloca %class.B, align 4, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.B addrspace(5)* %agg.tmp to %class.B* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r4:.+]] = addrspacecast %class.B* %[[r1]] to %class.B addrspace(5)* +// CHECK: call void @_Z19func_with_byval_arg1B(%class.B addrspace(5)* byval align 4 %[[r4]]) +// CHECK: call void @_Z17func_with_ref_argR1B(%class.B* dereferenceable(400) %[[r0]]) +void test_byval_arg_auto() { + B b; + func_with_byval_arg(b); + func_with_ref_arg(b); +} + +// CHECK-LABEL: define void @_Z21test_byval_arg_globalv() +// CHECK: %agg.tmp = alloca %class.B, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.B addrspace(5)* %agg.tmp to %class.B* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r2:.+]] = addrspacecast %class.B* %[[r0]] to %class.B addrspace(5)* +// CHECK: call void @_Z19func_with_byval_arg1B(%class.B addrspace(5)* byval align 4 %[[r2]]) +// CHECK: call void @_Z17func_with_ref_argR1B(%class.B* dereferenceable(400) addrspacecast (%class.B addrspace(1)* @g_b to %class.B*)) +void test_byval_arg_global() { + func_with_byval_arg(g_b); + func_with_ref_arg(g_b); +} Index: test/CodeGenOpenCL/addr-space-struct-arg.cl =================================================================== --- test/CodeGenOpenCL/addr-space-struct-arg.cl +++ test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -1,5 +1,6 @@ // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -finclude-default-header -ffake-address-space-map -triple i686-pc-darwin | FileCheck -enable-var-scope -check-prefixes=COM,X86 %s // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -finclude-default-header -triple amdgcn-amdhsa-amd-amdgizcl | FileCheck -enable-var-scope -check-prefixes=COM,AMD %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -finclude-default-header -triple amdgcn-amdhsa-amd-amdgizcl | FileCheck -enable-var-scope -check-prefixes=COM,AMD,AMD20 %s typedef struct { int cells[9]; @@ -35,6 +36,9 @@ int2 y[20]; }; +#if __OPENCL_C_VERSION__ >= 200 +struct LargeStructOneMember g_s; +#endif // X86-LABEL: define void @foo(%struct.Mat4X4* noalias sret %agg.result, %struct.Mat3X3* byval align 4 %in) // AMD-LABEL: define %struct.Mat4X4 @foo([9 x i32] %in.coerce) @@ -80,10 +84,42 @@ } // AMD-LABEL: define void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %u) +// AMD-NOT: addrspacecast +// AMD: store <2 x i32> %{{.*}}, <2 x i32> addrspace(5)* void FuncOneLargeMember(struct LargeStructOneMember u) { u.x[0] = (int2)(0, 0); } +// AMD20-LABEL: define void @test_indirect_arg_globl() +// AMD20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMD20: %[[r0:.*]] = bitcast %struct.LargeStructOneMember addrspace(5)* %[[byval_temp]] to i8 addrspace(5)* +// AMD20: call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* %[[r0]], i8 addrspace(1)* bitcast (%struct.LargeStructOneMember addrspace(1)* @g_s to i8 addrspace(1)*), i64 800, i32 8, i1 false) +// AMD20: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[byval_temp]]) +#if __OPENCL_C_VERSION__ >= 200 +void test_indirect_arg_globl(void) { + FuncOneLargeMember(g_s); +} +#endif + +// AMD-LABEL: define amdgpu_kernel void @test_indirect_arg_local() +// AMD: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMD: %[[r0:.*]] = bitcast %struct.LargeStructOneMember addrspace(5)* %[[byval_temp]] to i8 addrspace(5)* +// AMD: call void @llvm.memcpy.p5i8.p3i8.i64(i8 addrspace(5)* %[[r0]], i8 addrspace(3)* bitcast (%struct.LargeStructOneMember addrspace(3)* @test_indirect_arg_local.l_s to i8 addrspace(3)*), i64 800, i32 8, i1 false) +// AMD: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[byval_temp]]) +kernel void test_indirect_arg_local(void) { + local struct LargeStructOneMember l_s; + FuncOneLargeMember(l_s); +} + +// AMD-LABEL: define void @test_indirect_arg_private() +// AMD: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMD-NOT: @llvm.memcpy +// AMD-NEXT: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[p_s]]) +void test_indirect_arg_private(void) { + struct LargeStructOneMember p_s; + FuncOneLargeMember(p_s); +} + // AMD-LABEL: define amdgpu_kernel void @KernelOneMember // AMD-SAME: (<2 x i32> %[[u_coerce:.*]]) // AMD: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5) @@ -112,7 +148,6 @@ u.y[0] = (int2)(0, 0); } - // AMD-LABEL: define amdgpu_kernel void @KernelTwoMember // AMD-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) // AMD: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5)