Index: lib/CodeGen/CGAtomic.cpp =================================================================== --- lib/CodeGen/CGAtomic.cpp +++ lib/CodeGen/CGAtomic.cpp @@ -1160,7 +1160,7 @@ if (UseOptimizedLibcall && Res.getScalarVal()) { llvm::Value *ResVal = Res.getScalarVal(); if (PostOp) { - llvm::Value *LoadVal1 = Args[1].RV.getScalarVal(); + llvm::Value *LoadVal1 = Args[1].getRValue().getScalarVal(); ResVal = Builder.CreateBinOp(PostOp, ResVal, LoadVal1); } if (E->getOp() == AtomicExpr::AO__atomic_nand_fetch) Index: lib/CodeGen/CGCall.h =================================================================== --- lib/CodeGen/CGCall.h +++ lib/CodeGen/CGCall.h @@ -213,12 +213,28 @@ }; struct CallArg { - RValue RV; + private: + union { + RValue RV; + LValue LV; /// The l-value from which the argument is derived. + }; + bool HasLV; + + public: QualType Ty; - bool NeedsCopy; - CallArg(RValue rv, QualType ty, bool needscopy) - : RV(rv), Ty(ty), NeedsCopy(needscopy) - { } + CallArg(RValue rv, QualType ty) : RV(rv), HasLV(false), Ty(ty) {} + CallArg(LValue _LV, QualType ty) : LV(_LV), HasLV(true), Ty(ty) {} + bool hasLValue() const { return HasLV; } + QualType getType() const { return Ty; } + RValue getRValue() const; + LValue getLValue() const { + assert(HasLV); + return LV; + } + void setRValue(RValue _RV) { + assert(!HasLV); + RV = _RV; + } }; /// CallArgList - Type for representing both the value and type of @@ -248,8 +264,10 @@ llvm::Instruction *IsActiveIP; }; - void add(RValue rvalue, QualType type, bool needscopy = false) { - push_back(CallArg(rvalue, type, needscopy)); + void add(RValue rvalue, QualType type) { push_back(CallArg(rvalue, type)); } + + void addUncopiedAggregate(LValue LV, QualType type) { + push_back(CallArg(LV, type)); } /// Add all the arguments from another CallArgList to this one. After doing Index: lib/CodeGen/CGCall.cpp =================================================================== --- lib/CodeGen/CGCall.cpp +++ lib/CodeGen/CGCall.cpp @@ -3391,7 +3391,7 @@ assert(InitialArgSize + 1 == Args.size() && "The code below depends on only adding one arg per EmitCallArg"); (void)InitialArgSize; - RValue RVArg = Args.back().RV; + RValue RVArg = Args.back().getRValue(); EmitNonNullArgCheck(RVArg, ArgTypes[Idx], (*Arg)->getExprLoc(), AC, ParamsToSkip + Idx); // @llvm.objectsize should never have side-effects and shouldn't need @@ -3439,6 +3439,12 @@ } // end anonymous namespace +RValue CallArg::getRValue() const { + if (!HasLV) + return RV; + return LV.asAggregateRValue(); +} + void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E, QualType type) { DisableDebugLocationUpdates Dis(*this, E); @@ -3499,15 +3505,7 @@ cast(E)->getCastKind() == CK_LValueToRValue) { LValue L = EmitLValue(cast(E)->getSubExpr()); assert(L.isSimple()); - if (L.getAlignment() >= getContext().getTypeAlignInChars(type)) { - args.add(L.asAggregateRValue(), type, /*NeedsCopy*/true); - } else { - // We can't represent a misaligned lvalue in the CallArgList, so copy - // to an aligned temporary now. - LValue Dest = MakeAddrLValue(CreateMemTemp(type), type); - EmitAggregateCopy(Dest, L, type, L.isVolatile()); - args.add(RValue::getAggregate(Dest.getAddress()), type); - } + args.addUncopiedAggregate(L, type); return; } @@ -3767,7 +3765,18 @@ for (CallArgList::const_iterator I = CallArgs.begin(), E = CallArgs.end(); I != E; ++I, ++info_it, ++ArgNo) { const ABIArgInfo &ArgInfo = info_it->info; - RValue RV = I->RV; + RValue RV; + if (I->hasLValue() && I->getLValue().getAlignment() < + getContext().getTypeAlignInChars(I->Ty)) { + // We can't represent a misaligned lvalue in the CallArgList, so copy + // to an aligned temporary now. + LValue Dest = MakeAddrLValue(CreateMemTemp(I->Ty), I->Ty); + EmitAggregateCopy(Dest, I->getLValue(), I->Ty, + I->getLValue().isVolatile()); + RV = RValue::getAggregate(Dest.getAddress()); + } else { + RV = I->getRValue(); + } // Insert a padding argument to ensure proper alignment. if (IRFunctionArgs.hasPaddingArg(ArgNo)) @@ -3823,22 +3832,29 @@ // source. (This case doesn't occur on any common architecture.) // 2. If the argument is byval, RV is not sufficiently aligned, and // we cannot force it to be sufficiently aligned. - // 3. If the argument is byval, but RV is located in an address space - // different than that of the argument (0). + // 3. If the argument is byval, but RV is not located in default + // or alloca address space. Address Addr = RV.getAggregateAddress(); CharUnits Align = ArgInfo.getIndirectAlign(); const llvm::DataLayout *TD = &CGM.getDataLayout(); - const unsigned RVAddrSpace = Addr.getType()->getAddressSpace(); - const unsigned ArgAddrSpace = - (FirstIRArg < IRFuncTy->getNumParams() - ? IRFuncTy->getParamType(FirstIRArg)->getPointerAddressSpace() - : 0); - if ((!ArgInfo.getIndirectByVal() && I->NeedsCopy) || - (ArgInfo.getIndirectByVal() && Addr.getAlignment() < Align && - llvm::getOrEnforceKnownAlignment(Addr.getPointer(), - Align.getQuantity(), *TD) - < Align.getQuantity()) || - (ArgInfo.getIndirectByVal() && (RVAddrSpace != ArgAddrSpace))) { + + assert((FirstIRArg >= IRFuncTy->getNumParams() || + IRFuncTy->getParamType(FirstIRArg)->getPointerAddressSpace() == + TD->getAllocaAddrSpace()) && + "indirect argument must be in alloca address space"); + if (I->hasLValue() && + ((!ArgInfo.getIndirectByVal() && + (I->getLValue().getAlignment() >= + getContext().getTypeAlignInChars(I->Ty))) || + (ArgInfo.getIndirectByVal() && + ((I->getLValue().getAddressSpace() != LangAS::Default && + I->getLValue().getAddressSpace() != LangAS::opencl_private && + I->getLValue().getAddressSpace() != + CGM.getASTAllocaAddressSpace()) || + (Addr.getAlignment() < Align && + llvm::getOrEnforceKnownAlignment(Addr.getPointer(), + Align.getQuantity(), *TD) < + Align.getQuantity()))))) { // Create an aligned temporary, and copy to it. Address AI = CreateMemTemp(I->Ty, ArgInfo.getIndirectAlign(), "byval-temp", false); @@ -3848,7 +3864,12 @@ EmitAggregateCopy(Dest, Src, I->Ty, RV.isVolatileQualified()); } else { // Skip the extra memcpy call. - IRCallArgs[FirstIRArg] = Addr.getPointer(); + auto *V = Addr.getPointer(); + auto *T = V->getType()->getPointerElementType()->getPointerTo( + CGM.getDataLayout().getAllocaAddrSpace()); + IRCallArgs[FirstIRArg] = getTargetHooks().performAddrSpaceCast( + *this, V, LangAS::Default, CGM.getASTAllocaAddressSpace(), T, + true); } } break; @@ -4358,7 +4379,7 @@ OffsetValue); } else if (const auto *AA = TargetDecl->getAttr()) { llvm::Value *ParamVal = - CallArgs[AA->getParamIndex() - 1].RV.getScalarVal(); + CallArgs[AA->getParamIndex() - 1].getRValue().getScalarVal(); EmitAlignmentAssumption(Ret.getScalarVal(), ParamVal); } } Index: lib/CodeGen/CGClass.cpp =================================================================== --- lib/CodeGen/CGClass.cpp +++ lib/CodeGen/CGClass.cpp @@ -2077,7 +2077,8 @@ assert(Args.size() == 2 && "unexpected argcount for trivial ctor"); QualType SrcTy = D->getParamDecl(0)->getType().getNonReferenceType(); - Address Src(Args[1].RV.getScalarVal(), getNaturalTypeAlignment(SrcTy)); + Address Src(Args[1].getRValue().getScalarVal(), + getNaturalTypeAlignment(SrcTy)); LValue SrcLVal = MakeAddrLValue(Src, SrcTy); QualType DestTy = getContext().getTypeDeclType(ClassDecl); LValue DestLVal = MakeAddrLValue(This, DestTy); @@ -2131,8 +2132,7 @@ const CXXConstructorDecl *D, bool ForVirtualBase, Address This, bool InheritedFromVBase, const CXXInheritedCtorInitExpr *E) { CallArgList Args; - CallArg ThisArg(RValue::get(This.getPointer()), D->getThisType(getContext()), - /*NeedsCopy=*/false); + CallArg ThisArg(RValue::get(This.getPointer()), D->getThisType(getContext())); // Forward the parameters. if (InheritedFromVBase && @@ -2196,7 +2196,7 @@ assert(Args.size() >= Params.size() && "too few arguments for call"); for (unsigned I = 0, N = Args.size(); I != N; ++I) { if (I < Params.size() && isa(Params[I])) { - const RValue &RV = Args[I].RV; + const RValue &RV = Args[I].getRValue(); assert(!RV.isComplex() && "complex indirect params not supported"); ParamValue Val = RV.isScalar() ? ParamValue::forDirect(RV.getScalarVal()) Index: lib/CodeGen/CGDecl.cpp =================================================================== --- lib/CodeGen/CGDecl.cpp +++ lib/CodeGen/CGDecl.cpp @@ -1866,6 +1866,22 @@ llvm::Type *IRTy = ConvertTypeForMem(Ty)->getPointerTo(AS); if (DeclPtr.getType() != IRTy) DeclPtr = Builder.CreateBitCast(DeclPtr, IRTy, D.getName()); + // Indirect argument is in alloca address space, which may be different + // from the default address space. + auto AllocaAS = CGM.getASTAllocaAddressSpace(); + auto *V = DeclPtr.getPointer(); + auto SrcLangAS = getLangOpts().OpenCL ? LangAS::opencl_private : AllocaAS; + auto DestLangAS = + getLangOpts().OpenCL ? LangAS::opencl_private : LangAS::Default; + if (SrcLangAS != DestLangAS) { + assert(getContext().getTargetAddressSpace(SrcLangAS) == + CGM.getDataLayout().getAllocaAddrSpace()); + auto DestAS = getContext().getTargetAddressSpace(DestLangAS); + auto *T = V->getType()->getPointerElementType()->getPointerTo(DestAS); + DeclPtr = Address(getTargetHooks().performAddrSpaceCast( + *this, V, SrcLangAS, DestLangAS, T, true), + DeclPtr.getAlignment()); + } // Push a destructor cleanup for this parameter if the ABI requires it. // Don't push a cleanup in a thunk for a method that will also emit a Index: lib/CodeGen/CGExprCXX.cpp =================================================================== --- lib/CodeGen/CGExprCXX.cpp +++ lib/CodeGen/CGExprCXX.cpp @@ -265,7 +265,7 @@ // when it isn't necessary; just produce the proper effect here. LValue RHS = isa(CE) ? MakeNaturalAlignAddrLValue( - (*RtlArgs)[0].RV.getScalarVal(), + (*RtlArgs)[0].getRValue().getScalarVal(), (*(CE->arg_begin() + 1))->getType()) : EmitLValue(*CE->arg_begin()); EmitAggregateAssign(This, RHS, CE->getType()); @@ -1490,7 +1490,7 @@ AllocAlign); for (unsigned I = 0, N = E->getNumPlacementArgs(); I != N; ++I) { auto &Arg = NewArgs[I + NumNonPlacementArgs]; - Cleanup->setPlacementArg(I, Arg.RV, Arg.Ty); + Cleanup->setPlacementArg(I, Arg.getRValue(), Arg.Ty); } return; @@ -1521,8 +1521,8 @@ AllocAlign); for (unsigned I = 0, N = E->getNumPlacementArgs(); I != N; ++I) { auto &Arg = NewArgs[I + NumNonPlacementArgs]; - Cleanup->setPlacementArg(I, DominatingValue::save(CGF, Arg.RV), - Arg.Ty); + Cleanup->setPlacementArg( + I, DominatingValue::save(CGF, Arg.getRValue()), Arg.Ty); } CGF.initFullExprCleanup(); Index: lib/CodeGen/CGGPUBuiltin.cpp =================================================================== --- lib/CodeGen/CGGPUBuiltin.cpp +++ lib/CodeGen/CGGPUBuiltin.cpp @@ -84,7 +84,7 @@ // We don't know how to emit non-scalar varargs. if (std::any_of(Args.begin() + 1, Args.end(), - [](const CallArg &A) { return !A.RV.isScalar(); })) { + [](const CallArg &A) { return !A.getRValue().isScalar(); })) { CGM.ErrorUnsupported(E, "non-scalar arg to printf"); return RValue::get(llvm::ConstantInt::get(IntTy, 0)); } @@ -97,7 +97,7 @@ } else { llvm::SmallVector ArgTypes; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) - ArgTypes.push_back(Args[I].RV.getScalarVal()->getType()); + ArgTypes.push_back(Args[I].getRValue().getScalarVal()->getType()); // Using llvm::StructType is correct only because printf doesn't accept // aggregates. If we had to handle aggregates here, we'd have to manually @@ -109,7 +109,7 @@ for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { llvm::Value *P = Builder.CreateStructGEP(AllocaTy, Alloca, I - 1); - llvm::Value *Arg = Args[I].RV.getScalarVal(); + llvm::Value *Arg = Args[I].getRValue().getScalarVal(); Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlignment(Arg->getType())); } BufferPtr = Builder.CreatePointerCast(Alloca, llvm::Type::getInt8PtrTy(Ctx)); @@ -117,6 +117,6 @@ // Invoke vprintf and return. llvm::Function* VprintfFunc = GetVprintfDeclaration(CGM.getModule()); - return RValue::get( - Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr})); + return RValue::get(Builder.CreateCall( + VprintfFunc, {Args[0].getRValue().getScalarVal(), BufferPtr})); } Index: lib/CodeGen/CGObjCGNU.cpp =================================================================== --- lib/CodeGen/CGObjCGNU.cpp +++ lib/CodeGen/CGObjCGNU.cpp @@ -1456,7 +1456,7 @@ } // Reset the receiver in case the lookup modified it - ActualArgs[0] = CallArg(RValue::get(Receiver), ASTIdTy, false); + ActualArgs[0] = CallArg(RValue::get(Receiver), ASTIdTy); imp = EnforceType(Builder, imp, MSI.MessengerType); Index: lib/CodeGen/CGObjCMac.cpp =================================================================== --- lib/CodeGen/CGObjCMac.cpp +++ lib/CodeGen/CGObjCMac.cpp @@ -1708,7 +1708,7 @@ e = Method->param_end(); i != e; ++i, ++I) { const ParmVarDecl *ParamDecl = (*i); if (ParamDecl->hasAttr()) { - RValue RV = I->RV; + RValue RV = I->getRValue(); assert(RV.isScalar() && "NullReturnState::complete - arg not on object"); CGF.EmitARCRelease(RV.getScalarVal(), ARCImpreciseLifetime); @@ -7075,7 +7075,7 @@ CGF.getPointerAlign()); // Update the message ref argument. - args[1].RV = RValue::get(mref.getPointer()); + args[1].setRValue(RValue::get(mref.getPointer())); // Load the function to call from the message ref table. Address calleeAddr = Index: lib/CodeGen/ItaniumCXXABI.cpp =================================================================== --- lib/CodeGen/ItaniumCXXABI.cpp +++ lib/CodeGen/ItaniumCXXABI.cpp @@ -1479,8 +1479,7 @@ llvm::Value *VTT = CGF.GetVTTParameter(GlobalDecl(D, Type), ForVirtualBase, Delegating); QualType VTTTy = getContext().getPointerType(getContext().VoidPtrTy); - Args.insert(Args.begin() + 1, - CallArg(RValue::get(VTT), VTTTy, /*needscopy=*/false)); + Args.insert(Args.begin() + 1, CallArg(RValue::get(VTT), VTTTy)); return AddedStructorArgs::prefix(1); // Added one arg. } Index: lib/CodeGen/MicrosoftCXXABI.cpp =================================================================== --- lib/CodeGen/MicrosoftCXXABI.cpp +++ lib/CodeGen/MicrosoftCXXABI.cpp @@ -1538,8 +1538,7 @@ } RValue RV = RValue::get(MostDerivedArg); if (FPT->isVariadic()) { - Args.insert(Args.begin() + 1, - CallArg(RV, getContext().IntTy, /*needscopy=*/false)); + Args.insert(Args.begin() + 1, CallArg(RV, getContext().IntTy)); return AddedStructorArgs::prefix(1); } Args.add(RV, getContext().IntTy); Index: test/CodeGenCXX/amdgcn-func-arg.cpp =================================================================== --- /dev/null +++ test/CodeGenCXX/amdgcn-func-arg.cpp @@ -0,0 +1,94 @@ +// RUN: %clang_cc1 -O0 -triple amdgcn---amdgiz -emit-llvm %s -o - | FileCheck %s + +class A { +public: + int x; + A():x(0) {} + ~A() {} +}; + +class B { +int x[100]; +}; + +A g_a; +B g_b; + +void func_with_ref_arg(A &a); +void func_with_ref_arg(B &b); + +// CHECK-LABEL: define void @_Z22func_with_indirect_arg1A(%class.A addrspace(5)* %a) +// CHECK: %p = alloca %class.A*, align 8, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.A* addrspace(5)* %p to %class.A** +// CHECK: %[[r0:.+]] = addrspacecast %class.A addrspace(5)* %a to %class.A* +// CHECK: store %class.A* %[[r0]], %class.A** %[[r1]], align 8 +void func_with_indirect_arg(A a) { + A *p = &a; +} + +// CHECK-LABEL: define void @_Z22test_indirect_arg_autov() +// CHECK: %a = alloca %class.A, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.A addrspace(5)* %a to %class.A* +// CHECK: %agg.tmp = alloca %class.A, align 4, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.A addrspace(5)* %agg.tmp to %class.A* +// CHECK: call void @_ZN1AC1Ev(%class.A* %[[r0]]) +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r4:.+]] = addrspacecast %class.A* %[[r1]] to %class.A addrspace(5)* +// CHECK: call void @_Z22func_with_indirect_arg1A(%class.A addrspace(5)* %[[r4]]) +// CHECK: call void @_ZN1AD1Ev(%class.A* %[[r1]]) +// CHECK: call void @_Z17func_with_ref_argR1A(%class.A* dereferenceable(4) %[[r0]]) +// CHECK: call void @_ZN1AD1Ev(%class.A* %[[r0]]) +void test_indirect_arg_auto() { + A a; + func_with_indirect_arg(a); + func_with_ref_arg(a); +} + +// CHECK: define void @_Z24test_indirect_arg_globalv() +// CHECK: %agg.tmp = alloca %class.A, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.A addrspace(5)* %agg.tmp to %class.A* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r2:.+]] = addrspacecast %class.A* %[[r0]] to %class.A addrspace(5)* +// CHECK: call void @_Z22func_with_indirect_arg1A(%class.A addrspace(5)* %[[r2]]) +// CHECK: call void @_ZN1AD1Ev(%class.A* %[[r0]]) +// CHECK: call void @_Z17func_with_ref_argR1A(%class.A* dereferenceable(4) addrspacecast (%class.A addrspace(1)* @g_a to %class.A*)) +void test_indirect_arg_global() { + func_with_indirect_arg(g_a); + func_with_ref_arg(g_a); +} + +// CHECK-LABEL: define void @_Z19func_with_byval_arg1B(%class.B addrspace(5)* byval align 4 %b) +// CHECK: %p = alloca %class.B*, align 8, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.B* addrspace(5)* %p to %class.B** +// CHECK: %[[r0:.+]] = addrspacecast %class.B addrspace(5)* %b to %class.B* +// CHECK: store %class.B* %[[r0]], %class.B** %[[r1]], align 8 +void func_with_byval_arg(B b) { + B *p = &b; +} + +// CHECK-LABEL: define void @_Z19test_byval_arg_autov() +// CHECK: %b = alloca %class.B, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.B addrspace(5)* %b to %class.B* +// CHECK: %agg.tmp = alloca %class.B, align 4, addrspace(5) +// CHECK: %[[r1:.+]] = addrspacecast %class.B addrspace(5)* %agg.tmp to %class.B* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r4:.+]] = addrspacecast %class.B* %[[r1]] to %class.B addrspace(5)* +// CHECK: call void @_Z19func_with_byval_arg1B(%class.B addrspace(5)* byval align 4 %[[r4]]) +// CHECK: call void @_Z17func_with_ref_argR1B(%class.B* dereferenceable(400) %[[r0]]) +void test_byval_arg_auto() { + B b; + func_with_byval_arg(b); + func_with_ref_arg(b); +} + +// CHECK-LABEL: define void @_Z21test_byval_arg_globalv() +// CHECK: %agg.tmp = alloca %class.B, align 4, addrspace(5) +// CHECK: %[[r0:.+]] = addrspacecast %class.B addrspace(5)* %agg.tmp to %class.B* +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64 +// CHECK: %[[r2:.+]] = addrspacecast %class.B* %[[r0]] to %class.B addrspace(5)* +// CHECK: call void @_Z19func_with_byval_arg1B(%class.B addrspace(5)* byval align 4 %[[r2]]) +// CHECK: call void @_Z17func_with_ref_argR1B(%class.B* dereferenceable(400) addrspacecast (%class.B addrspace(1)* @g_b to %class.B*)) +void test_byval_arg_global() { + func_with_byval_arg(g_b); + func_with_ref_arg(g_b); +} Index: test/CodeGenOpenCL/addr-space-struct-arg.cl =================================================================== --- test/CodeGenOpenCL/addr-space-struct-arg.cl +++ test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -1,5 +1,6 @@ // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -finclude-default-header -ffake-address-space-map -triple i686-pc-darwin | FileCheck -enable-var-scope -check-prefixes=COM,X86 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -finclude-default-header -triple amdgcn-amdhsa-amd-amdgizcl | FileCheck -enable-var-scope -check-prefixes=COM,AMD %s +// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -finclude-default-header -triple amdgcn-amdhsa-amd | FileCheck -enable-var-scope -check-prefixes=COM,AMDGCN %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -finclude-default-header -triple amdgcn-amdhsa-amd | FileCheck -enable-var-scope -check-prefixes=COM,AMDGCN,AMDGCN20 %s typedef struct { int cells[9]; @@ -35,9 +36,12 @@ int2 y[20]; }; +#if __OPENCL_C_VERSION__ >= 200 +struct LargeStructOneMember g_s; +#endif // X86-LABEL: define void @foo(%struct.Mat4X4* noalias sret %agg.result, %struct.Mat3X3* byval align 4 %in) -// AMD-LABEL: define %struct.Mat4X4 @foo([9 x i32] %in.coerce) +// AMDGCN-LABEL: define %struct.Mat4X4 @foo([9 x i32] %in.coerce) Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { Mat4X4 out; return out; @@ -49,15 +53,15 @@ // X86: call void @llvm.memcpy.p0i8.p1i8.i32(i8* // X86: call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* -// AMD: load [9 x i32], [9 x i32] addrspace(1)* -// AMD: call %struct.Mat4X4 @foo([9 x i32] -// AMD: call void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* +// AMDGCN: load [9 x i32], [9 x i32] addrspace(1)* +// AMDGCN: call %struct.Mat4X4 @foo([9 x i32] +// AMDGCN: call void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { out[0] = foo(in[1]); } // X86-LABEL: define void @foo_large(%struct.Mat64X64* noalias sret %agg.result, %struct.Mat32X32* byval align 4 %in) -// AMD-LABEL: define void @foo_large(%struct.Mat64X64 addrspace(5)* noalias sret %agg.result, %struct.Mat32X32 addrspace(5)* byval align 4 %in) +// AMDGCN-LABEL: define void @foo_large(%struct.Mat64X64 addrspace(5)* noalias sret %agg.result, %struct.Mat32X32 addrspace(5)* byval align 4 %in) Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { Mat64X64 out; return out; @@ -68,66 +72,97 @@ // the return value. // X86: call void @llvm.memcpy.p0i8.p1i8.i32(i8* // X86: call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* -// AMD: call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* -// AMD: call void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* +// AMDGCN: call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* +// AMDGCN: call void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { out[0] = foo_large(in[1]); } -// AMD-LABEL: define void @FuncOneMember(<2 x i32> %u.coerce) +// AMDGCN-LABEL: define void @FuncOneMember(<2 x i32> %u.coerce) void FuncOneMember(struct StructOneMember u) { u.x = (int2)(0, 0); } -// AMD-LABEL: define void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %u) +// AMDGCN-LABEL: define void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %u) +// AMDGCN-NOT: addrspacecast +// AMDGCN: store <2 x i32> %{{.*}}, <2 x i32> addrspace(5)* void FuncOneLargeMember(struct LargeStructOneMember u) { u.x[0] = (int2)(0, 0); } -// AMD-LABEL: define amdgpu_kernel void @KernelOneMember -// AMD-SAME: (<2 x i32> %[[u_coerce:.*]]) -// AMD: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5) -// AMD: %[[coerce_dive:.*]] = getelementptr inbounds %struct.StructOneMember, %struct.StructOneMember addrspace(5)* %[[u]], i32 0, i32 0 -// AMD: store <2 x i32> %[[u_coerce]], <2 x i32> addrspace(5)* %[[coerce_dive]] -// AMD: call void @FuncOneMember(<2 x i32> +// AMDGCN20-LABEL: define void @test_indirect_arg_globl() +// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN20: %[[r0:.*]] = bitcast %struct.LargeStructOneMember addrspace(5)* %[[byval_temp]] to i8 addrspace(5)* +// AMDGCN20: call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 8 %[[r0]], i8 addrspace(1)* align 8 bitcast (%struct.LargeStructOneMember addrspace(1)* @g_s to i8 addrspace(1)*), i64 800, i1 false) +// AMDGCN20: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[byval_temp]]) +#if __OPENCL_C_VERSION__ >= 200 +void test_indirect_arg_globl(void) { + FuncOneLargeMember(g_s); +} +#endif + +// AMDGCN-LABEL: define amdgpu_kernel void @test_indirect_arg_local() +// AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN: %[[r0:.*]] = bitcast %struct.LargeStructOneMember addrspace(5)* %[[byval_temp]] to i8 addrspace(5)* +// AMDGCN: call void @llvm.memcpy.p5i8.p3i8.i64(i8 addrspace(5)* align 8 %[[r0]], i8 addrspace(3)* align 8 bitcast (%struct.LargeStructOneMember addrspace(3)* @test_indirect_arg_local.l_s to i8 addrspace(3)*), i64 800, i1 false) +// AMDGCN: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[byval_temp]]) +kernel void test_indirect_arg_local(void) { + local struct LargeStructOneMember l_s; + FuncOneLargeMember(l_s); +} + +// AMDGCN-LABEL: define void @test_indirect_arg_private() +// AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN-NOT: @llvm.memcpy +// AMDGCN-NEXT: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[p_s]]) +void test_indirect_arg_private(void) { + struct LargeStructOneMember p_s; + FuncOneLargeMember(p_s); +} + +// AMDGCN-LABEL: define amdgpu_kernel void @KernelOneMember +// AMDGCN-SAME: (<2 x i32> %[[u_coerce:.*]]) +// AMDGCN: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5) +// AMDGCN: %[[coerce_dive:.*]] = getelementptr inbounds %struct.StructOneMember, %struct.StructOneMember addrspace(5)* %[[u]], i32 0, i32 0 +// AMDGCN: store <2 x i32> %[[u_coerce]], <2 x i32> addrspace(5)* %[[coerce_dive]] +// AMDGCN: call void @FuncOneMember(<2 x i32> kernel void KernelOneMember(struct StructOneMember u) { FuncOneMember(u); } -// AMD-LABEL: define amdgpu_kernel void @KernelLargeOneMember( -// AMD: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) -// AMD: store %struct.LargeStructOneMember %u.coerce, %struct.LargeStructOneMember addrspace(5)* %[[U]], align 8 -// AMD: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[U]]) +// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN: store %struct.LargeStructOneMember %u.coerce, %struct.LargeStructOneMember addrspace(5)* %[[U]], align 8 +// AMDGCN: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval align 8 %[[U]]) kernel void KernelLargeOneMember(struct LargeStructOneMember u) { FuncOneLargeMember(u); } -// AMD-LABEL: define void @FuncTwoMember(<2 x i32> %u.coerce0, <2 x i32> %u.coerce1) +// AMDGCN-LABEL: define void @FuncTwoMember(<2 x i32> %u.coerce0, <2 x i32> %u.coerce1) void FuncTwoMember(struct StructTwoMember u) { u.y = (int2)(0, 0); } -// AMD-LABEL: define void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval align 8 %u) +// AMDGCN-LABEL: define void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval align 8 %u) void FuncLargeTwoMember(struct LargeStructTwoMember u) { u.y[0] = (int2)(0, 0); } - -// AMD-LABEL: define amdgpu_kernel void @KernelTwoMember -// AMD-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) -// AMD: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5) -// AMD: %[[LD0:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* -// AMD: %[[LD1:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* -// AMD: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]]) +// AMDGCN-LABEL: define amdgpu_kernel void @KernelTwoMember +// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) +// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5) +// AMDGCN: %[[LD0:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN: %[[LD1:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]]) kernel void KernelTwoMember(struct StructTwoMember u) { FuncTwoMember(u); } -// AMD-LABEL: define amdgpu_kernel void @KernelLargeTwoMember -// AMD-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) -// AMD: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) -// AMD: store %struct.LargeStructTwoMember %[[u_coerce]], %struct.LargeStructTwoMember addrspace(5)* %[[u]] -// AMD: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval align 8 %[[u]]) +// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeTwoMember +// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) +// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) +// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], %struct.LargeStructTwoMember addrspace(5)* %[[u]] +// AMDGCN: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval align 8 %[[u]]) kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { FuncLargeTwoMember(u); } Index: test/CodeGenOpenCL/byval.cl =================================================================== --- test/CodeGenOpenCL/byval.cl +++ test/CodeGenOpenCL/byval.cl @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn %s | FileCheck %s -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn---opencl %s | FileCheck %s struct A { int x[100];