Index: clang/include/clang/CodeGen/CGFunctionInfo.h =================================================================== --- clang/include/clang/CodeGen/CGFunctionInfo.h +++ clang/include/clang/CodeGen/CGFunctionInfo.h @@ -44,8 +44,8 @@ /// but also emit a zero/sign extension attribute. Extend, - /// Indirect - Pass the argument indirectly via a hidden pointer - /// with the specified alignment (0 indicates default alignment). + /// Indirect - Pass the argument indirectly via a hidden pointer with the + /// specified alignment (0 indicates default alignment) and address space. Indirect, /// Ignore - Ignore the argument (treat as void). Useful for void and @@ -86,6 +86,7 @@ unsigned AllocaFieldIndex; // isInAlloca() }; Kind TheKind; + unsigned IndirectAddrSpace : 24; // isIndirect() bool PaddingInReg : 1; bool InAllocaSRet : 1; // isInAlloca() bool InAllocaIndirect : 1;// isInAlloca() @@ -112,9 +113,10 @@ public: ABIArgInfo(Kind K = Direct) : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K), - PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false), - IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false), - InReg(false), CanBeFlattened(false), SignExt(false) {} + IndirectAddrSpace(0), PaddingInReg(false), InAllocaSRet(false), + InAllocaIndirect(false), IndirectByVal(false), IndirectRealign(false), + SRetAfterThis(false), InReg(false), CanBeFlattened(false), + SignExt(false) {} static ABIArgInfo getDirect(llvm::Type *T = nullptr, unsigned Offset = 0, llvm::Type *Padding = nullptr, @@ -171,19 +173,23 @@ } static ABIArgInfo getIndirect(CharUnits Alignment, bool ByVal = true, bool Realign = false, - llvm::Type *Padding = nullptr) { + llvm::Type *Padding = nullptr, + unsigned AddrSpace = 0) { auto AI = ABIArgInfo(Indirect); AI.setIndirectAlign(Alignment); AI.setIndirectByVal(ByVal); AI.setIndirectRealign(Realign); AI.setSRetAfterThis(false); AI.setPaddingType(Padding); + AI.setIndirectAddrSpace(AddrSpace); return AI; } static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true, - bool Realign = false) { + bool Realign = false, + unsigned AddrSpace = 0) { auto AI = getIndirect(Alignment, ByVal, Realign); AI.setInReg(true); + AI.setIndirectAddrSpace(AddrSpace); return AI; } static ABIArgInfo getInAlloca(unsigned FieldIndex, bool Indirect = false) { @@ -355,6 +361,16 @@ IndirectByVal = IBV; } + unsigned getIndirectAddrSpace() const { + assert(isIndirect() && "Invalid kind!"); + return IndirectAddrSpace; + } + + void setIndirectAddrSpace(unsigned AddrSpace) { + assert(isIndirect() && "Invalid kind!"); + IndirectAddrSpace = AddrSpace; + } + bool getIndirectRealign() const { assert(isIndirect() && "Invalid kind!"); return IndirectRealign; Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1632,10 +1632,8 @@ case ABIArgInfo::Indirect: { assert(NumIRArgs == 1); - // indirect arguments are always on the stack, which is alloca addr space. llvm::Type *LTy = ConvertTypeForMem(it->type); - ArgTypes[FirstIRArg] = LTy->getPointerTo( - CGM.getDataLayout().getAllocaAddrSpace()); + ArgTypes[FirstIRArg] = LTy->getPointerTo(ArgInfo.getIndirectAddrSpace()); break; } @@ -2388,10 +2386,13 @@ Address(Fn->getArg(FirstIRArg), ArgI.getIndirectAlign()); if (!hasScalarEvaluationKind(Ty)) { - // Aggregates and complex variables are accessed by reference. All we - // need to do is realign the value, if requested. + // Aggregates and complex variables are accessed by reference. All we + // need to do is realign the value, if requested. Also, if the address + // isn't on the stack, copy it there since we need a stack address. Address V = ParamAddr; - if (ArgI.getIndirectRealign()) { + if (ArgI.getIndirectRealign() || + ArgI.getIndirectAddrSpace() != + CGM.getDataLayout().getAllocaAddrSpace()) { Address AlignedTemp = CreateMemTemp(Ty, "coerce"); // Copy from the incoming argument pointer to the temporary with the Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -82,14 +82,16 @@ ABIArgInfo ABIInfo::getNaturalAlignIndirect(QualType Ty, bool ByRef, bool Realign, llvm::Type *Padding) const { - return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), - ByRef, Realign, Padding); + return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), ByRef, + Realign, Padding, + getDataLayout().getAllocaAddrSpace()); } ABIArgInfo ABIInfo::getNaturalAlignIndirectInReg(QualType Ty, bool Realign) const { return ABIArgInfo::getIndirectInReg(getContext().getTypeAlignInChars(Ty), - /*ByRef*/ false, Realign); + /*ByRef*/ false, Realign, + getDataLayout().getAllocaAddrSpace()); } Address ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr, @@ -8486,6 +8488,13 @@ /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device)); } + if (!LTy && isAggregateTypeForABI(Ty)) { + return ABIArgInfo::getIndirect( + getContext().getTypeAlignInChars(Ty), true /*ByVal*/, false /*Realign*/, + nullptr /*Padding*/, + getContext().getTargetAddressSpace(LangAS::opencl_constant)); + } + // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults. Index: clang/test/CodeGenCUDA/kernel-args.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-args.cu +++ clang/test/CodeGenCUDA/kernel-args.cu @@ -8,14 +8,14 @@ int a[32]; }; -// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A %x.coerce) +// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x) __global__ void kernel(A x) { } class Kernel { public: - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x) static __global__ void memberKernel(A x){} template static __global__ void templateMemberKernel(T x) {} @@ -29,11 +29,11 @@ void test() { Kernel K; - // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}} // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)templateKernel); - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byval(%struct.A) align 4 %{{.+}} // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)Kernel::templateMemberKernel); } Index: clang/test/CodeGenOpenCL/addr-space-struct-arg.cl =================================================================== --- clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -139,10 +139,13 @@ FuncOneMember(*u); } -// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) -// AMDGCN: store %struct.LargeStructOneMember %u.coerce, %struct.LargeStructOneMember addrspace(5)* %[[U]], align 8 -// AMDGCN: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 %[[U]]) +// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember +// AMDGCN-SAME: (%struct.LargeStructOneMember addrspace(4)* byval(%struct.LargeStructOneMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.LargeStructOneMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.LargeStructOneMember addrspace(4)* [[BYVAL_PTR]] to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 [[U_ALLOCA]]) kernel void KernelLargeOneMember(struct LargeStructOneMember u) { FuncOneLargeMember(u); } @@ -158,20 +161,34 @@ } // AMDGCN-LABEL: define amdgpu_kernel void @KernelTwoMember -// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5) -// AMDGCN: %[[LD0:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* -// AMDGCN: %[[LD1:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN-SAME: (%struct.StructTwoMember addrspace(4)* byval(%struct.StructTwoMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.StructTwoMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.StructTwoMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.StructTwoMember addrspace(4)* [[BYVAL_PTR]] to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 16, i1 false) +// AMDGCN: %[[LD0:.+]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN: %[[LD1:.+]] = load <2 x i32>, <2 x i32> addrspace(5)* // AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]]) kernel void KernelTwoMember(struct StructTwoMember u) { FuncTwoMember(u); } // AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeTwoMember -// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) -// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], %struct.LargeStructTwoMember addrspace(5)* %[[u]] -// AMDGCN: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval(%struct.LargeStructTwoMember) align 8 %[[u]]) +// AMDGCN-SAME: (%struct.LargeStructTwoMember addrspace(4)* byval(%struct.LargeStructTwoMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.LargeStructTwoMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.LargeStructTwoMember addrspace(4)* %{{.+}} to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 480, i1 false) +// AMDGCN: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval(%struct.LargeStructTwoMember) align 8 [[U_ALLOCA]]) kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { FuncLargeTwoMember(u); } + +// Make sure the address of the argument gives the stack copy address, not the kernarg. +// AMDGCN-LABEL: define amdgpu_kernel void @struct_arg_kernarg_address +// AMDGCN: call void @llvm.memcpy.p5i8.p4i8.i64 +// AMDGCN: store volatile %struct.LargeStructOneMember addrspace(5)* %kernarg_struct, %struct.LargeStructOneMember addrspace(5)* addrspace(5)* %x_addr, align 4 +__kernel void struct_arg_kernarg_address(struct LargeStructOneMember kernarg_struct, global int* out, + int idx0, int idx1) { + __private struct LargeStructOneMember* volatile x_addr = &kernarg_struct; +} Index: clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -216,7 +216,7 @@ int w; } struct_4regs; -// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct %s.coerce) +// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct addrspace(4)* nocapture byval(%struct.empty_struct) align 1 {{%.+}}) __kernel void kernel_empty_struct_arg(empty_struct s) { } // CHECK: void @kernel_single_element_struct_arg(i32 %arg1.coerce) @@ -225,28 +225,28 @@ // CHECK: void @kernel_nested_single_element_struct_arg(i32 %arg1.coerce) __kernel void kernel_nested_single_element_struct_arg(nested_single_element_struct_arg_t arg1) { } -// CHECK: void @kernel_struct_arg(%struct.struct_arg %arg1.coerce) +// CHECK: void @kernel_struct_arg(%struct.struct_arg addrspace(4)* nocapture byval(%struct.struct_arg) align 4 {{%.+}}) __kernel void kernel_struct_arg(struct_arg_t arg1) { } -// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg %arg1.coerce) +// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg addrspace(4)* nocapture byval(%struct.struct_padding_arg) align 8 %{{.+}}) __kernel void kernel_struct_padding_arg(struct_padding_arg arg1) { } -// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg %arg1.coerce) +// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg addrspace(4)* nocapture byval(%struct.struct_of_arrays_arg) align 4 %{{.+}}) __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { } -// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce) +// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg addrspace(4)* nocapture byval(%struct.struct_of_structs_arg) align 4 %{{.+}}) __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { } // CHECK: void @test_kernel_transparent_union_arg(%union.transparent_u %u.coerce) __kernel void test_kernel_transparent_union_arg(transparent_u u) { } -// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg addrspace(4)* nocapture byval(%struct.single_array_element_struct_arg) align 4 %{{.+}}) __kernel void kernel_single_array_element_struct_arg(single_array_element_struct_arg_t arg1) { } -// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg addrspace(4)* nocapture byval(%struct.single_struct_element_struct_arg) align 8 %{{.+}}) __kernel void kernel_single_struct_element_struct_arg(single_struct_element_struct_arg_t arg1) { } -// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair %arg1.coerce) +// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair addrspace(4)* nocapture byval(%struct.different_size_type_pair) align 8 %{{.+}}) __kernel void kernel_different_size_type_pair_arg(different_size_type_pair arg1) { } // CHECK: define void @func_f32_arg(float %arg)