Index: clang/include/clang/CodeGen/CGFunctionInfo.h =================================================================== --- clang/include/clang/CodeGen/CGFunctionInfo.h +++ clang/include/clang/CodeGen/CGFunctionInfo.h @@ -44,8 +44,8 @@ /// but also emit a zero/sign extension attribute. Extend, - /// Indirect - Pass the argument indirectly via a hidden pointer - /// with the specified alignment (0 indicates default alignment). + /// Indirect - Pass the argument indirectly via a hidden pointer with the + /// specified alignment (0 indicates default alignment) and address space. Indirect, /// Ignore - Ignore the argument (treat as void). Useful for void and @@ -86,10 +86,12 @@ unsigned AllocaFieldIndex; // isInAlloca() }; Kind TheKind; + unsigned IndirectAddrSpace : 24; // isIndirect() bool PaddingInReg : 1; bool InAllocaSRet : 1; // isInAlloca() bool InAllocaIndirect : 1;// isInAlloca() bool IndirectByVal : 1; // isIndirect() + bool IndirectByRef : 1; // isIndirect() bool IndirectRealign : 1; // isIndirect() bool SRetAfterThis : 1; // isIndirect() bool InReg : 1; // isDirect() || isExtend() || isIndirect() @@ -112,9 +114,10 @@ public: ABIArgInfo(Kind K = Direct) : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K), - PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false), - IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false), - InReg(false), CanBeFlattened(false), SignExt(false) {} + IndirectAddrSpace(0), PaddingInReg(false), InAllocaSRet(false), + InAllocaIndirect(false), IndirectByVal(false), IndirectByRef(false), + IndirectRealign(false), SRetAfterThis(false), InReg(false), + CanBeFlattened(false), SignExt(false) {} static ABIArgInfo getDirect(llvm::Type *T = nullptr, unsigned Offset = 0, llvm::Type *Padding = nullptr, @@ -171,15 +174,32 @@ } static ABIArgInfo getIndirect(CharUnits Alignment, bool ByVal = true, bool Realign = false, - llvm::Type *Padding = nullptr) { + llvm::Type *Padding = nullptr, + unsigned AddrSpace 
= 0) { auto AI = ABIArgInfo(Indirect); AI.setIndirectAlign(Alignment); AI.setIndirectByVal(ByVal); AI.setIndirectRealign(Realign); AI.setSRetAfterThis(false); AI.setPaddingType(Padding); + AI.setIndirectAddrSpace(AddrSpace); return AI; } + + /// Pass this in memory using the IR byref attribute. + static ABIArgInfo getIndirectByRef(CharUnits Alignment, unsigned AddrSpace, + bool Realign = false, + llvm::Type *Padding = nullptr) { + auto AI = ABIArgInfo(Indirect); + AI.setIndirectAlign(Alignment); + AI.setIndirectByRef(true); + AI.setIndirectRealign(Realign); + AI.setSRetAfterThis(false); + AI.setPaddingType(Padding); + AI.setIndirectAddrSpace(AddrSpace); + return AI; + } + static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true, bool Realign = false) { auto AI = getIndirect(Alignment, ByVal, Realign); @@ -355,6 +375,26 @@ IndirectByVal = IBV; } + bool getIndirectByRef() const { + assert(isIndirect() && "Invalid kind!"); + return IndirectByRef; + } + + void setIndirectByRef(bool IBV) { + assert(isIndirect() && "Invalid kind!"); + IndirectByRef = IBV; + } + + unsigned getIndirectAddrSpace() const { + assert(isIndirect() && "Invalid kind!"); + return IndirectAddrSpace; + } + + void setIndirectAddrSpace(unsigned AddrSpace) { + assert(isIndirect() && "Invalid kind!"); + IndirectAddrSpace = AddrSpace; + } + bool getIndirectRealign() const { assert(isIndirect() && "Invalid kind!"); return IndirectRealign; Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -1631,10 +1631,8 @@ case ABIArgInfo::Indirect: { assert(NumIRArgs == 1); - // indirect arguments are always on the stack, which is alloca addr space. 
llvm::Type *LTy = ConvertTypeForMem(it->type); - ArgTypes[FirstIRArg] = LTy->getPointerTo( - CGM.getDataLayout().getAllocaAddrSpace()); + ArgTypes[FirstIRArg] = LTy->getPointerTo(ArgInfo.getIndirectAddrSpace()); break; } @@ -2183,6 +2181,8 @@ if (AI.getIndirectByVal()) Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType)); + else if (AI.getIndirectByRef()) + Attrs.addByRefAttr(getTypes().ConvertTypeForMem(ParamType)); CharUnits Align = AI.getIndirectAlign(); @@ -2200,12 +2200,15 @@ // For now, only add this when we have a byval argument. // TODO: be less lazy about updating test cases. - if (AI.getIndirectByVal()) + if (AI.getIndirectByVal() || AI.getIndirectByRef()) Attrs.addAlignmentAttr(Align.getQuantity()); // byval disables readnone and readonly. - FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly) - .removeAttribute(llvm::Attribute::ReadNone); + if (!AI.getIndirectByRef()) { + FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly) + .removeAttribute(llvm::Attribute::ReadNone); + } + break; } case ABIArgInfo::Ignore: @@ -2440,10 +2443,11 @@ Address(Fn->getArg(FirstIRArg), ArgI.getIndirectAlign()); if (!hasScalarEvaluationKind(Ty)) { - // Aggregates and complex variables are accessed by reference. All we - // need to do is realign the value, if requested. + // Aggregates and complex variables are accessed by reference. All we + // need to do is realign the value, if requested. Also, if the address + // is byref, copy it since the incoming argument may not be mutable. Address V = ParamAddr; - if (ArgI.getIndirectRealign()) { + if (ArgI.getIndirectRealign() || ArgI.getIndirectByRef()) { Address AlignedTemp = CreateMemTemp(Ty, "coerce"); // Copy from the incoming argument pointer to the temporary with the Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8792,18 +8792,26 @@ // TODO: Can we omit empty structs? 
- llvm::Type *LTy = nullptr; if (const Type *SeltTy = isSingleElementStruct(Ty, getContext())) - LTy = CGT.ConvertType(QualType(SeltTy, 0)); + Ty = QualType(SeltTy, 0); + llvm::Type *OrigLTy = CGT.ConvertType(Ty); + llvm::Type *LTy = OrigLTy; if (getContext().getLangOpts().HIP) { - if (!LTy) - LTy = CGT.ConvertType(Ty); LTy = coerceKernelArgumentType( - LTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default), + OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default), /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device)); } + // FIXME: Should use byref when promoting pointers in structs, but this + requires implementing the coercion. + if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) { + return ABIArgInfo::getIndirectByRef( + getContext().getTypeAlignInChars(Ty), + getContext().getTargetAddressSpace(LangAS::opencl_constant), + false /*Realign*/, nullptr /*Padding*/); + } + // If we set CanBeFlattened to true, CodeGen will expand the struct to its // individual elements, which confuses the Clover OpenCL backend; therefore we // have to set it to false here. Other args of getDirect() are just defaults.
Index: clang/test/CodeGenCUDA/kernel-args.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-args.cu +++ clang/test/CodeGenCUDA/kernel-args.cu @@ -8,14 +8,14 @@ int a[32]; }; -// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A %x.coerce) +// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x) __global__ void kernel(A x) { } class Kernel { public: - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}) // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x) static __global__ void memberKernel(A x){} template static __global__ void templateMemberKernel(T x) {} @@ -29,11 +29,11 @@ void test() { Kernel K; - // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}} // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)templateKernel); - // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x.coerce) + // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}} // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x) launch((void*)Kernel::templateMemberKernel); } Index: clang/test/CodeGenOpenCL/addr-space-struct-arg.cl =================================================================== --- clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -94,10 +94,10 @@ } // AMDGCN20-LABEL: define 
void @test_indirect_arg_globl() -// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) -// AMDGCN20: %[[r0:.*]] = bitcast %struct.LargeStructOneMember addrspace(5)* %[[byval_temp]] to i8 addrspace(5)* +// AMDGCN20: %[[byref_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN20: %[[r0:.*]] = bitcast %struct.LargeStructOneMember addrspace(5)* %[[byref_temp]] to i8 addrspace(5)* // AMDGCN20: call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 8 %[[r0]], i8 addrspace(1)* align 8 bitcast (%struct.LargeStructOneMember addrspace(1)* @g_s to i8 addrspace(1)*), i64 800, i1 false) -// AMDGCN20: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 %[[byval_temp]]) +// AMDGCN20: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 %[[byref_temp]]) #if __OPENCL_C_VERSION__ >= 200 void test_indirect_arg_globl(void) { FuncOneLargeMember(g_s); @@ -139,10 +139,13 @@ FuncOneMember(*u); } -// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) -// AMDGCN: store %struct.LargeStructOneMember %u.coerce, %struct.LargeStructOneMember addrspace(5)* %[[U]], align 8 -// AMDGCN: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 %[[U]]) +// AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeOneMember +// AMDGCN-SAME: (%struct.LargeStructOneMember addrspace(4)* byref(%struct.LargeStructOneMember) align 8 [[BYVAL_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.LargeStructOneMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.LargeStructOneMember addrspace(4)* [[BYVAL_PTR]] 
to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(%struct.LargeStructOneMember addrspace(5)* byval(%struct.LargeStructOneMember) align 8 [[U_ALLOCA]]) kernel void KernelLargeOneMember(struct LargeStructOneMember u) { FuncOneLargeMember(u); } @@ -158,20 +161,34 @@ } // AMDGCN-LABEL: define amdgpu_kernel void @KernelTwoMember -// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5) -// AMDGCN: %[[LD0:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* -// AMDGCN: %[[LD1:.*]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN-SAME: (%struct.StructTwoMember addrspace(4)* byref(%struct.StructTwoMember) align 8 [[BYREF_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.StructTwoMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.StructTwoMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.StructTwoMember addrspace(4)* [[BYREF_PTR]] to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 16, i1 false) +// AMDGCN: %[[LD0:.+]] = load <2 x i32>, <2 x i32> addrspace(5)* +// AMDGCN: %[[LD1:.+]] = load <2 x i32>, <2 x i32> addrspace(5)* // AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]]) kernel void KernelTwoMember(struct StructTwoMember u) { FuncTwoMember(u); } // AMDGCN-LABEL: define amdgpu_kernel void @KernelLargeTwoMember -// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) -// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) -// AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], %struct.LargeStructTwoMember addrspace(5)* %[[u]] -// AMDGCN: call void 
@FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval(%struct.LargeStructTwoMember) align 8 %[[u]]) +// AMDGCN-SAME: (%struct.LargeStructTwoMember addrspace(4)* byref(%struct.LargeStructTwoMember) align 8 [[BYREF_PTR:%.+]]) +// AMDGCN: [[U_ALLOCA:%.+]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) +// AMDGCN-NEXT: [[CAST_ALLOCA:%.+]] = bitcast %struct.LargeStructTwoMember addrspace(5)* [[U_ALLOCA]] to i8 addrspace(5)* +// AMDGCN-NEXT: [[CAST_KERNARG:%.+]] = bitcast %struct.LargeStructTwoMember addrspace(4)* %{{.+}} to i8 addrspace(4)* +// AMDGCN-NEXT: call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 8 [[CAST_ALLOCA]], i8 addrspace(4)* align 8 [[CAST_KERNARG]], i64 480, i1 false) +// AMDGCN: call void @FuncLargeTwoMember(%struct.LargeStructTwoMember addrspace(5)* byval(%struct.LargeStructTwoMember) align 8 [[U_ALLOCA]]) kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { FuncLargeTwoMember(u); } + +// Make sure the address of the argument gives the stack copy address, not the kernarg. 
+// AMDGCN-LABEL: define amdgpu_kernel void @struct_arg_kernarg_address +// AMDGCN: call void @llvm.memcpy.p5i8.p4i8.i64 +// AMDGCN: store volatile %struct.LargeStructOneMember addrspace(5)* %kernarg_struct, %struct.LargeStructOneMember addrspace(5)* addrspace(5)* %x_addr, align 4 +__kernel void struct_arg_kernarg_address(struct LargeStructOneMember kernarg_struct, global int* out, + int idx0, int idx1) { + __private struct LargeStructOneMember* volatile x_addr = &kernarg_struct; +} Index: clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -67,7 +67,6 @@ int i2; } struct_of_structs_arg_t; -// CHECK: %union.transparent_u = type { i32 } typedef union { int b1; @@ -216,7 +215,7 @@ int w; } struct_4regs; -// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct %s.coerce) +// CHECK: void @kernel_empty_struct_arg(%struct.empty_struct addrspace(4)* nocapture byref(%struct.empty_struct) align 1 {{%.+}}) __kernel void kernel_empty_struct_arg(empty_struct s) { } // CHECK: void @kernel_single_element_struct_arg(i32 %arg1.coerce) @@ -225,28 +224,28 @@ // CHECK: void @kernel_nested_single_element_struct_arg(i32 %arg1.coerce) __kernel void kernel_nested_single_element_struct_arg(nested_single_element_struct_arg_t arg1) { } -// CHECK: void @kernel_struct_arg(%struct.struct_arg %arg1.coerce) +// CHECK: void @kernel_struct_arg(%struct.struct_arg addrspace(4)* nocapture byref(%struct.struct_arg) align 4 {{%.+}}) __kernel void kernel_struct_arg(struct_arg_t arg1) { } -// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg %arg1.coerce) +// CHECK: void @kernel_struct_padding_arg(%struct.struct_padding_arg addrspace(4)* nocapture byref(%struct.struct_padding_arg) align 8 %{{.+}}) __kernel void kernel_struct_padding_arg(struct_padding_arg arg1) { } -// CHECK: void 
@kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg %arg1.coerce) +// CHECK: void @kernel_test_struct_of_arrays_arg(%struct.struct_of_arrays_arg addrspace(4)* nocapture byref(%struct.struct_of_arrays_arg) align 4 %{{.+}}) __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { } -// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce) +// CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg addrspace(4)* nocapture byref(%struct.struct_of_structs_arg) align 4 %{{.+}}) __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { } -// CHECK: void @test_kernel_transparent_union_arg(%union.transparent_u %u.coerce) +// CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce) __kernel void test_kernel_transparent_union_arg(transparent_u u) { } -// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg addrspace(4)* nocapture byref(%struct.single_array_element_struct_arg) align 4 %{{.+}}) __kernel void kernel_single_array_element_struct_arg(single_array_element_struct_arg_t arg1) { } -// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg %arg1.coerce) +// CHECK: void @kernel_single_struct_element_struct_arg(%struct.single_struct_element_struct_arg addrspace(4)* nocapture byref(%struct.single_struct_element_struct_arg) align 8 %{{.+}}) __kernel void kernel_single_struct_element_struct_arg(single_struct_element_struct_arg_t arg1) { } -// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair %arg1.coerce) +// CHECK: void @kernel_different_size_type_pair_arg(%struct.different_size_type_pair addrspace(4)* nocapture byref(%struct.different_size_type_pair) align 8 %{{.+}}) __kernel void kernel_different_size_type_pair_arg(different_size_type_pair arg1) { } // 
CHECK: define void @func_f32_arg(float %arg)