Index: clang/docs/ReleaseNotes.rst =================================================================== --- clang/docs/ReleaseNotes.rst +++ clang/docs/ReleaseNotes.rst @@ -193,6 +193,10 @@ AMDGPU Support ^^^^^^^^^^^^^^ +- Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct + arguments in C ABI. Callee is responsible for allocating stack memory and + copying the value of the struct if modified. Note that AMDGPU backend still + supports byval for struct arguments. X86 Support ^^^^^^^^^^^ Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -2156,7 +2156,8 @@ const llvm::DataLayout &DL, const ABIArgInfo &AI, bool CheckCoerce = true) { llvm::Type *Ty = Types.ConvertTypeForMem(QTy); - if (AI.getKind() == ABIArgInfo::Indirect) + if (AI.getKind() == ABIArgInfo::Indirect || + AI.getKind() == ABIArgInfo::IndirectAliased) return true; if (AI.getKind() == ABIArgInfo::Extend) return true; @@ -5126,12 +5127,15 @@ auto LV = I->getKnownLValue(); auto AS = LV.getAddressSpace(); - if (!ArgInfo.getIndirectByVal() || + bool isByValOrRef = + ArgInfo.isIndirectAliased() || ArgInfo.getIndirectByVal(); + + if (!isByValOrRef || (LV.getAlignment() < getContext().getTypeAlignInChars(I->Ty))) { NeedCopy = true; } if (!getLangOpts().OpenCL) { - if ((ArgInfo.getIndirectByVal() && + if ((isByValOrRef && (AS != LangAS::Default && AS != CGM.getASTAllocaAddressSpace()))) { NeedCopy = true; @@ -5139,7 +5143,7 @@ } // For OpenCL even if RV is located in default or alloca address space // we don't want to perform address space cast for it. - else if ((ArgInfo.getIndirectByVal() && + else if ((isByValOrRef && Addr.getType()->getAddressSpace() != IRFuncTy-> getParamType(FirstIRArg)->getPointerAddressSpace())) { NeedCopy = true; Index: clang/lib/CodeGen/Targets/AMDGPU.cpp =================================================================== --- clang/lib/CodeGen/Targets/AMDGPU.cpp +++ clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -248,6 +248,12 @@ return ABIArgInfo::getDirect(); } } + + // Use pass-by-reference in stead of pass-by-value for struct arguments in + // function ABI. + return ABIArgInfo::getIndirectAliased( + getContext().getTypeAlignInChars(Ty), + getContext().getTargetAddressSpace(LangAS::opencl_private)); } // Otherwise just do the default thing. Index: clang/test/CodeGenCUDA/kernel-args.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-args.cu +++ clang/test/CodeGenCUDA/kernel-args.cu @@ -9,14 +9,14 @@ float *p; }; -// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}}) +// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}) // NVPTX: define{{.*}} void @_Z6kernel1A(ptr noundef byval(%struct.A) align 8 %x) __global__ void kernel(A x) { } class Kernel { public: - // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}}) + // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}}) // NVPTX: define{{.*}} void @_ZN6Kernel12memberKernelE1A(ptr noundef byval(%struct.A) align 8 %x) static __global__ void memberKernel(A x){} template static __global__ void templateMemberKernel(T x) {} @@ -30,11 +30,11 @@ void test() { Kernel K; - // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}} + // AMDGCN: define{{.*}} amdgpu_kernel void @_Z14templateKernelI1AEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}} // NVPTX: define{{.*}} void @_Z14templateKernelI1AEvT_(ptr noundef byval(%struct.A) align 8 %x) launch((void*)templateKernel); - // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) byref(%struct.A) align 8 %{{.+}} + // AMDGCN: define{{.*}} amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr addrspace(4) noundef byref(%struct.A) align 8 %{{.+}} // NVPTX: define{{.*}} void @_ZN6Kernel20templateMemberKernelI1AEEvT_(ptr noundef byval(%struct.A) align 8 %x) launch((void*)Kernel::templateMemberKernel); } Index: clang/test/CodeGenCXX/amdgcn-func-arg.cpp =================================================================== --- clang/test/CodeGenCXX/amdgcn-func-arg.cpp +++ clang/test/CodeGenCXX/amdgcn-func-arg.cpp @@ -19,14 +19,13 @@ void func_with_ref_arg(B &b); // CHECK-LABEL: @_Z22func_with_indirect_arg1A( -// CHECK-SAME: ptr addrspace(5) noundef [[ARG:%.*]]) // CHECK-NEXT: entry: -// CHECK-NEXT: [[INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_INDIRECT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5) -// CHECK-NEXT: [[INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDIRECT_ADDR]] to ptr +// CHECK-NEXT: [[A_INDIRECT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_INDIRECT_ADDR]] to ptr // CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr -// CHECK-NEXT: store ptr addrspace(5) [[ARG]], ptr [[INDIRECT_ADDR_ASCAST]] -// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A:%.*]] to ptr +// CHECK-NEXT: store ptr addrspace(5) [[A:%.*]], ptr [[A_INDIRECT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr // CHECK-NEXT: store ptr [[A_ASCAST]], ptr [[P_ASCAST]], align 8 // CHECK-NEXT: ret void // @@ -73,10 +72,12 @@ // CHECK-LABEL: @_Z19func_with_byval_arg1B( // CHECK-NEXT: entry: +// CHECK-NEXT: [[COERCE:%.*]] = alloca [[CLASS_B:%.*]], align 4, addrspace(5) // CHECK-NEXT: [[P:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[B:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr // CHECK-NEXT: [[P_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P]] to ptr -// CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B:%.*]] to ptr -// CHECK-NEXT: store ptr [[B_ASCAST]], ptr [[P_ASCAST]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[B]], ptr addrspace(5) align 4 [[TMP0:%.*]], i64 400, i1 false) +// CHECK-NEXT: store ptr [[B]], ptr [[P_ASCAST]], align 8 // CHECK-NEXT: ret void // void func_with_byval_arg(B b) { @@ -91,7 +92,7 @@ // CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 [[B_ASCAST]], i64 400, i1 false) // CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5) -// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byval([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]]) +// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]]) // CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) [[B_ASCAST]]) // CHECK-NEXT: ret void // @@ -107,7 +108,7 @@ // CHECK-NEXT: [[AGG_TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AGG_TMP]] to ptr // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[AGG_TMP_ASCAST]], ptr align 4 addrspacecast (ptr addrspace(1) @g_b to ptr), i64 400, i1 false) // CHECK-NEXT: [[AGG_TMP_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[AGG_TMP_ASCAST]] to ptr addrspace(5) -// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byval([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]]) +// CHECK-NEXT: call void @_Z19func_with_byval_arg1B(ptr addrspace(5) noundef byref([[CLASS_B]]) align 4 [[AGG_TMP_ASCAST_ASCAST]]) // CHECK-NEXT: call void @_Z17func_with_ref_argR1B(ptr noundef nonnull align 4 dereferenceable(400) addrspacecast (ptr addrspace(1) @g_b to ptr)) // CHECK-NEXT: ret void // Index: clang/test/CodeGenOpenCL/addr-space-struct-arg.cl =================================================================== --- clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -66,7 +66,9 @@ } // X86-LABEL: define{{.*}} void @foo_large(ptr noalias sret(%struct.Mat64X64) align 4 %agg.result, ptr noundef byval(%struct.Mat32X32) align 4 %in) -// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) noalias sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byval(%struct.Mat32X32) align 4 %in) +// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) noalias sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byref(%struct.Mat32X32) align 4 %{{.*}} +// AMDGCN: %in = alloca %struct.Mat32X32, align 4, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 %in, ptr addrspace(5) align 4 %{{.*}}, i64 4096, i1 false) Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { Mat64X64 out; return out; @@ -88,7 +90,9 @@ u.x = (int2)(0, 0); } -// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %u) +// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %{{.*}} +// AMDGCN: %u = alloca %struct.LargeStructOneMember, align 8, addrspace(5) +// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %u, ptr addrspace(5) align 8 %{{.*}}, i64 800, i1 false) // AMDGCN-NOT: addrspacecast // AMDGCN: store <2 x i32> %{{.*}}, ptr addrspace(5) void FuncOneLargeMember(struct LargeStructOneMember u) { @@ -98,7 +102,7 @@ // AMDGCN20-LABEL: define{{.*}} void @test_indirect_arg_globl() // AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) // AMDGCN20: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[byval_temp]]) +// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]]) #if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) void test_indirect_arg_globl(void) { FuncOneLargeMember(g_s); @@ -108,7 +112,7 @@ // AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @test_indirect_arg_local() // AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) // AMDGCN: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[byval_temp]]) +// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]]) kernel void test_indirect_arg_local(void) { local struct LargeStructOneMember l_s; FuncOneLargeMember(l_s); @@ -117,7 +121,7 @@ // AMDGCN-LABEL: define{{.*}} void @test_indirect_arg_private() // AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) // AMDGCN-NOT: @llvm.memcpy -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[p_s]]) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[p_s]]) void test_indirect_arg_private(void) { struct LargeStructOneMember p_s; FuncOneLargeMember(p_s); @@ -142,7 +146,7 @@ // AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember( // AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5) // AMDGCN: store %struct.LargeStructOneMember %u.coerce, ptr addrspace(5) %[[U]], align 8 -// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval(%struct.LargeStructOneMember) align 8 %[[U]]) +// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]]) kernel void KernelLargeOneMember(struct LargeStructOneMember u) { FuncOneLargeMember(u); } @@ -152,7 +156,10 @@ u.y = (int2)(0, 0); } -// AMDGCN-LABEL: define{{.*}} void @FuncLargeTwoMember(ptr addrspace(5) noundef byval(%struct.LargeStructTwoMember) align 8 %u) +// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember +// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) +// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) +// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %[[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) void FuncLargeTwoMember(struct LargeStructTwoMember u) { u.y[0] = (int2)(0, 0); } @@ -171,7 +178,7 @@ // AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]]) // AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5) // AMDGCN: store %struct.LargeStructTwoMember %[[u_coerce]], ptr addrspace(5) %[[u]] -// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byval(%struct.LargeStructTwoMember) align 8 %[[u]]) +// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]]) kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { FuncLargeTwoMember(u); } Index: clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -86,8 +86,10 @@ } // AMDGCN-LABEL: define dso_local void @foo_large -// AMDGCN-SAME: (ptr addrspace(5) noalias sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { +// AMDGCN-SAME: (ptr addrspace(5) noalias sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) // AMDGCN-NEXT: ret void // Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { @@ -109,7 +111,7 @@ // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] // AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) // AMDGCN-NEXT: ret void // @@ -135,14 +137,16 @@ } // AMDGCN-LABEL: define dso_local void @FuncOneLargeMember -// AMDGCN-SAME: (ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) // AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 // AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 // AMDGCN-NEXT: ret void // void FuncOneLargeMember(struct LargeStructOneMember u) { @@ -155,7 +159,7 @@ // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] // AMDGCN-NEXT: ret void // void test_indirect_arg_globl(void) { @@ -168,7 +172,7 @@ // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] // AMDGCN-NEXT: ret void // kernel void test_indirect_arg_local(void) { @@ -180,7 +184,7 @@ // AMDGCN-SAME: () #[[ATTR0]] { // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] // AMDGCN-NEXT: ret void // void test_indirect_arg_private(void) { @@ -223,7 +227,7 @@ // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: store [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8 -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] // AMDGCN-NEXT: ret void // kernel void KernelLargeOneMember(struct LargeStructOneMember u) { @@ -250,14 +254,16 @@ } // AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember -// AMDGCN-SAME: (ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) // AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 // AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 // AMDGCN-NEXT: ret void // void FuncLargeTwoMember(struct LargeStructTwoMember u) { @@ -285,7 +291,7 @@ // AMDGCN-NEXT: entry: // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: store [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], ptr addrspace(5) [[U]], align 8 -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] // AMDGCN-NEXT: ret void // kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { Index: clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -448,11 +448,11 @@ // CHECK: define{{.*}} void @func_reg_state_lo(<4 x i32> noundef %arg0, <4 x i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 %s.coerce0, float %s.coerce1, i32 %s.coerce2) void func_reg_state_lo(int4 arg0, int4 arg1, int4 arg2, int arg3, struct_arg_t s) { } -// CHECK: define{{.*}} void @func_reg_state_hi(<4 x i32> noundef %arg0, <4 x i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 noundef %arg4, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_arg) align 4 %s) +// CHECK: define{{.*}} void @func_reg_state_hi(<4 x i32> noundef %arg0, <4 x i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 noundef %arg4, ptr addrspace(5) nocapture noundef readnone byref(%struct.struct_arg) align 4 %{{.*}}) void func_reg_state_hi(int4 arg0, int4 arg1, int4 arg2, int arg3, int arg4, struct_arg_t s) { } // XXX - Why don't the inner structs flatten? -// CHECK: define{{.*}} void @func_reg_state_num_regs_nested_struct(<4 x i32> noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.nested %arg2.coerce1, i32 %arg3.coerce0, %struct.nested %arg3.coerce1, ptr addrspace(5) nocapture noundef readnone byval(%struct.num_regs_nested_struct) align 8 %arg4) +// CHECK: define{{.*}} void @func_reg_state_num_regs_nested_struct(<4 x i32> noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.nested %arg2.coerce1, i32 %arg3.coerce0, %struct.nested %arg3.coerce1, ptr addrspace(5) nocapture noundef readnone byref(%struct.num_regs_nested_struct) align 8 %{{.*}}) void func_reg_state_num_regs_nested_struct(int4 arg0, int arg1, num_regs_nested_struct arg2, num_regs_nested_struct arg3, num_regs_nested_struct arg4) { } // CHECK: define{{.*}} void @func_double_nested_struct_arg(<4 x i32> noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.double_nested %arg2.coerce1, i16 %arg2.coerce2) @@ -467,7 +467,7 @@ // CHECK: define{{.*}} void @func_large_struct_padding_arg_direct(i8 %arg.coerce0, i32 %arg.coerce1, i8 %arg.coerce2, i32 %arg.coerce3, i8 %arg.coerce4, i8 %arg.coerce5, i16 %arg.coerce6, i16 %arg.coerce7, [3 x i8] %arg.coerce8, i64 %arg.coerce9, i32 %arg.coerce10, i8 %arg.coerce11, i32 %arg.coerce12, i16 %arg.coerce13, i8 %arg.coerce14) void func_large_struct_padding_arg_direct(large_struct_padding arg) { } -// CHECK: define{{.*}} void @func_large_struct_padding_arg_store(ptr addrspace(1) nocapture noundef writeonly %out, ptr addrspace(5) nocapture noundef readonly byval(%struct.large_struct_padding) align 8 %arg) +// CHECK: define{{.*}} void @func_large_struct_padding_arg_store(ptr addrspace(1) nocapture noundef writeonly %out, ptr addrspace(5) nocapture noundef readonly byref(%struct.large_struct_padding) align 8 %{{.*}}) void func_large_struct_padding_arg_store(global large_struct_padding* out, large_struct_padding arg) { *out = arg; } @@ -485,7 +485,7 @@ void v4i16_reg_count(short4 arg0, short4 arg1, short4 arg2, short4 arg3, short4 arg4, short4 arg5, struct_4regs arg6) { } -// CHECK: define{{.*}} void @v4i16_pair_reg_count_over(<4 x i16> noundef %arg0, <4 x i16> noundef %arg1, <4 x i16> noundef %arg2, <4 x i16> noundef %arg3, <4 x i16> noundef %arg4, <4 x i16> noundef %arg5, <4 x i16> noundef %arg6, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_4regs) align 4 %arg7) +// CHECK: define{{.*}} void @v4i16_pair_reg_count_over(<4 x i16> noundef %arg0, <4 x i16> noundef %arg1, <4 x i16> noundef %arg2, <4 x i16> noundef %arg3, <4 x i16> noundef %arg4, <4 x i16> noundef %arg5, <4 x i16> noundef %arg6, ptr addrspace(5) nocapture noundef readnone byref(%struct.struct_4regs) align 4 %{{.*}}) void v4i16_pair_reg_count_over(short4 arg0, short4 arg1, short4 arg2, short4 arg3, short4 arg4, short4 arg5, short4 arg6, struct_4regs arg7) { } @@ -493,7 +493,7 @@ void v3i16_reg_count(short3 arg0, short3 arg1, short3 arg2, short3 arg3, short3 arg4, short3 arg5, struct_4regs arg6) { } -// CHECK: define{{.*}} void @v3i16_reg_count_over(<3 x i16> noundef %arg0, <3 x i16> noundef %arg1, <3 x i16> noundef %arg2, <3 x i16> noundef %arg3, <3 x i16> noundef %arg4, <3 x i16> noundef %arg5, <3 x i16> noundef %arg6, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_4regs) align 4 %arg7) +// CHECK: define{{.*}} void @v3i16_reg_count_over(<3 x i16> noundef %arg0, <3 x i16> noundef %arg1, <3 x i16> noundef %arg2, <3 x i16> noundef %arg3, <3 x i16> noundef %arg4, <3 x i16> noundef %arg5, <3 x i16> noundef %arg6, ptr addrspace(5) nocapture noundef readnone byref(%struct.struct_4regs) align 4 %{{.*}}) void v3i16_reg_count_over(short3 arg0, short3 arg1, short3 arg2, short3 arg3, short3 arg4, short3 arg5, short3 arg6, struct_4regs arg7) { } @@ -503,7 +503,7 @@ short2 arg8, short2 arg9, short2 arg10, short2 arg11, struct_4regs arg13) { } -// CHECK: define{{.*}} void @v2i16_reg_count_over(<2 x i16> noundef %arg0, <2 x i16> noundef %arg1, <2 x i16> noundef %arg2, <2 x i16> noundef %arg3, <2 x i16> noundef %arg4, <2 x i16> noundef %arg5, <2 x i16> noundef %arg6, <2 x i16> noundef %arg7, <2 x i16> noundef %arg8, <2 x i16> noundef %arg9, <2 x i16> noundef %arg10, <2 x i16> noundef %arg11, <2 x i16> noundef %arg12, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_4regs) align 4 %arg13) +// CHECK: define{{.*}} void @v2i16_reg_count_over(<2 x i16> noundef %arg0, <2 x i16> noundef %arg1, <2 x i16> noundef %arg2, <2 x i16> noundef %arg3, <2 x i16> noundef %arg4, <2 x i16> noundef %arg5, <2 x i16> noundef %arg6, <2 x i16> noundef %arg7, <2 x i16> noundef %arg8, <2 x i16> noundef %arg9, <2 x i16> noundef %arg10, <2 x i16> noundef %arg11, <2 x i16> noundef %arg12, ptr addrspace(5) nocapture noundef readnone byref(%struct.struct_4regs) align 4 %{{.*}}) void v2i16_reg_count_over(short2 arg0, short2 arg1, short2 arg2, short2 arg3, short2 arg4, short2 arg5, short2 arg6, short2 arg7, short2 arg8, short2 arg9, short2 arg10, short2 arg11, @@ -513,7 +513,7 @@ void v2i8_reg_count(char2 arg0, char2 arg1, char2 arg2, char2 arg3, char2 arg4, char2 arg5, struct_4regs arg6) { } -// CHECK: define{{.*}} void @v2i8_reg_count_over(<2 x i8> noundef %arg0, <2 x i8> noundef %arg1, <2 x i8> noundef %arg2, <2 x i8> noundef %arg3, <2 x i8> noundef %arg4, <2 x i8> noundef %arg5, i32 noundef %arg6, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_4regs) align 4 %arg7) +// CHECK: define{{.*}} void @v2i8_reg_count_over(<2 x i8> noundef %arg0, <2 x i8> noundef %arg1, <2 x i8> noundef %arg2, <2 x i8> noundef %arg3, <2 x i8> noundef %arg4, <2 x i8> noundef %arg5, i32 noundef %arg6, ptr addrspace(5) nocapture noundef readnone byref(%struct.struct_4regs) align 4 %{{.*}}) void v2i8_reg_count_over(char2 arg0, char2 arg1, char2 arg2, char2 arg3, char2 arg4, char2 arg5, int arg6, struct_4regs arg7) { } Index: clang/test/CodeGenOpenCL/byval.cl =================================================================== --- clang/test/CodeGenOpenCL/byval.cl +++ clang/test/CodeGenOpenCL/byval.cl @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn %s | FileCheck %s - +// RUN: %clang_cc1 -emit-llvm -o - -triple i686-pc-darwin %s | FileCheck -check-prefix=X86 %s +// RUN: %clang_cc1 -emit-llvm -o - -triple amdgcn %s | FileCheck -check-prefix=AMDGCN %s struct A { int x[100]; }; @@ -8,8 +8,10 @@ int g() { struct A a; - // CHECK: call i32 @f(ptr addrspace(5) noundef byval{{.*}}%a) + // X86: call i32 @f(ptr noundef nonnull byval(%struct.A) align 4 %a) + // AMDGCN: call i32 @f(ptr addrspace(5) noundef byref{{.*}}%a) return f(a); } -// CHECK: declare i32 @f(ptr addrspace(5) noundef byval{{.*}}) +// X86: declare i32 @f(ptr noundef byval(%struct.A) align 4) +// AMDGCN: declare i32 @f(ptr addrspace(5) noundef byref{{.*}}) Index: llvm/docs/AMDGPUUsage.rst =================================================================== --- llvm/docs/AMDGPUUsage.rst +++ llvm/docs/AMDGPUUsage.rst @@ -13812,6 +13812,10 @@ 9. All other registers are unspecified. 10. Any necessary ``s_waitcnt`` has been performed to ensure memory is available to the function. +11: Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct + arguments in C ABI. Callee is responsible for allocating stack memory and + copying the value of the struct if modified. Note that the backend still + supports byval for struct arguments. On exit from a function: