Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -30,6 +30,7 @@ class Module; struct SIProgramInfo; class Type; +class GCNSubtarget; namespace AMDGPU { @@ -173,7 +174,7 @@ void emitKernelAttrs(const Function &Func); - void emitKernelArgs(const Function &Func); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST); void emitKernelArg(const Argument &Arg); @@ -183,7 +184,7 @@ StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST); const Metadata &getHSAMetadata() const { return HSAMetadata; Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -279,11 +279,11 @@ } } -void MetadataStreamerV2::emitKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitKernelArgs(const Function &Func, const GCNSubtarget &ST) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - emitHiddenKernelArgs(Func); + emitHiddenKernelArgs(Func, ST); } void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { @@ -380,10 +380,9 @@ } } -void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST) { + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -464,11 +463,12 @@ HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 
Kernel.mName = std::string(Func.getName()); Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); - emitKernelArgs(Func); + emitKernelArgs(Func, ST); HSAMetadata.mKernels.back().mCodeProps = CodeProps; HSAMetadata.mKernels.back().mDebugProps = DebugProps; } @@ -792,6 +792,10 @@ void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, unsigned &Offset, msgpack::ArrayDocNode Args) { + // We aren't allocating these if llvm.amdgcn.implicitarg.ptr is not used. + if (Func.hasFnAttribute("amdgpu-no-implicitarg-ptr")) + return; + int HiddenArgNumBytes = getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -648,6 +648,11 @@ } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { + // We don't allocate the segment if we know the implicit arguments weren't + // used, even if the ABI implies we need them. 
+ if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) + return 0; + if (isMesaKernel(F)) return 16; return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2730,7 +2730,7 @@ ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 ; GPRIDX-NEXT: wavefront_sgpr_count = 9 ; GPRIDX-NEXT: workitem_vgpr_count = 3 @@ -2821,7 +2821,7 @@ ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 ; MOVREL-NEXT: wavefront_sgpr_count = 9 ; MOVREL-NEXT: workitem_vgpr_count = 4 @@ -2913,7 +2913,7 @@ ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 ; GFX10-NEXT: wavefront_sgpr_count = 9 ; GFX10-NEXT: workitem_vgpr_count = 3 @@ -3559,7 +3559,7 @@ ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 ; GPRIDX-NEXT: wavefront_sgpr_count = 6 ; GPRIDX-NEXT: workitem_vgpr_count = 2 @@ -3643,7 +3643,7 @@ ; MOVREL-NEXT: 
workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 ; MOVREL-NEXT: wavefront_sgpr_count = 6 ; MOVREL-NEXT: workitem_vgpr_count = 3 @@ -3728,7 +3728,7 @@ ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 ; GFX10-NEXT: wavefront_sgpr_count = 6 ; GFX10-NEXT: workitem_vgpr_count = 2 @@ -3819,7 +3819,7 @@ ; GPRIDX-NEXT: workitem_private_segment_byte_size = 0 ; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0 ; GPRIDX-NEXT: gds_segment_byte_size = 0 -; GPRIDX-NEXT: kernarg_segment_byte_size = 28 +; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 ; GPRIDX-NEXT: wavefront_sgpr_count = 7 ; GPRIDX-NEXT: workitem_vgpr_count = 3 @@ -3906,7 +3906,7 @@ ; MOVREL-NEXT: workitem_private_segment_byte_size = 0 ; MOVREL-NEXT: workgroup_group_segment_byte_size = 0 ; MOVREL-NEXT: gds_segment_byte_size = 0 -; MOVREL-NEXT: kernarg_segment_byte_size = 28 +; MOVREL-NEXT: kernarg_segment_byte_size = 12 ; MOVREL-NEXT: workgroup_fbarrier_count = 0 ; MOVREL-NEXT: wavefront_sgpr_count = 7 ; MOVREL-NEXT: workitem_vgpr_count = 4 @@ -3994,7 +3994,7 @@ ; GFX10-NEXT: workitem_private_segment_byte_size = 0 ; GFX10-NEXT: workgroup_group_segment_byte_size = 0 ; GFX10-NEXT: gds_segment_byte_size = 0 -; GFX10-NEXT: kernarg_segment_byte_size = 28 +; GFX10-NEXT: kernarg_segment_byte_size = 12 ; GFX10-NEXT: workgroup_fbarrier_count = 0 ; GFX10-NEXT: wavefront_sgpr_count = 7 ; GFX10-NEXT: workitem_vgpr_count = 3 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -74,14 +74,9 @@ ret void } -; Mesa implies 16-bytes are always allocated, hsa requires the -; attribute for the additional space. ; ALL-LABEL: {{^}}test_no_kernargs: -; HSA: enable_sgpr_kernarg_segment_ptr = 0 -; HSA: kernarg_segment_byte_size = 0 - -; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: enable_sgpr_kernarg_segment_ptr = 0 +; CO-V2: kernarg_segment_byte_size = 0 ; CO-V2: kernarg_segment_alignment = 4 ; HSA: s_mov_b64 [[OFFSET_NULL:s\[[0-9]+:[0-9]+\]]], 40{{$}} @@ -97,7 +92,7 @@ ; ALL-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs: ; HSA: kernarg_segment_byte_size = 48 -; OS-MESA3d: kernarg_segment_byte_size = 16 +; OS-MESA3D: kernarg_segment_byte_size = 16 ; CO-V2: kernarg_segment_alignment = 4 define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 { %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel-v3.ll @@ -76,8 +76,8 @@ ; CHECK-NEXT: - 0 ; CHECK-NOT: amdhsa.printf: -attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } -attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } +attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" } !1 = !{i32 0} !2 = !{!"none"} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-enqueue-kernel.ll @@ -72,8 +72,8 @@ ret void } -attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } -attributes #1 = { "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } +attributes #1 = { optnone noinline "calls-enqueue-kernel" "amdgpu-implicitarg-num-bytes"="48" } !1 = !{i32 0} !2 = !{!"none"} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll @@ -1894,9 +1894,9 @@ ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 0 -attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" } -attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } -attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } +attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } +attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } !llvm.printf.fmts = !{!100, !101} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll @@ -1866,9 +1866,9 @@ ret void } -attributes #0 = { "amdgpu-implicitarg-num-bytes"="56" } -attributes #1 = { "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } -attributes #2 = { "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } +attributes #0 = { optnone noinline 
"amdgpu-implicitarg-num-bytes"="56" } +attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } +attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" } !llvm.printf.fmts = !{!100, !101} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v3.ll @@ -296,9 +296,11 @@ ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 0 -attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" } -attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" } -attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" } -attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" } -attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" } -attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" } +; We don't have a use of llvm.amdgcn.implicitarg.ptr, so optnone to +; avoid optimizing out the implicit argument allocation. 
+attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" } +attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" } +attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" } +attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" } +attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } +attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args.ll @@ -300,9 +300,11 @@ ret void } -attributes #0 = { "amdgpu-implicitarg-num-bytes"="8" } -attributes #1 = { "amdgpu-implicitarg-num-bytes"="16" } -attributes #2 = { "amdgpu-implicitarg-num-bytes"="24" } -attributes #3 = { "amdgpu-implicitarg-num-bytes"="32" } -attributes #4 = { "amdgpu-implicitarg-num-bytes"="48" } -attributes #5 = { "amdgpu-implicitarg-num-bytes"="56" } +; We don't have a use of llvm.amdgcn.implicitarg.ptr, so optnone to +; avoid optimizing out the implicit argument allocation. 
+attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="8" } +attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="16" } +attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="24" } +attributes #3 = { optnone noinline "amdgpu-implicitarg-num-bytes"="32" } +attributes #4 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } +attributes #5 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" } Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent-v3.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 0 -attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } !1 = !{i32 0} !2 = !{!"none"} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-absent.ll @@ -35,7 +35,7 @@ ret void } -attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } !1 = !{i32 0} !2 = !{!"none"} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present-v3.ll @@ -39,7 +39,7 @@ ; CHECK-NEXT: - 1 ; CHECK-NEXT: - 0 -attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } !1 = !{i32 0} !2 = !{!"none"} Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hostcall-present.ll @@ -40,7 +40,7 @@ ret void } -attributes #0 = { "amdgpu-implicitarg-num-bytes"="48" } +attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="48" } !1 = !{i32 0} !2 = !{!"none"} Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -75,14 +75,10 @@ ret void } -; Mesa implies 16-bytes are always allocated, hsa requires the -; attribute for the additional space. ; ALL-LABEL: {{^}}test_no_kernargs: -; HSA: enable_sgpr_kernarg_segment_ptr = 0 -; HSA: kernarg_segment_byte_size = 0 +; CO-V2: enable_sgpr_kernarg_segment_ptr = 0 +; CO-V2: kernarg_segment_byte_size = 0 -; OS-MESA3D: enable_sgpr_kernarg_segment_ptr = 1 -; OS-MESA3D: kernarg_segment_byte_size = 16 ; CO-V2: kernarg_segment_alignment = 4 ; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}