Index: lib/Target/AMDGPU/AMDGPUMachineFunction.h =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -20,6 +20,7 @@ /// local memory space. SmallDenseMap LocalMemoryObjects; +protected: uint64_t KernArgSize; unsigned MaxKernArgAlign; Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -414,12 +414,16 @@ unsigned SISubtarget::getKernArgSegmentSize(const Function &F, unsigned ExplicitArgBytes) const { + uint64_t TotalSize = ExplicitArgBytes; unsigned ImplicitBytes = getImplicitArgNumBytes(F); - if (ImplicitBytes == 0) - return ExplicitArgBytes; - unsigned Alignment = getAlignmentForImplicitArgPtr(); - return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + if (ImplicitBytes != 0) { + unsigned Alignment = getAlignmentForImplicitArgPtr(); + TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + } + + // Being able to dereference past the end is useful for emitting scalar loads. 
+ return alignTo(TotalSize, 4); } unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -71,8 +71,11 @@ if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; } else { - if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) + if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) { KernargSegmentPtr = true; + assert(MaxKernArgAlign == 0); + MaxKernArgAlign = ST.getAlignmentForImplicitArgPtr(); + } } CallingConv::ID CC = F.getCallingConv(); Index: test/CodeGen/AMDGPU/kernel-args.ll =================================================================== --- test/CodeGen/AMDGPU/kernel-args.ll +++ test/CodeGen/AMDGPU/kernel-args.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s ; FUNC-LABEL: {{^}}i8_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb @@ -25,6 +26,7 @@ } ; FUNC-LABEL: {{^}}i8_zext_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb @@ -44,6 +46,7 @@ } ; FUNC-LABEL: {{^}}i8_sext_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb @@ -63,7 +66,9 @@ } ; FUNC-LABEL: {{^}}i16_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], 
s[{{[0-9]+:[0-9]+}}], 0x2c @@ -83,7 +88,9 @@ } ; FUNC-LABEL: {{^}}i16_zext_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c @@ -102,7 +109,9 @@ } ; FUNC-LABEL: {{^}}i16_sext_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c @@ -121,7 +130,9 @@ } ; FUNC-LABEL: {{^}}i32_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c @@ -133,6 +144,7 @@ } ; FUNC-LABEL: {{^}}f32_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb @@ -145,7 +157,9 @@ } ; FUNC-LABEL: {{^}}v2i8_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG: VTX_READ_8 ; EG: VTX_READ_8 ; MESA-GCN: buffer_load_ubyte @@ -159,7 +173,9 @@ } ; FUNC-LABEL: {{^}}v2i16_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -174,7 +190,9 @@ } ; FUNC-LABEL: {{^}}v2i32_arg: +; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb @@ -187,7 +205,9 @@ } ; FUNC-LABEL: {{^}}v2f32_arg: +; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb @@ -200,7 +220,9 @@ } ; 
FUNC-LABEL: {{^}}v3i8_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 @@ -217,7 +239,9 @@ } ; FUNC-LABEL: {{^}}v3i16_arg: +; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 + ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 @@ -233,6 +257,7 @@ ret void } ; FUNC-LABEL: {{^}}v3i32_arg: +; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z @@ -247,6 +272,7 @@ } ; FUNC-LABEL: {{^}}v3f32_arg: +; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z @@ -261,6 +287,7 @@ } ; FUNC-LABEL: {{^}}v4i8_arg: +; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_8 ; EG: VTX_READ_8 @@ -281,6 +308,7 @@ } ; FUNC-LABEL: {{^}}v4i16_arg: +; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -301,6 +329,7 @@ } ; FUNC-LABEL: {{^}}v4i32_arg: +; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z @@ -317,6 +346,7 @@ } ; FUNC-LABEL: {{^}}v4f32_arg: +; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z @@ -332,6 +362,7 @@ } ; FUNC-LABEL: {{^}}v8i8_arg: +; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_8 ; EG: VTX_READ_8 @@ -363,6 +394,7 @@ } ; FUNC-LABEL: {{^}}v8i16_arg: +; HSA-VI: 
kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -393,6 +425,7 @@ } ; FUNC-LABEL: {{^}}v8i32_arg: +; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z @@ -412,6 +445,7 @@ } ; FUNC-LABEL: {{^}}v8f32_arg: +; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z @@ -429,6 +463,7 @@ } ; FUNC-LABEL: {{^}}v16i8_arg: +; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 ; EG: VTX_READ_8 ; EG: VTX_READ_8 @@ -485,6 +520,7 @@ } ; FUNC-LABEL: {{^}}v16i16_arg: +; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -535,6 +571,7 @@ } ; FUNC-LABEL: {{^}}v16i32_arg: +; HSA-VI: kernarg_segment_byte_size = 128 ; HSA-VI: kernarg_segment_alignment = 6 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z @@ -562,6 +599,7 @@ } ; FUNC-LABEL: {{^}}v16f32_arg: +; HSA-VI: kernarg_segment_byte_size = 128 ; HSA-VI: kernarg_segment_alignment = 6 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z @@ -621,6 +659,9 @@ ; } ; FUNC-LABEL: {{^}}i1_arg: +; HSA-VI: kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 + ; SI: buffer_load_ubyte ; SI: v_and_b32_e32 ; SI: buffer_store_byte @@ -631,6 +672,9 @@ } ; FUNC-LABEL: {{^}}i1_arg_zext_i32: +; HSA-VI: kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 + ; SI: buffer_load_ubyte ; SI: buffer_store_dword ; SI: s_endpgm @@ -641,6 +685,9 @@ } ; FUNC-LABEL: {{^}}i1_arg_zext_i64: +; HSA-VI: kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 + ; SI: buffer_load_ubyte ; SI: buffer_store_dwordx2 ; SI: s_endpgm @@ -651,6 +698,9 @@ } ; FUNC-LABEL: {{^}}i1_arg_sext_i32: +; HSA-VI: 
kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 + ; SI: buffer_load_ubyte ; SI: buffer_store_dword ; SI: s_endpgm @@ -661,6 +711,9 @@ } ; FUNC-LABEL: {{^}}i1_arg_sext_i64: +; HSA-VI: kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 + ; SI: buffer_load_ubyte ; SI: v_bfe_i32 ; SI: v_ashrrev_i32 Index: test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -4,6 +4,9 @@ ; ALL-LABEL: {{^}}test: ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: kernarg_segment_byte_size = 8 +; HSA: kernarg_segment_alignment = 4 + ; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa @@ -17,6 +20,10 @@ } ; ALL-LABEL: {{^}}test_implicit: +; HSA: kernarg_segment_byte_size = 8 +; OS-MESA3D: kernarg_segment_byte_size = 24 +; CO-V2: kernarg_segment_alignment = 4 + ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { @@ -28,9 +35,12 @@ ret void } -; ALL-LABEL: {{^}}test_implicit_alignment -; HSA: kernarg_segment_byte_size = 10 +; ALL-LABEL: {{^}}test_implicit_alignment: +; HSA: kernarg_segment_byte_size = 12 ; OS-MESA3D: kernarg_segment_byte_size = 28 +; CO-V2: kernarg_segment_alignment = 4 + + ; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc ; HSA: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4 ; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3 @@ -48,6 +58,9 @@ ; ALL-LABEL: {{^}}opencl_test_implicit_alignment ; HSA: kernarg_segment_byte_size = 64 ; OS-MESA3D: kernarg_segment_byte_size = 28 +; CO-V2: kernarg_segment_alignment = 4 + + ; OS-UNKNOWN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xc ; HSA: s_load_dword 
[[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x4 ; OS-MESA3D: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x3 @@ -63,7 +76,11 @@ } ; ALL-LABEL: {{^}}test_no_kernargs: -; HSA: enable_sgpr_kernarg_segment_ptr = 1 +; CO-V2: enable_sgpr_kernarg_segment_ptr = 1 +; HSA: kernarg_segment_byte_size = 0 +; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: kernarg_segment_alignment = 32 + ; HSA: s_load_dword s{{[0-9]+}}, s[4:5] define amdgpu_kernel void @test_no_kernargs() #1 { %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() @@ -74,9 +91,34 @@ ret void } +; GCN-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs: +; HSA: kernarg_segment_byte_size = 48 +; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: kernarg_segment_alignment = 4 +define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs() #2 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %arg.ptr + store volatile i32 %val, i32 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}opencl_test_implicit_alignment_no_explicit_kernargs_round_up: +; HSA: kernarg_segment_byte_size = 40 +; OS-MESA3D: kernarg_segment_byte_size = 16 +; CO-V2: kernarg_segment_alignment = 4 +define amdgpu_kernel void @opencl_test_implicit_alignment_no_explicit_kernargs_round_up() #3 { + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val = load volatile i32, i32 addrspace(4)* %arg.ptr + store volatile i32 %val, i32 addrspace(1)* null + ret void +} + declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind "amdgpu-implicitarg-num-bytes"="48" } 
+attributes #3 = { nounwind "amdgpu-implicitarg-num-bytes"="38" }