Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -150,6 +150,8 @@
     return "amdgpu-dispatch-ptr";
   case Intrinsic::amdgcn_dispatch_id:
     return "amdgpu-dispatch-id";
+  case Intrinsic::amdgcn_kernarg_segment_ptr:
+    return "amdgpu-kernarg-segment-ptr";
   case Intrinsic::amdgcn_queue_ptr:
   case Intrinsic::trap:
   case Intrinsic::debugtrap:
@@ -181,7 +183,8 @@
     { "amdgpu-work-group-id-y" },
     { "amdgpu-work-group-id-z" },
     { "amdgpu-dispatch-ptr" },
-    { "amdgpu-dispatch-id" }
+    { "amdgpu-dispatch-id" },
+    { "amdgpu-kernarg-segment-ptr" }
   };
 
   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -92,7 +92,7 @@
 
   CallingConv::ID CC = F->getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
-    KernargSegmentPtr = true;
+    KernargSegmentPtr = !F->arg_empty();
     WorkGroupIDX = true;
     WorkItemIDX = true;
   } else if (CC == CallingConv::AMDGPU_PS) {
@@ -154,6 +154,9 @@
     PrivateMemoryInputPtr = true;
   }
 
+  if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+    KernargSegmentPtr = true;
+
   // We don't need to worry about accessing spills with flat instructions.
   // TODO: On VI where we must use flat for global, we should be able to omit
   // this if it is never used for generic access.
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -8,6 +8,7 @@
 
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
 declare i64 @llvm.amdgcn.dispatch.id() #0
 
 ; HSA: define void @use_workitem_id_y() #1 {
@@ -182,6 +183,19 @@
   ret void
 }
 
+; HSA: define void @use_kernarg_segment_ptr() #12 {
+define void @use_kernarg_segment_ptr() #1 {
+  %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef
+  ret void
+}
+
+; HSA: define void @func_indirect_use_kernarg_segment_ptr() #12 {
+define void @func_indirect_use_kernarg_segment_ptr() #1 {
+  call void @use_kernarg_segment_ptr()
+  ret void
+}
+
 attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { nounwind "target-cpu"="fiji" }
 attributes #2 = { nounwind "target-cpu"="gfx900" }
@@ -198,3 +212,4 @@
 ; HSA: attributes #9 = { nounwind "target-cpu"="fiji" }
 ; HSA: attributes #10 = { nounwind "target-cpu"="gfx900" }
 ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" }
+; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" }
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -10,6 +10,7 @@
 
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
 
 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
 define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
@@ -164,6 +165,15 @@
   ret void
 }
 
+; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 {
+define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
+  %val = load i32, i32 addrspace(2)* %bc
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
 ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
@@ -236,3 +246,4 @@
 ; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
 ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
 ; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
+; HSA: attributes #12 = { nounwind "amdgpu-kernarg-segment-ptr" }
Index: test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
===================================================================
--- test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -36,7 +36,7 @@
 ; CHECK-LABEL: {{^}}min_1024_max_2048
 ; CHECK: SGPRBlocks: 1
 ; CHECK: VGPRBlocks: 7
-; CHECK: NumSGPRsForWavesPerEU: 13
+; CHECK: NumSGPRsForWavesPerEU: 12
 ; CHECK: NumVGPRsForWavesPerEU: 32
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_2048() #3 {
Index: test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
===================================================================
--- test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -118,7 +118,7 @@
 ; CHECK-LABEL: {{^}}exactly_10:
 ; CHECK: SGPRBlocks: 1
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 13
+; CHECK: NumSGPRsForWavesPerEU: 12
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, float addrspace(1)* @var
Index: test/CodeGen/AMDGPU/hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa.ll
+++ test/CodeGen/AMDGPU/hsa.ll
@@ -40,7 +40,7 @@
 ; HSA-CI: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
 ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
 
-; HSA: .amdgpu_hsa_kernel simple
+; HSA-LABEL: .amdgpu_hsa_kernel simple
 ; HSA: {{^}}simple:
 ; HSA: .amd_kernel_code_t
 ; HSA: enable_sgpr_private_segment_buffer = 1
@@ -65,3 +65,11 @@
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
+
+; HSA-LABEL: .amdgpu_hsa_kernel simple_no_kernargs
+; HSA: enable_sgpr_kernarg_segment_ptr = 0
+define amdgpu_kernel void @simple_no_kernargs() {
+entry:
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -49,6 +49,18 @@
   ret void
 }
 
+; ALL-LABEL: {{^}}test_no_kernargs:
+; HSA: enable_sgpr_kernarg_segment_ptr = 1
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
+define amdgpu_kernel void @test_no_kernargs() #1 {
+  %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+  %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
+  %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
+  %value = load i32, i32 addrspace(2)* %gep
+  store volatile i32 %value, i32 addrspace(1)* undef
+  ret void
+}
+
 declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
 
Index: test/CodeGen/AMDGPU/mubuf-offset-private.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf-offset-private.ll
+++ test/CodeGen/AMDGPU/mubuf-offset-private.ll
@@ -5,42 +5,42 @@
 ; Test addressing modes when the scratch base is not a frame index.
 
 ; GCN-LABEL: {{^}}store_private_offset_i8:
-; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_i8() #0 {
   store volatile i8 5, i8* inttoptr (i32 8 to i8*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i16:
-; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_i16() #0 {
   store volatile i16 5, i16* inttoptr (i32 8 to i16*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i32:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_i32() #0 {
   store volatile i32 5, i32* inttoptr (i32 8 to i32*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v2i32:
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_v2i32() #0 {
   store volatile <2 x i32> , <2 x i32>* inttoptr (i32 8 to <2 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_v4i32:
-; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @store_private_offset_v4i32() #0 {
   store volatile <4 x i32> , <4 x i32>* inttoptr (i32 8 to <4 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_i8() #0 {
   %load = load volatile i8, i8* inttoptr (i32 8 to i8*)
   ret void
@@ -65,7 +65,7 @@
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i16:
-; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_i16() #0 {
   %load = load volatile i16, i16* inttoptr (i32 8 to i16*)
   ret void
@@ -90,28 +90,28 @@
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_i32:
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_i32() #0 {
   %load = load volatile i32, i32* inttoptr (i32 8 to i32*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v2i32:
-; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_v2i32() #0 {
   %load = load volatile <2 x i32>, <2 x i32>* inttoptr (i32 8 to <2 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_private_offset_v4i32:
-; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s8 offset:8
+; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8
 define amdgpu_kernel void @load_private_offset_v4i32() #0 {
   %load = load volatile <4 x i32>, <4 x i32>* inttoptr (i32 8 to <4 x i32>*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset:
-; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s8 offset:4095
+; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095
 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 {
   store volatile i8 5, i8* inttoptr (i32 4095 to i8*)
   ret void
@@ -119,7 +119,7 @@
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1:
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
-; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen{{$}}
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 {
   store volatile i8 5, i8* inttoptr (i32 4096 to i8*)
   ret void
@@ -127,7 +127,7 @@
 
 ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2:
 ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000
-; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s8 offen offset:1{{$}}
+; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}}
 define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 {
   store volatile i8 5, i8* inttoptr (i32 4097 to i8*)
   ret void
Index: test/CodeGen/AMDGPU/private-access-no-objects.ll
===================================================================
--- test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -10,14 +10,14 @@
 
 ; GCN-LABEL: {{^}}store_to_undef:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 
 ; -O0 should assume spilling, so the input scratch resource descriptor
 ; should be used directly without any copies.
 ; OPTNONE-NOT: s_mov_b32
-; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}}
 define amdgpu_kernel void @store_to_undef() #0 {
   store volatile i32 0, i32* undef
   ret void
@@ -26,7 +26,7 @@
 ; GCN-LABEL: {{^}}store_to_inttoptr:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
 define amdgpu_kernel void @store_to_inttoptr() #0 {
   store volatile i32 0, i32* inttoptr (i32 124 to i32*)
@@ -36,7 +36,7 @@
 ; GCN-LABEL: {{^}}load_from_undef:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define amdgpu_kernel void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
@@ -46,7 +46,7 @@
 ; GCN-LABEL: {{^}}load_from_inttoptr:
 ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
-; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}}
 define amdgpu_kernel void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 124 to i32*)
Index: test/CodeGen/AMDGPU/trap.ll
===================================================================
--- test/CodeGen/AMDGPU/trap.ll
+++ test/CodeGen/AMDGPU/trap.ll
@@ -19,11 +19,11 @@
 
 ; MESA-TRAP: .section .AMDGPU.config
 ; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 208
+; MESA-TRAP-NEXT: .long 204
 
 ; NOMESA-TRAP: .section .AMDGPU.config
 ; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 144
+; NOMESA-TRAP-NEXT: .long 140
 
 ; GCN-LABEL: {{^}}hsa_trap:
 ; HSA-TRAP: enable_trap_handler = 1
@@ -45,11 +45,11 @@
 
 ; MESA-TRAP: .section .AMDGPU.config
 ; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 208
+; MESA-TRAP-NEXT: .long 204
 
 ; NOMESA-TRAP: .section .AMDGPU.config
 ; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 144
+; NOMESA-TRAP-NEXT: .long 140
 
 ; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (): debugtrap handler not supported
 ; GCN-LABEL: {{^}}hsa_debugtrap: