Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -216,7 +216,7 @@ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x", "amdgpu-work-group-id-y", "amdgpu-work-group-id-z", "amdgpu-dispatch-ptr", "amdgpu-dispatch-id", - "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"}; + "amdgpu-implicitarg-ptr"}; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) NeedQueuePtr = true; @@ -305,11 +305,18 @@ Changed = true; } else { bool NonKernelOnly = false; - StringRef AttrName = intrinsicToAttrName(IID, - NonKernelOnly, NeedQueuePtr); - if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { - F.addFnAttr(AttrName); - Changed = true; + + if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) { + F.addFnAttr("amdgpu-kernarg-segment-ptr"); + // Record the modification, matching the other attribute-adding paths. + Changed = true; + } else { + StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly, + NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } } } } Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3747,6 +3747,14 @@ return false; } case Intrinsic::amdgcn_kernarg_segment_ptr: + if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { + B.setInstr(MI); + // This only makes sense to call in a kernel, so just lower to null. 
+ B.buildConstant(MI.getOperand(0).getReg(), 0); + MI.eraseFromParent(); + return true; + } + return legalizePreloadedArgIntrin( MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); case Intrinsic::amdgcn_implicitarg_ptr: Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1724,8 +1724,10 @@ if (Info.hasQueuePtr()) ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); - if (Info.hasKernargSegmentPtr()) - ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); + // Implicit arg ptr takes the place of the kernarg segment pointer. This is a + // constant offset from the kernarg segment. + if (Info.hasImplicitArgPtr()) + ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); if (Info.hasDispatchID()) ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); @@ -1740,9 +1742,6 @@ if (Info.hasWorkGroupIDZ()) ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); - - if (Info.hasImplicitArgPtr()) - ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); } // Allocate special inputs passed in user SGPRs. @@ -2448,12 +2447,11 @@ AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { AMDGPUFunctionArgInfo::DISPATCH_PTR, AMDGPUFunctionArgInfo::QUEUE_PTR, - AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, AMDGPUFunctionArgInfo::DISPATCH_ID, AMDGPUFunctionArgInfo::WORKGROUP_ID_X, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, - AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z }; for (auto InputID : InputRegs) { @@ -5735,6 +5733,11 @@ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { + if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { + // This only makes sense to call in a kernel, so just lower to null. 
+ return DAG.getConstant(0, DL, VT); + } + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.kernarg.segment.ptr.ll @@ -114,6 +114,16 @@ ret void } +; ALL-LABEL: {{^}}func_kernarg_segment_ptr: +; ALL: s_mov_b32 [[S_LO:s[0-9]+]], 0{{$}} +; ALL: s_mov_b32 [[S_HI:s[0-9]+]], 0{{$}} +; ALL: v_mov_b32_e32 v0, [[S_LO]]{{$}} +; ALL: v_mov_b32_e32 v1, [[S_HI]]{{$}} +define i8 addrspace(4)* @func_kernarg_segment_ptr() { + %ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() + ret i8 addrspace(4)* %ptr +} + declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 Index: llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -231,7 +231,7 @@ ret void } -; HSA: define void @func_indirect_use_kernarg_segment_ptr() #14 { +; HSA: define void @func_indirect_use_kernarg_segment_ptr() #11 { define void @func_indirect_use_kernarg_segment_ptr() #1 { call void @use_kernarg_segment_ptr() ret void Index: llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -66,10 +66,10 @@ ret void } +; Not really supported in callable functions. 
; GCN-LABEL: {{^}}use_kernarg_segment_ptr: -; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4 -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5 -; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0{{$}} define hidden void @use_kernarg_segment_ptr() #1 { %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* @@ -79,10 +79,6 @@ ; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr: ; GCN: enable_sgpr_kernarg_segment_ptr = 1 -; GCN-NOT: s[4:5] -; GCN-NOT: s4 -; GCN-NOT: s5 -; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 { call void @use_kernarg_segment_ptr() ret void @@ -437,9 +433,9 @@ %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* - %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -521,9 +517,9 @@ %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* - %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc + 
%implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -590,9 +586,9 @@ %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)* %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc - %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 - %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)* - %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc + %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 + %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)* + %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc %val3 = call i64 @llvm.amdgcn.dispatch.id() call void asm sideeffect "; use $0", "s"(i64 %val3) @@ -614,6 +610,7 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #0 declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0 declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -165,18 +165,13 @@ ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; MESA-DAG: v_mov_b32_e32 v0, s4 -; MESA-DAG: v_mov_b32_e32 v1, s5 -; MESA-DAG: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; MESA: v_mov_b32_e32 v0, s6 -; MESA: v_mov_b32_e32 v1, s7 +; GCN-DAG: 
v_mov_b32_e32 v0, s4 +; GCN-DAG: v_mov_b32_e32 v1, s5 +; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 +; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 + ; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; HSA: v_mov_b32_e32 v0, s4 -; HSA: v_mov_b32_e32 v1, s5 -; HSA: flat_load_dword v0, v[0:1] -; HSA: v_mov_b32_e32 v0, s6 -; HSA: v_mov_b32_e32 v1, s7 ; HSA: flat_load_dword v0, v[0:1] ; GCN: s_waitcnt vmcnt(0) @@ -192,20 +187,12 @@ ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr: ; GCN: s_waitcnt -; MESA-DAG: v_mov_b32_e32 v0, s4 -; MESA-DAG: v_mov_b32_e32 v1, s5 -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; MESA-DAG: v_mov_b32_e32 v0, s6 -; MESA-DAG: v_mov_b32_e32 v1, s7 -; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 - +; GCN-DAG: v_mov_b32_e32 v0, s4 +; GCN-DAG: v_mov_b32_e32 v1, s5 +; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 +; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 -; HSA: v_mov_b32_e32 v0, s4 -; HSA: v_mov_b32_e32 v1, s5 -; HSA: flat_load_dword v0, v[0:1] - -; HSA: v_mov_b32_e32 v0, s6 -; HSA: v_mov_b32_e32 v1, s7 +; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 ; HSA: flat_load_dword v0, v[0:1] ; GCN: s_waitcnt vmcnt(0) @@ -220,8 +207,8 @@ } ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: -; GCN: s_add_u32 s6, s4, 0x70 -; GCN: s_addc_u32 s7, s5, 0 +; GCN: s_add_u32 s4, s4, 0x70 +; GCN: s_addc_u32 s5, s5, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { call void @func_kernarg_implicitarg_ptr()