Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -156,8 +156,9 @@ case Intrinsic::amdgcn_dispatch_id: return "amdgpu-dispatch-id"; case Intrinsic::amdgcn_kernarg_segment_ptr: - case Intrinsic::amdgcn_implicitarg_ptr: return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-implicitarg-ptr"; case Intrinsic::amdgcn_queue_ptr: case Intrinsic::trap: case Intrinsic::debugtrap: @@ -190,7 +191,8 @@ { "amdgpu-work-group-id-z" }, { "amdgpu-dispatch-ptr" }, { "amdgpu-dispatch-id" }, - { "amdgpu-kernarg-segment-ptr" } + { "amdgpu-kernarg-segment-ptr" }, + { "amdgpu-implicitarg-ptr" } }; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -764,7 +764,8 @@ return getGeneration() >= AMDGPUSubtarget::GFX9; } - unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; + unsigned getKernArgSegmentSize(const MachineFunction &MF, + unsigned ExplictArgBytes) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -23,6 +23,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; + SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue 
Chain, uint64_t Offset, bool Signed, Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -899,6 +899,13 @@ DAG.getConstant(Offset, SL, PtrVT)); } +SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, + const SDLoc &SL) const { + auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); +} + SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, bool Signed, @@ -3029,8 +3036,9 @@ TRI->getPreloadedValue(MF, Reg), VT); } case Intrinsic::amdgcn_implicitarg_ptr: { - unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); - return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + if (MFI->isEntryFunction()) + return getImplicitArgPtr(DAG, DL); + report_fatal_error("amdgcn.implicitarg.ptr not implemented for functions"); } case Intrinsic::amdgcn_kernarg_segment_ptr: { unsigned Reg Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -186,6 +186,10 @@ // Other shaders indirect 64-bits at sgpr[0:1] bool ImplicitBufferPtr : 1; + // Pointer to where the ABI inserts special kernel arguments separate from the + // user arguments. This is an offset from the KernargSegmentPtr. 
+ bool ImplicitArgPtr : 1; + MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -346,6 +350,10 @@ return WorkItemIDZ; } + bool hasImplicitArgPtr() const { + return ImplicitArgPtr; + } + bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -93,11 +93,17 @@ // FIXME: Not really a system SGPR. PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; + if (F->hasFnAttribute("amdgpu-implicitarg-ptr")) + ImplicitArgPtr = true; + } else { + if (F->hasFnAttribute("amdgpu-implicitarg-ptr")) + KernargSegmentPtr = true; } CallingConv::ID CC = F->getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = !F->arg_empty(); + if (!F->arg_empty()) + KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll =================================================================== --- test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -237,52 +237,59 @@ ret void } -; HSA: define void @use_implicitarg_ptr() #14 { +; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #15 { +define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 { + %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef + ret void +} + +; HSA: define void @use_implicitarg_ptr() #15 { define void @use_implicitarg_ptr() #1 { %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() store volatile i8 addrspace(2)* %implicitarg.ptr, 
i8 addrspace(2)* addrspace(1)* undef ret void } -; HSA: define void @func_indirect_use_implicitarg_ptr() #14 { +; HSA: define void @func_indirect_use_implicitarg_ptr() #15 { define void @func_indirect_use_implicitarg_ptr() #1 { call void @use_implicitarg_ptr() ret void } -; HSA: declare void @external.func() #15 +; HSA: declare void @external.func() #16 declare void @external.func() #3 -; HSA: define internal void @defined.func() #15 { +; HSA: define internal void @defined.func() #16 { define internal void @defined.func() #3 { ret void } -; HSA: define void @func_call_external() #15 { +; HSA: define void @func_call_external() #16 { define void @func_call_external() #3 { call void @external.func() ret void } -; HSA: define void @func_call_defined() #15 { +; HSA: define void @func_call_defined() #16 { define void @func_call_defined() #3 { call void @defined.func() ret void } -; HSA: define void @func_call_asm() #15 { +; HSA: define void @func_call_asm() #16 { define void @func_call_asm() #3 { call void asm sideeffect "", ""() #3 ret void } -; HSA: define amdgpu_kernel void @kern_call_external() #16 { +; HSA: define amdgpu_kernel void @kern_call_external() #17 { define amdgpu_kernel void @kern_call_external() #3 { call void @external.func() ret void } -; HSA: define amdgpu_kernel void @func_kern_defined() #16 { +; HSA: define amdgpu_kernel void @func_kern_defined() #17 { define amdgpu_kernel void @func_kern_defined() #3 { call void @defined.func() ret void @@ -308,5 +315,6 @@ ; HSA: attributes #12 = { nounwind "target-cpu"="gfx900" } ; HSA: attributes #13 = { nounwind "amdgpu-queue-ptr" "target-cpu"="gfx900" } ; HSA: attributes #14 = { nounwind "amdgpu-kernarg-segment-ptr" "target-cpu"="fiji" } -; HSA: attributes #15 = { nounwind } -; HSA: attributes #16 = { nounwind "amdgpu-flat-scratch" } +; HSA: attributes #15 = { nounwind "amdgpu-implicitarg-ptr" "target-cpu"="fiji" } +; HSA: attributes #16 = { nounwind } +; HSA: attributes #17 = { nounwind "amdgpu-flat-scratch" } 
Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA,HSA-NOENV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa-opencl -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA,HSA-OPENCL %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s + +; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty: +; GCN: enable_sgpr_kernarg_segment_ptr = 1 + +; HSA-NOENV: kernarg_segment_byte_size = 0 +; HSA-OPENCL: kernarg_segment_byte_size = 32 +; MESA: kernarg_segment_byte_size = 16 + +; HSA: s_load_dword s0, s[4:5], 0x0 +define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 { + %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* + %load = load volatile i32, i32 addrspace(2)* %cast + ret void +} + +; GCN-LABEL: {{^}}kernel_implicitarg_ptr: +; GCN: enable_sgpr_kernarg_segment_ptr = 1 + +; HSA-NOENV: kernarg_segment_byte_size = 112 +; HSA-OPENCL: kernarg_segment_byte_size = 144 +; MESA: kernarg_segment_byte_size = 464 + +; HSA: s_load_dword s0, s[4:5], 0x1c +define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 { + %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() + %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* + %load = load volatile i32, i32 addrspace(2)* %cast + ret void +} + +declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #2 + +attributes #0 = { nounwind noinline } +attributes #1 = { nounwind noinline } +attributes #2 = { nounwind readnone speculatable }