Index: lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIFrameLowering.cpp +++ lib/Target/AMDGPU/SIFrameLowering.cpp @@ -284,7 +284,7 @@ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) { + if (ST.isAmdCodeObjectV2(MF)) { PreloadedPrivateBufferReg = TRI->getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } @@ -363,14 +363,14 @@ // Use relocations to get the pointer, and setup the other bits manually. uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasPrivateMemoryInputPtr()) { + if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } else { const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); @@ -385,7 +385,7 @@ MachineMemOperand::MODereferenceable, 0, 0); BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01) - .addReg(PreloadedPrivateBufferReg) + .addReg(MFI->getImplicitBufferPtrUserSGPR()) .addImm(0) // offset .addImm(0) // glc .addMemOperand(MMO) Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1061,10 +1061,10 @@ MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { - if (Info.hasPrivateMemoryInputPtr()) { - unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI); - MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass); - CCInfo.AllocateReg(PrivateMemoryPtrReg); + if (Info.hasImplicitBufferPtr()) { + unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); + CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? @@ -1227,7 +1227,7 @@ } } - if (NeedSP){ + if (NeedSP) { unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); @@ -2998,7 +2998,11 @@ switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + if (getSubtarget()->isAmdCodeObjectV2(MF)) + return emitNonHSAIntrinsicError(DAG, DL, VT); + + unsigned Reg = TRI->getPreloadedValue(MF, + SIRegisterInfo::IMPLICIT_BUFFER_PTR); return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); } case Intrinsic::amdgcn_dispatch_ptr: Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -97,7 +97,7 @@ unsigned StackPtrOffsetReg; // Input registers for non-HSA ABI - unsigned PrivateMemoryPtrUserSGPR; + unsigned ImplicitBufferPtrUserSGPR; // Input registers setup for the HSA ABI. // User SGPRs in allocation order. @@ -179,7 +179,7 @@ // Private memory buffer // Compute directly in sgpr[0:1] // Other shaders indirect 64-bits at sgpr[0:1] - bool PrivateMemoryInputPtr : 1; + bool ImplicitBufferPtr : 1; MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); @@ -236,7 +236,7 @@ unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); unsigned addDispatchID(const SIRegisterInfo &TRI); unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI); + unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. unsigned addWorkGroupIDX() { @@ -341,8 +341,8 @@ return WorkItemIDZ; } - bool hasPrivateMemoryInputPtr() const { - return PrivateMemoryInputPtr; + bool hasImplicitBufferPtr() const { + return ImplicitBufferPtr; } unsigned getNumUserSGPRs() const { @@ -396,8 +396,8 @@ return QueuePtrUserSGPR; } - unsigned getPrivateMemoryPtrUserSGPR() const { - return PrivateMemoryPtrUserSGPR; + unsigned getImplicitBufferPtrUserSGPR() const { + return ImplicitBufferPtrUserSGPR; } bool hasSpilledSGPRs() const { Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -74,7 +74,7 @@ WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - PrivateMemoryInputPtr(false) { + ImplicitBufferPtr(false) { const SISubtarget &ST = MF.getSubtarget(); const Function *F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); @@ -150,7 +150,7 @@ DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) - PrivateMemoryInputPtr = true; + ImplicitBufferPtr = true; } // We don't need to worry about accessing spills with flat instructions. @@ -203,11 +203,11 @@ return FlatScratchInitUserSGPR; } -unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) { - PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg( +unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { + ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); NumUserSGPRs += 2; - return PrivateMemoryPtrUserSGPR; + return ImplicitBufferPtrUserSGPR; } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -197,12 +197,13 @@ WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, // VGPRS: - FIRST_VGPR_VALUE = 15, + FIRST_VGPR_VALUE = 16, WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 16, - WORKITEM_ID_Z = 17 + WORKITEM_ID_Y = 17, + WORKITEM_ID_Z = 18 }; /// \brief Returns the physical register that \p Value is stored in. Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1352,12 +1352,11 @@ case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - if (ST.isAmdCodeObjectV2(MF)) { - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - } - assert(MFI->hasPrivateMemoryInputPtr()); - return MFI->PrivateMemoryPtrUserSGPR; + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::IMPLICIT_BUFFER_PTR: + assert(MFI->hasImplicitBufferPtr()); + return MFI->ImplicitBufferPtrUserSGPR; case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll @@ -0,0 +1,24 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target +define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 { + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load i32, i32 addrspace(2)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target +define void @test_func(i32 addrspace(1)* %out) #1 { + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load i32, i32 addrspace(2)* %header_ptr + store i32 %value, i32 addrspace(1)* %out + ret void +} + +declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -0,0 +1,35 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; FIXME: Requires stack object to not assert +; GCN-LABEL: {{^}}test_ps: +; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4 +; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: ; return +define amdgpu_ps i32 @test_ps() #1 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %buffer_ptr + ret i32 %value +} + +; GCN-LABEL: {{^}}test_cs: +; GCN: s_mov_b64 s[4:5], s[0:1] +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4 +; GCN: s_load_dword s0, s[0:1], 0x0 +define amdgpu_cs i32 @test_cs() #1 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() + %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %buffer_ptr + ret i32 %value +} + +declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind }