Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -312,6 +312,10 @@
   GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
   Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;
 
+def int_amdgcn_queue_ptr :
+  GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;
+
 // __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
 def int_amdgcn_interp_p1 :
   GCCBuiltin<"__builtin_amdgcn_interp_p1">,
Index: lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -104,7 +104,8 @@
 };
 
 static const StringRef HSAIntrinsicToAttr[][2] = {
-  { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }
+  { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
+  { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }
 };
 
 // TODO: We should not add the attributes if the known compile time workgroup
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -745,6 +745,12 @@
     CCInfo.AllocateReg(DispatchPtrReg);
   }
 
+  if (Info->hasQueuePtr()) {
+    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+    MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(QueuePtrReg);
+  }
+
   if (Info->hasKernargSegmentPtr()) {
     unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
     MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
@@ -1450,6 +1456,7 @@
 
   switch (IntrinsicID) {
   case Intrinsic::amdgcn_dispatch_ptr:
+  case Intrinsic::amdgcn_queue_ptr: {
     if (!Subtarget->isAmdHsaOS()) {
       DiagnosticInfoUnsupported BadIntrin(
           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
@@ -1458,8 +1465,11 @@
       return DAG.getUNDEF(VT);
     }
 
+    auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+      SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
     return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
-      TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
+      TRI->getPreloadedValue(MF, Reg), VT);
+  }
   case Intrinsic::amdgcn_rcp:
     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
   case Intrinsic::amdgcn_rsq:
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -116,6 +116,9 @@
 
     if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
       DispatchPtr = true;
+
+    if (F->hasFnAttribute("amdgpu-queue-ptr"))
+      QueuePtr = true;
   }
 
   // We don't need to worry about accessing spills with flat instructions.
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -917,7 +917,8 @@
     assert(MFI->hasDispatchPtr());
     return MFI->DispatchPtrUserSGPR;
   case SIRegisterInfo::QUEUE_PTR:
-    llvm_unreachable("not implemented");
+    assert(MFI->hasQueuePtr());
+    return MFI->QueuePtrUserSGPR;
   case SIRegisterInfo::WORKITEM_ID_X:
     assert(MFI->hasWorkItemIDX());
     return AMDGPU::VGPR0;
Index: test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -9,6 +9,7 @@
 declare i32 @llvm.amdgcn.workitem.id.z() #0
 
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
 
 ; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
 define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
@@ -154,6 +155,15 @@
   ret void
 }
 
+; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+  %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  %bc = bitcast i8 addrspace(2)* %queue.ptr to i32 addrspace(2)*
+  %val = load i32, i32 addrspace(2)* %bc
+  store i32 %val, i32 addrspace(1)* %ptr
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 
@@ -168,3 +178,4 @@
 ; HSA: attributes #8 = { nounwind "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
 ; HSA: attributes #9 = { nounwind "amdgpu-work-group-id-y" "amdgpu-work-group-id-z" "amdgpu-work-item-id-y" "amdgpu-work-item-id-z" }
 ; HSA: attributes #10 = { nounwind "amdgpu-dispatch-ptr" }
+; HSA: attributes #11 = { nounwind "amdgpu-queue-ptr" }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: in function test{{.*}}: unsupported hsa intrinsic without hsa target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define void @test(i32 addrspace(1)* %out) {
+  %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+  %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
+
+attributes #0 = { nounwind readnone }