diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -393,7 +393,7 @@ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr()) { + if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -1090,7 +1090,7 @@ if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr()) + if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -50,7 +50,9 @@ // TODO: We should not add the attributes if the known compile time workgroup // size is 1 for y/z. static ImplicitArgumentMask -intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) { +intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, + bool HasApertureRegs, bool SupportsGetDoorBellID) { + unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion(); switch (ID) { case Intrinsic::amdgcn_workitem_id_x: NonKernelOnly = true; @@ -76,13 +78,23 @@ return DISPATCH_ID; case Intrinsic::amdgcn_implicitarg_ptr: return IMPLICIT_ARG_PTR; + // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access + // queue_ptr. 
case Intrinsic::amdgcn_queue_ptr: + NeedsImplicit = (CodeObjectVersion == 5); + return QUEUE_PTR; case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: - // TODO: Does not require the queue pointer on gfx9+ + if (HasApertureRegs) + return NOT_IMPLICIT_INPUT; + // Under V5, we need implicitarg_ptr + offsets to access private_base or + // shared_base. For pre-V5, however, need to access them through queue_ptr + + // offsets. + return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR; case Intrinsic::trap: - case Intrinsic::debugtrap: - IsQueuePtr = true; + if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4. + return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR; + NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5. + return QUEUE_PTR; default: return NOT_IMPLICIT_INPUT; } } @@ -129,6 +141,12 @@ return ST.hasApertureRegs(); } + /// Check if the subtarget supports GetDoorbellID. + bool supportsGetDoorbellID(Function &F) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + return ST.supportsGetDoorbellID(); + } + std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) { const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); return ST.getFlatWorkGroupSizes(F); } @@ -381,7 +399,10 @@ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); - bool NeedsQueuePtr = false; + bool NeedsImplicit = false; + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + bool HasApertureRegs = InfoCache.hasApertureRegs(*F); + bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F); for (Function *Callee : AAEdges.getOptimisticEdges()) { Intrinsic::ID IID = Callee->getIntrinsicID(); @@ -394,19 +415,25 @@ bool NonKernelOnly = false; ImplicitArgumentMask AttrMask = - intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr); + intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, + HasApertureRegs, SupportsGetDoorbellID); if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); 
} } - if (!NeedsQueuePtr) { - NeedsQueuePtr = checkForQueuePtr(A); - } + // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base. + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); - if (NeedsQueuePtr) { - removeAssumedBits(QUEUE_PTR); + if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) { + // Under V5, we need implicitarg_ptr + offsets to access private_base or + // shared_base. We do not actually need queue_ptr. + if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) + removeAssumedBits(IMPLICIT_ARG_PTR); + else + removeAssumedBits(QUEUE_PTR); } if (funcRetrievesHostcallPtr(A)) { @@ -419,6 +446,11 @@ removeAssumedBits(HEAP_PTR); } + if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) { + assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr"); + removeAssumedBits(QUEUE_PTR); + } + return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } @@ -515,6 +547,14 @@ return funcRetrievesImplicitKernelArg(A, OAS); } + bool funcRetrievesQueuePtr(Attributor &A) { + if (AMDGPU::getAmdhsaCodeObjectVersion() != 5) + return false; + auto Pos = llvm::AMDGPU::getQueuePtrImplicitArgPosition(); + AAPointerInfo::OffsetAndSize OAS(Pos, 8); + return funcRetrievesImplicitKernelArg(A, OAS); + } + bool funcRetrievesImplicitKernelArg(Attributor &A, AAPointerInfo::OffsetAndSize OAS) { // Check if this is a call to the implicitarg_ptr builtin and it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -453,7 +453,7 @@ CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -1043,8 +1043,9 @@ Offset += 72; // Reserved. - // hidden_private_base and hidden_shared_base are only used by GFX8. - if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // hidden_private_base and hidden_shared_base are only needed when the + // subtarget does not have ApertureRegs. + if (!ST.hasApertureRegs()) { emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args); } else diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2115,7 +2115,7 @@ if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (Info.hasQueuePtr()) + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a @@ -2162,7 +2162,7 @@ CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -60,6 +60,9 @@ /// \returns The offset of the heap ptr argument from implicitarg_ptr unsigned getHeapPtrImplicitArgPosition(); +/// \returns The offset of the queue ptr argument from implicitarg_ptr +unsigned getQueuePtrImplicitArgPosition(); + /// \returns Code object version. 
unsigned getAmdhsaCodeObjectVersion(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -163,6 +163,13 @@ return 0; } +unsigned getQueuePtrImplicitArgPosition() { + if (AmdhsaCodeObjectVersion == 5) + return 200; + llvm_unreachable("queue_ptr is supported only by code object version 5"); + return 0; +} + #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -26,7 +26,7 @@ ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -40,7 +40,7 @@ ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -79,7 +79,7 @@ ; ; GFX9-LABEL: is_private_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 @@ -94,7 +94,7 @@ ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_getreg_b32 s0, 
hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -26,7 +26,7 @@ ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -40,7 +40,7 @@ ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -79,7 +79,7 @@ ; ; GFX9-LABEL: is_local_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 @@ -94,7 +94,7 @@ ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll @@ -93,7 +93,7 @@ ; GFX8-NEXT: - .offset: 220 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_shared_base -; CHECK-NEXT: - .address_space: global +; CHECK: - .address_space: global ; CHECK-NEXT: 
.offset: 224 ; CHECK-NEXT: .size: 8 ; CHECK-NEXT: .value_kind: hidden_queue_ptr diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll @@ -24,9 +24,16 @@ } ; CHECK: - .args: -; CHECK: .offset: 208 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9: .offset: 200 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_private_base +; PRE-GFX9-NEXT: .offset: 204 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_shared_base +; GFX9-NOT: .value_kind: hidden_multigrid_sync_arg +; GFX9-NOT: .value_kind: hidden_private_base +; CHECK-NOT: .value_kind: hidden_queue_ptr ; CHECK: .name: is_shared_requires_queue_ptr ; CHECK: .symbol: is_shared_requires_queue_ptr.kd define amdgpu_kernel void @is_shared_requires_queue_ptr(i8* %ptr) { @@ -37,9 +44,16 @@ } ; CHECK: - .args: -; CHECK: .offset: 208 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9: .offset: 200 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_private_base +; PRE-GFX9-NEXT: .offset: 204 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_shared_base +; GFX9-NOT: .value_kind: hidden_private_base +; GFX9-NOT: .value_kind: hidden_shared_base +; CHECK-NOT: .value_kind: hidden_queue_ptr ; CHECK: .name: is_private_requires_queue_ptr ; CHECK: .symbol: is_private_requires_queue_ptr.kd define amdgpu_kernel void @is_private_requires_queue_ptr(i8* %ptr) { @@ -50,9 +64,20 @@ } ; CHECK: - .args: -; CHECK: .offset: 200 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9: .offset: 192 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_private_base +; 
PRE-GFX9-NEXT: .offset: 196 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_shared_base +; PRE-GFX9-NEXT: .address_space: global +; PRE-GFX9-NEXT: .offset: 200 +; PRE-GFX9-NEXT: .size: 8 +; PRE-GFX9-NEXT: .value_kind: hidden_queue_ptr +; GFX9-NOT: .value_kind: hidden_private_base +; GFX9-NOT: .value_kind: hidden_shared_base +; GFX9-NOT: .value_kind: hidden_queue_ptr ; CHECK: .name: trap_requires_queue_ptr ; CHECK: .symbol: trap_requires_queue_ptr.kd define amdgpu_kernel void @trap_requires_queue_ptr() { @@ -60,17 +85,6 @@ unreachable } -; CHECK: - .args: -; CHECK: .offset: 200 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr -; CHECK: .name: debugtrap_requires_queue_ptr -; CHECK: .symbol: debugtrap_requires_queue_ptr.kd -define amdgpu_kernel void @debugtrap_requires_queue_ptr() { - call void @llvm.debugtrap() - unreachable -} - ; CHECK: - .args: ; CHECK: .offset: 208 ; CHECK-NEXT: .size: 8 diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-queueptr-v5.ll @@ -0,0 +1,301 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 --amdhsa-code-object-version=5 < %s | FileCheck --check-prefix=CHECK %s + +declare void @function1() + +declare void @function2() #0 + +; Function Attrs: noinline +define void @function3(i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink) #2 { + store i8 addrspace(4)* %argptr, i8 addrspace(4)* addrspace(1)* %sink, align 8 + ret void +} + +; Function Attrs: noinline +define void @function4(i64 %arg, i64* %a) #2 { + store i64 %arg, i64* %a + ret void +} + +; Function Attrs: noinline +define void @function5(i8 addrspace(4)* %ptr, i64* %sink) #2 { + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 
168 + %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)* + %x = load i64, i64 addrspace(4)* %cast + store i64 %x, i64* %sink + ret void +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare align 4 i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1 + +; CHECK: amdhsa.kernels: +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel10 +define amdgpu_kernel void @test_kernel10(i8* %a) { + store i8 3, i8* %a, align 1 + ret void +} + +; Call to an extern function + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel20 +define amdgpu_kernel void @test_kernel20(i8* %a) { + call void @function1() + store i8 3, i8* %a, align 1 + ret void +} + +; Explicit attribute on kernel + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel21 +define amdgpu_kernel void @test_kernel21(i8* %a) #0 { + call void @function1() + store i8 3, i8* %a, align 1 + ret void +} + +; Explicit attribute on extern callee + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel22 +define amdgpu_kernel void @test_kernel22(i8* %a) { + call void @function2() + store i8 3, i8* %a, align 1 + ret void +} + +; Access more bytes than the pointer size + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel30 +define amdgpu_kernel void @test_kernel30(i128* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 192 + %cast = bitcast i8 addrspace(4)* %gep to i128 addrspace(4)* + %x = load i128, i128 addrspace(4)* %cast + store i128 %x, i128* %a + ret void +} + +; Typical load of queue pointer + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel40 +define amdgpu_kernel void @test_kernel40(i64* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200 + 
%cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)* + %x = load i64, i64 addrspace(4)* %cast + store i64 %x, i64* %a + ret void +} + +; Typical usage, overridden by explicit attribute on kernel + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel41 +define amdgpu_kernel void @test_kernel41(i64* %a) #0 { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200 + %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)* + %x = load i64, i64 addrspace(4)* %cast + store i64 %x, i64* %a + ret void +} + +; Access to implicit arg before the queue pointer + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel42 +define amdgpu_kernel void @test_kernel42(i64* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 192 + %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)* + %x = load i64, i64 addrspace(4)* %cast + store i64 %x, i64* %a + ret void +} + +; Access to implicit arg after the queue pointer + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel43 +define amdgpu_kernel void @test_kernel43(i64* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 208 + %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)* + %x = load i64, i64 addrspace(4)* %cast + store i64 %x, i64* %a + ret void +} + +; Accessing a byte just before the queue pointer + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel44 +define amdgpu_kernel void @test_kernel44(i8* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 199 + %x = load i8, i8 addrspace(4)* %gep, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Accessing a byte 
inside the queue pointer + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel45 +define amdgpu_kernel void @test_kernel45(i8* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200 + %x = load i8, i8 addrspace(4)* %gep, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Accessing a byte inside the queue pointer + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel46 +define amdgpu_kernel void @test_kernel46(i8* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 207 + %x = load i8, i8 addrspace(4)* %gep, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Accessing a byte just after the queue pointer + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel47 +define amdgpu_kernel void @test_kernel47(i8* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 208 + %x = load i8, i8 addrspace(4)* %gep, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Access with an unknown offset + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel50 +define amdgpu_kernel void @test_kernel50(i8* %a, i32 %b) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 %b + %x = load i8, i8 addrspace(4)* %gep, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Multiple geps reaching the queue pointer argument. 
+ +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel51 +define amdgpu_kernel void @test_kernel51(i8* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16 + %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 184 + %x = load i8, i8 addrspace(4)* %gep2, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Multiple geps not reaching the queue pointer argument. + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel52 +define amdgpu_kernel void @test_kernel52(i8* %a) { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep1 = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 16 + %gep2 = getelementptr inbounds i8, i8 addrspace(4)* %gep1, i64 16 + %x = load i8, i8 addrspace(4)* %gep2, align 1 + store i8 %x, i8* %a, align 1 + ret void +} + +; Queue pointer used inside a function call + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel60 +define amdgpu_kernel void @test_kernel60(i64* %a) #2 { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 200 + %cast = bitcast i8 addrspace(4)* %gep to i64 addrspace(4)* + %x = load i64, i64 addrspace(4)* %cast + call void @function4(i64 %x, i64* %a) + ret void +} + +; Queue pointer retrieved inside a function call; chain of geps + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel61 +define amdgpu_kernel void @test_kernel61(i64* %a) #2 { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i64 32 + call void @function5(i8 addrspace(4)* %gep, i64* %a) + ret void +} + +; Pointer captured + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel70 +define amdgpu_kernel void @test_kernel70(i8 addrspace(4)* addrspace(1)* 
%sink) #2 { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42 + store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink, align 8 + ret void +} + +; Pointer captured inside function call + +; CHECK: - .args: +; CHECK: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel71 +define amdgpu_kernel void @test_kernel71(i8 addrspace(4)* addrspace(1)* %sink) #2 { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42 + call void @function3(i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* %sink) + ret void +} + +; Ineffective pointer capture + +; CHECK: - .args: +; CHECK-NOT: hidden_queue_ptr +; CHECK-LABEL: .name: test_kernel72 +define amdgpu_kernel void @test_kernel72() #2 { + %ptr = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, i8 addrspace(4)* %ptr, i32 42 + store i8 addrspace(4)* %gep, i8 addrspace(4)* addrspace(1)* undef, align 8 + ret void +} + +attributes #0 = { "amdgpu-no-queue-ptr" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { noinline } diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll --- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -1,9 +1,8 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=HSA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=HSA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=DOORBELL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=DOORBELL %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=HSA %s declare void 
@llvm.trap() #0 -declare void @llvm.debugtrap() #1 ; HSA: .amdhsa_kernel trap ; HSA-NEXT: .amdhsa_group_segment_fixed_size 0 @@ -13,6 +12,14 @@ ; HSA-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .end_amdhsa_kernel +; DOORBELL: .amdhsa_kernel trap +; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0 +; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 +; DOORBELL-NEXT: .amdhsa_kernarg_size 8 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 6 +; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; DOORBELL: .end_amdhsa_kernel + define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) { store volatile i32 1, i32 addrspace(1)* %arg0 call void @llvm.trap() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -26,7 +26,7 @@ ; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 ; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -28,7 +28,7 @@ ; GFX9-DAG: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] ; GCN: s_cbranch_vccnz diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ 
b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -38,7 +38,7 @@ ; ; NOHSA-TRAP-GFX900-V4-LABEL: trap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ ; ; HSA-TRAP-GFX900-V4-LABEL: trap: ; HSA-TRAP-GFX900-V4: ; %bb.0: -; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -340,7 +340,7 @@ ; ; HSA-NOTRAP-GFX900-V4-LABEL: trap: ; HSA-NOTRAP-GFX900-V4: ; %bb.0: -; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -391,7 +391,7 @@ ; ; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -643,7 +643,7 @@ ; ; HSA-TRAP-GFX900-V4-LABEL: non_entry_trap: ; HSA-TRAP-GFX900-V4: ; %bb.0: ; %entry -; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -762,7 +762,7 @@ ; ; HSA-NOTRAP-GFX900-V4-LABEL: non_entry_trap: 
; HSA-NOTRAP-GFX900-V4: ; %bb.0: ; %entry -; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -793,7 +793,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V2: ; %bb.0: -; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -806,7 +806,7 @@ ; ; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V3: ; %bb.0: -; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -819,7 +819,7 @@ ; ; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 @@ -852,7 +852,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: enable_mem_ordered = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_fwd_progress = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; HSA-TRAP-GFX803-V2-NEXT: user_sgpr_count = 8 +; HSA-TRAP-GFX803-V2-NEXT: user_sgpr_count = 6 ; HSA-TRAP-GFX803-V2-NEXT: enable_trap_handler = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_workgroup_id_x = 1 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_workgroup_id_y 
= 0 @@ -864,7 +864,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: enable_exception = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_private_segment_buffer = 1 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_dispatch_ptr = 0 -; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_queue_ptr = 1 +; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_queue_ptr = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_dispatch_id = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -884,7 +884,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX803-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX803-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX803-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-TRAP-GFX803-V2-NEXT: wavefront_sgpr_count = 6 ; HSA-TRAP-GFX803-V2-NEXT: workitem_vgpr_count = 4 ; HSA-TRAP-GFX803-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX803-V2-NEXT: reserved_vgpr_count = 0 @@ -900,7 +900,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: runtime_loader_kernel_symbol = 0 ; HSA-TRAP-GFX803-V2-NEXT: .end_amd_kernel_code_t ; HSA-TRAP-GFX803-V2-NEXT: ; %bb.0: -; HSA-TRAP-GFX803-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-V2-NEXT: s_waitcnt lgkmcnt(0) @@ -915,7 +915,7 @@ ; ; HSA-TRAP-GFX803-V3-LABEL: debugtrap: ; HSA-TRAP-GFX803-V3: ; %bb.0: -; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0) @@ -930,7 +930,7 @@ ; ; HSA-TRAP-GFX803-V4-LABEL: debugtrap: ; HSA-TRAP-GFX803-V4: ; %bb.0: -; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1 ; 
HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -965,7 +965,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; HSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 8 +; HSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 6 ; HSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 @@ -977,7 +977,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: enable_exception = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 1 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 -; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1 +; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -997,7 +997,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 ; HSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -1013,7 +1013,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 ; HSA-TRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t ; HSA-TRAP-GFX900-V2-NEXT: ; %bb.0: -; HSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -1027,7 +1027,7 @@ ; ; HSA-TRAP-GFX900-V3-LABEL: debugtrap: ; HSA-TRAP-GFX900-V3: ; 
%bb.0: -; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -1041,7 +1041,7 @@ ; ; HSA-TRAP-GFX900-V4-LABEL: debugtrap: ; HSA-TRAP-GFX900-V4: ; %bb.0: -; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 @@ -1075,7 +1075,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: user_sgpr_count = 8 +; HSA-NOTRAP-GFX900-V2-NEXT: user_sgpr_count = 6 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_trap_handler = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 @@ -1087,7 +1087,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: enable_exception = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1 +; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -1107,7 +1107,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-NOTRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 ; HSA-NOTRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 ; HSA-NOTRAP-GFX900-V2-NEXT: 
reserved_vgpr_first = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -1123,7 +1123,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t ; HSA-NOTRAP-GFX900-V2-NEXT: ; %bb.0: -; HSA-NOTRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -1136,7 +1136,7 @@ ; ; HSA-NOTRAP-GFX900-V3-LABEL: debugtrap: ; HSA-NOTRAP-GFX900-V3: ; %bb.0: -; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -1149,7 +1149,7 @@ ; ; HSA-NOTRAP-GFX900-V4-LABEL: debugtrap: ; HSA-NOTRAP-GFX900-V4: ; %bb.0: -; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll --- a/llvm/test/CodeGen/AMDGPU/trap.ll +++ b/llvm/test/CodeGen/AMDGPU/trap.ll @@ -51,11 +51,11 @@ ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_debugtrap: ; HSA-TRAP: enable_trap_handler = 0