Index: llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -393,7 +393,7 @@ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr()) { + if (MFI.needsQueuePtrUserSGPRs()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -1084,7 +1084,7 @@ if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr()) + if (MFI->needsQueuePtrUserSGPRs()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) Index: llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -50,7 +50,9 @@ // TODO: We should not add the attributes if the known compile time workgroup // size is 1 for y/z. static ImplicitArgumentMask -intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) { +intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, + bool HasApertureRegs, bool SupportsGetDoorBellID ) { + unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion(); switch (ID) { case Intrinsic::amdgcn_workitem_id_x: NonKernelOnly = true; @@ -77,13 +79,18 @@ case Intrinsic::amdgcn_implicitarg_ptr: return IMPLICIT_ARG_PTR; case Intrinsic::amdgcn_queue_ptr: + NeedsImplicit = (CodeObjectVersion == 5); + return QUEUE_PTR; case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: - // TODO: Does not require the queue pointer on gfx9+ + if (HasApertureRegs) + return NOT_IMPLICIT_INPUT; + return CodeObjectVersion == 5 ? 
IMPLICIT_ARG_PTR : QUEUE_PTR; case Intrinsic::trap: - case Intrinsic::debugtrap: - IsQueuePtr = true; - return QUEUE_PTR; + if (SupportsGetDoorBellID) + return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR; + NeedsImplicit = (CodeObjectVersion == 5); + return QUEUE_PTR; default: return NOT_IMPLICIT_INPUT; } @@ -129,6 +136,12 @@ return ST.hasApertureRegs(); } + /// Check if the subtarget supports GetDoorbellID. + bool supportsGetDoorbellID(Function &F) { + const GCNSubtarget &ST = TM.getSubtarget(F); + return ST.supportsGetDoorbellID(); + } + std::pair getFlatWorkGroupSizes(const Function &F) { const GCNSubtarget &ST = TM.getSubtarget(F); return ST.getFlatWorkGroupSizes(F); @@ -381,7 +394,10 @@ bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); - bool NeedsQueuePtr = false; + bool NeedsImplicit = false; + auto &InfoCache = static_cast(A.getInfoCache()); + bool HasApertureRegs = InfoCache.hasApertureRegs(*F); + bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F); for (Function *Callee : AAEdges.getOptimisticEdges()) { Intrinsic::ID IID = Callee->getIntrinsicID(); @@ -394,19 +410,26 @@ bool NonKernelOnly = false; ImplicitArgumentMask AttrMask = - intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr); + intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, + HasApertureRegs, SupportsGetDoorbellID); if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); } } - if (!NeedsQueuePtr) { - NeedsQueuePtr = checkForQueuePtr(A); - } + // Need implicit arg ptr to reference queue_ptr, private_base, and + // shared_base etc. + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); - if (NeedsQueuePtr) { - removeAssumedBits(QUEUE_PTR); + if (checkForQueuePtr(A)) { + // For code object version 5, we need private_base or shared_base from + // implicit kernargs. 
+ if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) + removeAssumedBits(IMPLICIT_ARG_PTR); + else + removeAssumedBits(QUEUE_PTR); } if (funcRetrievesHostcallPtr(A)) { Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -479,7 +479,7 @@ CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.needsQueuePtrUserSGPRs()) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -1036,8 +1036,9 @@ Offset += 72; // Reserved. - // hidden_private_base and hidden_shared_base are only used by GFX8. - if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // hidden_private_base and hidden_shared_base are only used when the + // subtarget does not have aperture registers. + if (!ST.hasApertureRegs()) { emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args); emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args); } else Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2106,7 +2106,7 @@ if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (Info.hasQueuePtr()) + if (Info.needsQueuePtrUserSGPRs()) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. 
This is a @@ -2153,7 +2153,7 @@ CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr()) { + if (Info.needsQueuePtrUserSGPRs()) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); Index: llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -649,6 +649,10 @@ return QueuePtr; } + bool needsQueuePtrUserSGPRs() const { + return QueuePtr && AMDGPU::getAmdhsaCodeObjectVersion() < 5; + } + bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -57,6 +57,9 @@ /// \returns The offset of the hostcall pointer argument from implicitarg_ptr unsigned getHostcallImplicitArgPosition(); +/// \returns amdhsa code object version. 
+unsigned getAmdhsaCodeObjectVersion(); + struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -152,6 +152,10 @@ } } +unsigned getAmdhsaCodeObjectVersion() { + return AmdhsaCodeObjectVersion; +} + #define GET_MIMGBaseOpcodesTable_IMPL #define GET_MIMGDimInfoTable_IMPL #define GET_MIMGInfoTable_IMPL Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -26,7 +26,7 @@ ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -40,7 +40,7 @@ ; ; GFX10-LABEL: is_private_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -79,7 +79,7 @@ ; ; GFX9-LABEL: is_private_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 @@ -94,7 +94,7 @@ ; ; GFX10-LABEL: is_private_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; 
GFX10-NEXT: s_lshl_b32 s0, s0, 16 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -26,7 +26,7 @@ ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc @@ -40,7 +40,7 @@ ; ; GFX10-LABEL: is_local_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] glc dlc @@ -79,7 +79,7 @@ ; ; GFX9-LABEL: is_local_sgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 @@ -94,7 +94,7 @@ ; ; GFX10-LABEL: is_local_sgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16) ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-hidden-args-v5.ll @@ -89,7 +89,7 @@ ; GFX8-NEXT: - .offset: 220 ; GFX8-NEXT: .size: 4 ; GFX8-NEXT: .value_kind: hidden_shared_base -; CHECK-NEXT: - .address_space: global +; CHECK: - .address_space: global ; CHECK-NEXT: .offset: 224 ; CHECK-NEXT: .size: 8 ; 
CHECK-NEXT: .value_kind: hidden_queue_ptr Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll +++ llvm/test/CodeGen/AMDGPU/hsa-metadata-queue-ptr-v5.ll @@ -24,9 +24,16 @@ } ; CHECK: - .args: -; CHECK: .offset: 208 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9: .offset: 200 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_private_base +; PRE-GFX9-NEXT: .offset: 204 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_shared_base +; GFX9-NOT: .value_kind: hidden_multigrid_sync_arg +; GFX9-NOT: .value_kind: hidden_private_base +; CHECK-NOT: .value_kind: hidden_queue_ptr ; CHECK: .name: is_shared_requires_queue_ptr ; CHECK: .symbol: is_shared_requires_queue_ptr.kd define amdgpu_kernel void @is_shared_requires_queue_ptr(i8* %ptr) { @@ -37,9 +44,16 @@ } ; CHECK: - .args: -; CHECK: .offset: 208 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9: .offset: 200 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_private_base +; PRE-GFX9-NEXT: .offset: 204 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_shared_base +; GFX9-NOT: .value_kind: hidden_private_base +; GFX9-NOT: .value_kind: hidden_shared_base +; CHECK-NOT: .value_kind: hidden_queue_ptr ; CHECK: .name: is_private_requires_queue_ptr ; CHECK: .symbol: is_private_requires_queue_ptr.kd define amdgpu_kernel void @is_private_requires_queue_ptr(i8* %ptr) { @@ -50,9 +64,20 @@ } ; CHECK: - .args: -; CHECK: .offset: 200 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr +; CHECK: .value_kind: hidden_multigrid_sync_arg +; PRE-GFX9: .offset: 192 +; PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_private_base +; PRE-GFX9-NEXT: .offset: 196 +; 
PRE-GFX9-NEXT: .size: 4 +; PRE-GFX9-NEXT: .value_kind: hidden_shared_base +; PRE-GFX9-NEXT: .address_space: global +; PRE-GFX9-NEXT: .offset: 200 +; PRE-GFX9-NEXT: .size: 8 +; PRE-GFX9-NEXT: .value_kind: hidden_queue_ptr +; GFX9-NOT: .value_kind: hidden_private_base +; GFX9-NOT: .value_kind: hidden_shared_base +; GFX9-NOT: .value_kind: hidden_queue_ptr ; CHECK: .name: trap_requires_queue_ptr ; CHECK: .symbol: trap_requires_queue_ptr.kd define amdgpu_kernel void @trap_requires_queue_ptr() { @@ -60,17 +85,6 @@ unreachable } -; CHECK: - .args: -; CHECK: .offset: 200 -; CHECK-NEXT: .size: 8 -; CHECK-NEXT: .value_kind: hidden_queue_ptr -; CHECK: .name: debugtrap_requires_queue_ptr -; CHECK: .symbol: debugtrap_requires_queue_ptr.kd -define amdgpu_kernel void @debugtrap_requires_queue_ptr() { - call void @llvm.debugtrap() - unreachable -} - ; CHECK: - .args: ; CHECK: .offset: 208 ; CHECK-NEXT: .size: 8 Index: llvm/test/CodeGen/AMDGPU/kernarg-size.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/kernarg-size.ll +++ llvm/test/CodeGen/AMDGPU/kernarg-size.ll @@ -1,9 +1,8 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=HSA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=HSA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=DOORBELL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=DOORBELL %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 < %s | FileCheck --check-prefix=HSA %s declare void @llvm.trap() #0 -declare void @llvm.debugtrap() #1 ; HSA: .amdhsa_kernel trap ; HSA-NEXT: .amdhsa_group_segment_fixed_size 0 @@ -13,6 +12,14 @@ ; HSA-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 ; HSA: .end_amdhsa_kernel +; DOORBELL: .amdhsa_kernel trap +; DOORBELL-NEXT: 
.amdhsa_group_segment_fixed_size 0 +; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0 +; DOORBELL-NEXT: .amdhsa_kernarg_size 8 +; DOORBELL-NEXT: .amdhsa_user_sgpr_count 6 +; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1 +; DOORBELL: .end_amdhsa_kernel + define amdgpu_kernel void @trap(i32 addrspace(1)* nocapture readonly %arg0) { store volatile i32 1, i32 addrspace(1)* %arg0 call void @llvm.trap() Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -26,7 +26,7 @@ ; GFX9-DAG: s_getreg_b32 [[APERTURE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16) ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 ; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -28,7 +28,7 @@ ; GFX9-DAG: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16 ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} -; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}} +; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] ; GCN: s_cbranch_vccnz Index: llvm/test/CodeGen/AMDGPU/trap-abis.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -38,7 +38,7 @@ ; ; NOHSA-TRAP-GFX900-V4-LABEL: trap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -242,7 +242,7 @@ ; ; HSA-TRAP-GFX900-V4-LABEL: trap: ; HSA-TRAP-GFX900-V4: ; %bb.0: -; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -340,7 +340,7 @@ ; ; HSA-NOTRAP-GFX900-V4-LABEL: trap: ; HSA-NOTRAP-GFX900-V4: ; %bb.0: -; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -393,7 +393,7 @@ ; ; NOHSA-TRAP-GFX900-V4-LABEL: non_entry_trap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: ; %entry -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; NOHSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -651,7 +651,7 @@ ; ; HSA-TRAP-GFX900-V4-LABEL: non_entry_trap: ; HSA-TRAP-GFX900-V4: ; %bb.0: ; %entry -; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; HSA-TRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -773,7 +773,7 @@ ; ; HSA-NOTRAP-GFX900-V4-LABEL: non_entry_trap: ; HSA-NOTRAP-GFX900-V4: ; %bb.0: ; %entry -; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; 
HSA-NOTRAP-GFX900-V4-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NOTRAP-GFX900-V4-NEXT: global_load_dword v1, v0, s[0:1] glc @@ -805,7 +805,7 @@ define amdgpu_kernel void @debugtrap(i32 addrspace(1)* nocapture readonly %arg0) { ; NOHSA-TRAP-GFX900-V2-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V2: ; %bb.0: -; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -818,7 +818,7 @@ ; ; NOHSA-TRAP-GFX900-V3-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V3: ; %bb.0: -; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -831,7 +831,7 @@ ; ; NOHSA-TRAP-GFX900-V4-LABEL: debugtrap: ; NOHSA-TRAP-GFX900-V4: ; %bb.0: -; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24 +; NOHSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; NOHSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 @@ -864,7 +864,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: enable_mem_ordered = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_fwd_progress = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; HSA-TRAP-GFX803-V2-NEXT: user_sgpr_count = 8 +; HSA-TRAP-GFX803-V2-NEXT: user_sgpr_count = 6 ; HSA-TRAP-GFX803-V2-NEXT: enable_trap_handler = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_workgroup_id_x = 1 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_workgroup_id_y = 0 @@ -876,7 +876,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: enable_exception = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_private_segment_buffer = 1 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_dispatch_ptr = 0 -; HSA-TRAP-GFX803-V2-NEXT: 
enable_sgpr_queue_ptr = 1 +; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_queue_ptr = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_dispatch_id = 0 ; HSA-TRAP-GFX803-V2-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -896,7 +896,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX803-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX803-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX803-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-TRAP-GFX803-V2-NEXT: wavefront_sgpr_count = 6 ; HSA-TRAP-GFX803-V2-NEXT: workitem_vgpr_count = 4 ; HSA-TRAP-GFX803-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX803-V2-NEXT: reserved_vgpr_count = 0 @@ -912,7 +912,7 @@ ; HSA-TRAP-GFX803-V2-NEXT: runtime_loader_kernel_symbol = 0 ; HSA-TRAP-GFX803-V2-NEXT: .end_amd_kernel_code_t ; HSA-TRAP-GFX803-V2-NEXT: ; %bb.0: -; HSA-TRAP-GFX803-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-V2-NEXT: s_waitcnt lgkmcnt(0) @@ -927,7 +927,7 @@ ; ; HSA-TRAP-GFX803-V3-LABEL: debugtrap: ; HSA-TRAP-GFX803-V3: ; %bb.0: -; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt lgkmcnt(0) @@ -942,7 +942,7 @@ ; ; HSA-TRAP-GFX803-V4-LABEL: debugtrap: ; HSA-TRAP-GFX803-V4: ; %bb.0: -; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX803-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 1 ; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v3, 2 ; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt lgkmcnt(0) @@ -977,7 +977,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 ; 
HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; HSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 8 +; HSA-TRAP-GFX900-V2-NEXT: user_sgpr_count = 6 ; HSA-TRAP-GFX900-V2-NEXT: enable_trap_handler = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 @@ -989,7 +989,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: enable_exception = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 1 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 -; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1 +; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 ; HSA-TRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -1009,7 +1009,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-TRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-TRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-TRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 ; HSA-TRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-TRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -1025,7 +1025,7 @@ ; HSA-TRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 ; HSA-TRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t ; HSA-TRAP-GFX900-V2-NEXT: ; %bb.0: -; HSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -1039,7 +1039,7 @@ ; ; HSA-TRAP-GFX900-V3-LABEL: debugtrap: ; HSA-TRAP-GFX900-V3: ; %bb.0: -; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V3-NEXT: 
v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -1053,7 +1053,7 @@ ; ; HSA-TRAP-GFX900-V4-LABEL: debugtrap: ; HSA-TRAP-GFX900-V4: ; %bb.0: -; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-TRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-TRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 @@ -1087,7 +1087,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: enable_mem_ordered = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_fwd_progress = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: user_sgpr_count = 8 +; HSA-NOTRAP-GFX900-V2-NEXT: user_sgpr_count = 6 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_trap_handler = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_x = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_workgroup_id_y = 0 @@ -1099,7 +1099,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: enable_exception = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_private_segment_buffer = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_ptr = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 1 +; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_queue_ptr = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_dispatch_id = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -1119,7 +1119,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: gds_segment_byte_size = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: kernarg_segment_byte_size = 8 ; HSA-NOTRAP-GFX900-V2-NEXT: workgroup_fbarrier_count = 0 -; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 8 +; HSA-NOTRAP-GFX900-V2-NEXT: wavefront_sgpr_count = 6 ; HSA-NOTRAP-GFX900-V2-NEXT: workitem_vgpr_count = 3 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_first = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: reserved_vgpr_count = 0 @@ -1135,7 +1135,7 @@ ; HSA-NOTRAP-GFX900-V2-NEXT: runtime_loader_kernel_symbol = 0 ; HSA-NOTRAP-GFX900-V2-NEXT: .end_amd_kernel_code_t 
; HSA-NOTRAP-GFX900-V2-NEXT: ; %bb.0: -; HSA-NOTRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V2-NEXT: v_mov_b32_e32 v2, 2 @@ -1148,7 +1148,7 @@ ; ; HSA-NOTRAP-GFX900-V3-LABEL: debugtrap: ; HSA-NOTRAP-GFX900-V3: ; %bb.0: -; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V3-NEXT: v_mov_b32_e32 v2, 2 @@ -1161,7 +1161,7 @@ ; ; HSA-NOTRAP-GFX900-V4-LABEL: debugtrap: ; HSA-NOTRAP-GFX900-V4: ; %bb.0: -; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; HSA-NOTRAP-GFX900-V4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v1, 1 ; HSA-NOTRAP-GFX900-V4-NEXT: v_mov_b32_e32 v2, 2 Index: llvm/test/CodeGen/AMDGPU/trap.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/trap.ll +++ llvm/test/CodeGen/AMDGPU/trap.ll @@ -51,11 +51,11 @@ ; MESA-TRAP: .section .AMDGPU.config ; MESA-TRAP: .long 47180 -; MESA-TRAP-NEXT: .long 208 +; MESA-TRAP-NEXT: .long 204 ; NOMESA-TRAP: .section .AMDGPU.config ; NOMESA-TRAP: .long 47180 -; NOMESA-TRAP-NEXT: .long 144 +; NOMESA-TRAP-NEXT: .long 140 ; GCN-LABEL: {{^}}hsa_debugtrap: ; HSA-TRAP: enable_trap_handler = 0