diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -164,6 +164,28 @@ NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. }; +// Enumerate different types of result-returning VMEM operations. Although +// s_waitcnt orders them all with a single vmcnt counter, in the absence of +// s_waitcnt only instructions of the same VmemType are guaranteed to write +// their results in order -- so there is no need to insert an s_waitcnt between +// two instructions of the same type that write the same vgpr. +enum VmemType { + // BUF instructions and MIMG instructions without a sampler. + VMEM_NOSAMPLER, + // MIMG instructions with a sampler. + VMEM_SAMPLER, +}; + +VmemType getVmemType(const MachineInstr &Inst) { + assert(SIInstrInfo::isVMEM(Inst)); + if (!SIInstrInfo::isMIMG(Inst)) + return VMEM_NOSAMPLER; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode()); + return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler + ? VMEM_SAMPLER + : VMEM_NOSAMPLER; +} + void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { switch (T) { case VM_CNT: @@ -281,6 +303,18 @@ LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; } + // Return true if there might be pending writes to the specified vgpr by VMEM + // instructions with types different from V. + bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { + assert(GprNo < NUM_ALL_VGPRS); + return VgprVmemTypes[GprNo] & ~(1 << V); + } + + void clearVgprVmemTypes(int GprNo) { + assert(GprNo < NUM_ALL_VGPRS); + VgprVmemTypes[GprNo] = 0; + } + void print(raw_ostream &); void dump() { print(dbgs()); } @@ -337,6 +371,9 @@ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; // Wait cnt scores for every sgpr, only lgkmcnt is relevant. unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -617,8 +654,15 @@ if (!Op.isReg() || !Op.isDef()) continue; RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I); - if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS) - continue; + if (T == VM_CNT) { + if (Interval.first >= NUM_ALL_VGPRS) + continue; + if (SIInstrInfo::isVMEM(Inst)) { + VmemType V = getVmemType(Inst); + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) + VgprVmemTypes[RegNo] |= 1 << V; + } + } for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { setRegScore(RegNo, T, CurrScore); } @@ -982,8 +1026,17 @@ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(*MRI, Op.getReg())) { - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the + // previous write and this write are the same type of VMEM + // instruction, in which case they're guaranteed to write their + // results in order anyway. + if (Op.isUse() || !SIInstrInfo::isVMEM(MI) || + ScoreBrackets.hasOtherPendingVmemTypes(RegNo, + getVmemType(MI))) { + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.clearVgprVmemTypes(RegNo); + } if (Op.isDef()) { ScoreBrackets.determineWait( EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); @@ -1296,6 +1349,14 @@ RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); } + if (T == VM_CNT) { + for (int J = 0; J <= VgprUB; J++) { + unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J]; + RegStrictDom |= NewVmemTypes != VgprVmemTypes[J]; + VgprVmemTypes[J] = NewVmemTypes; + } + } + if (T == LGKM_CNT) { for (int J = 0; J <= SgprUB; J++) { RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -16,7 +16,7 @@ ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3] ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]] -; W64: s_waitcnt vmcnt(0) +; W64: s_nop 0 ; W64: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[CMP]] ; W64: s_cbranch_execnz [[LOOPBB]] @@ -34,7 +34,7 @@ ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3] ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]] -; W32: s_waitcnt vmcnt(0) +; W32: s_nop 0 ; W32: buffer_load_format_x [[RES:v[0-9]+]], v4, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]] ; W32: s_cbranch_execnz [[LOOPBB]] @@ -59,7 +59,7 @@ ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3] ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]] -; W64: s_waitcnt vmcnt(0) +; W64: s_nop 0 ; W64: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[CMP]] ; W64: s_cbranch_execnz [[LOOPBB0]] @@ -77,7 +77,7 @@ ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7] ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]] -; W64: s_waitcnt vmcnt(0) +; W64: s_nop 0 ; W64: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[CMP]] ; W64: s_cbranch_execnz [[LOOPBB1]] @@ -99,7 +99,7 @@ ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3] ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]] -; W32: s_waitcnt vmcnt(0) +; W32: s_nop 0 ; W32: buffer_load_format_x [[RES0:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]] ; W32: s_cbranch_execnz [[LOOPBB0]] @@ -117,7 +117,7 @@ ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7] ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]] -; W32: s_waitcnt vmcnt(0) +; W32: s_nop 0 ; W32: buffer_load_format_x [[RES1:v[0-9]+]], v8, s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]] ; W32: s_cbranch_execnz [[LOOPBB1]] @@ -150,7 +150,7 @@ ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3] ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]] -; W64: s_waitcnt vmcnt(0) +; W64: s_nop 0 ; W64: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[CMP]] ; W64: s_cbranch_execnz [[LOOPBB0]] @@ -171,7 +171,7 @@ ; W64: v_cmp_eq_u64_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7] ; W64: s_and_b64 [[CMP:s\[[0-9]+:[0-9]+\]]], vcc, [[CMP0]] ; W64: s_and_saveexec_b64 [[CMP]], [[CMP]] -; W64: s_waitcnt vmcnt(0) +; W64: s_nop 0 ; W64: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W64: s_xor_b64 exec, exec, [[CMP]] ; W64: s_cbranch_execnz [[LOOPBB1]] @@ -196,7 +196,7 @@ ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[2:3] ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]] -; W32: s_waitcnt vmcnt(0) +; W32: s_nop 0 ; W32: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]] ; W32: s_cbranch_execnz [[LOOPBB0]] @@ -217,7 +217,7 @@ ; W32: v_cmp_eq_u64_e64 [[CMP0:s[0-9]+]], s{{\[}}[[SRSRC2]]:[[SRSRC3]]{{\]}}, v[6:7] ; W32: s_and_b32 [[CMP:s[0-9]+]], vcc_lo, [[CMP0]] ; W32: s_and_saveexec_b32 [[CMP]], [[CMP]] -; W32: s_waitcnt vmcnt(0) +; W32: s_nop 0 ; W32: buffer_load_format_x [[RES]], [[IDX]], s{{\[}}[[SRSRC0]]:[[SRSRC3]]{{\]}}, 0 idxen ; W32: s_xor_b32 exec_lo, exec_lo, [[CMP]] ; W32: s_cbranch_execnz [[LOOPBB1]] @@ -240,11 +240,8 @@ ; W64-O0: [[LOOPBB0:BB[0-9]+_[0-9]+]]: ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]] @@ -278,11 +275,8 @@ ; W64-O0: [[LOOPBB1:BB[0-9]+_[0-9]+]]: ; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], {{.*}} ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], {{.*}} ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], {{.*}} ; 4-byte Folded Reload -; W64-O0: s_waitcnt vmcnt(0) ; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], {{.*}} ; 4-byte Folded Reload ; W64-O0: s_waitcnt vmcnt(0) ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP0:[0-9]+]], v[[VRSRC0]] diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -742,8 +742,8 @@ ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16 ; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshl_b64 v[6:7], v[6:7], v13 diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -9,7 +9,6 @@ ; DEFAULT: s_load_dwordx4 ; DEFAULT: s_waitcnt lgkmcnt(0) ; DEFAULT: buffer_load_format_xyzw -; DEFAULT: s_waitcnt vmcnt(0) ; DEFAULT: buffer_load_format_xyzw ; DEFAULT: s_waitcnt vmcnt(0) ; DEFAULT: exp diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmem-waw.mir @@ -0,0 +1,70 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s + +# Two buffer loads with overlapping outputs. No waitcnt required. +--- +name: buffer_buffer +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; GFX9-LABEL: name: buffer_buffer + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec +... + +# Two tbuffer loads with overlapping outputs. No waitcnt required. +--- +name: tbuffer_tbuffer +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; GFX9-LABEL: name: tbuffer_tbuffer + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = TBUFFER_LOAD_FORMAT_X_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec +... + +# Two gathers with overlapping outputs. (Note gathers can't be trimmed because +# dmask means something different.) No waitcnt required. +--- +name: gather_gather +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX9-LABEL: name: gather_gather + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr10_vgpr11_vgpr12_vgpr13 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr0_vgpr1_vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr13_vgpr14_vgpr15_vgpr16 = IMAGE_GATHER4_LZ_O_V4_V3 $vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +... + +# Image load vs image sample. Waitcnt required because they are not guaranteed +# to write their results in order, despite both using the s_waitcnt vmcnt +# counter. +--- +name: nosampler_sampler +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9-LABEL: name: nosampler_sampler + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GFX9: S_WAITCNT 3952 + ; GFX9: $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec + $vgpr4 = IMAGE_LOAD_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 2, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr4 = IMAGE_SAMPLE_L_V1_V4 $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9_sgpr10_sgpr11, 8, 0, 0, 0, 0, 0, 0, -1, 0, implicit $exec +...