diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -245,8 +245,8 @@ const SIRegisterInfo *TRI, unsigned OpNo) const; bool counterOutOfOrder(InstCounterType T) const; - bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; void determineWait(InstCounterType T, unsigned ScoreToWait, AMDGPU::Waitcnt &Wait) const; void applyWaitcnt(const AMDGPU::Waitcnt &Wait); @@ -418,7 +418,7 @@ } if (DebugCounter::isCounterSet(ForceLgkmCounter) && - DebugCounter::shouldExecute(ForceLgkmCounter)) { + DebugCounter::shouldExecute(ForceLgkmCounter)) { ForceEmitWaitcnt[LGKM_CNT] = true; } else { ForceEmitWaitcnt[LGKM_CNT] = false; @@ -442,6 +442,9 @@ WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets); + bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, const MachineInstr *MI); }; } // end anonymous namespace @@ -708,22 +711,19 @@ /// Simplify the waitcnt, in the sense of removing redundant counts, and return /// whether a waitcnt instruction is needed at all. -bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { - return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | - simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | - simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) | - simplifyWaitcnt(VS_CNT, Wait.VsCnt); +void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + simplifyWaitcnt(VM_CNT, Wait.VmCnt); + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); + simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); + simplifyWaitcnt(VS_CNT, Wait.VsCnt); } -bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, +void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, unsigned &Count) const { const unsigned LB = getScoreLB(T); const unsigned UB = getScoreUB(T); - if (Count < UB && UB - Count > LB) - return true; - - Count = ~0u; - return false; + if (Count >= UB || UB - Count <= LB) + Count = ~0u; } void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait, @@ -798,6 +798,107 @@ return new SIInsertWaitcnts(); } +/// Combine consecutive waitcnt instructions that precede \p MI and follow +/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added +/// by previous passes. Currently this pass conservatively assumes that these +/// preexisting waitcnt are required for correctness. +bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, + MachineInstr &OldWaitcntInstr, + AMDGPU::Waitcnt &Wait, + const MachineInstr *MI) { + bool Modified = false; + MachineInstr *WaitcntInstr = nullptr; + MachineInstr *WaitcntVsCntInstr = nullptr; + for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II); + &*II != MI; II = NextI, ++NextI) { + if (II->isDebugInstr()) + continue; + + if (II->getOpcode() == AMDGPU::S_WAITCNT) { + // Conservatively update required wait if this waitcnt was added in an + // earlier pass. In this case it will not exist in the tracked waitcnt + // set. + if (!TrackedWaitcntSet.count(&*II)) { + unsigned IEnc = II->getOperand(0).getImm(); + AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc); + Wait = Wait.combined(OldWait); + } + + // Merge consecutive waitcnt of the same type by erasing multiples. + if (!WaitcntInstr) { + WaitcntInstr = &*II; + } else { + II->eraseFromParent(); + Modified = true; + } + + } else { + assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); + assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); + if (!TrackedWaitcntSet.count(&*II)) { + unsigned OldVSCnt = + TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); + Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt); + } + + if (!WaitcntVsCntInstr) { + WaitcntVsCntInstr = &*II; + } else { + II->eraseFromParent(); + Modified = true; + } + } + } + + // Updated encoding of merged waitcnt with the required wait. + if (WaitcntInstr) { + if (Wait.hasWaitExceptVsCnt()) { + unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait); + unsigned OldEnc = WaitcntInstr->getOperand(0).getImm(); + if (OldEnc != NewEnc) { + WaitcntInstr->getOperand(0).setImm(NewEnc); + Modified = true; + } + ScoreBrackets.applyWaitcnt(Wait); + Wait.VmCnt = ~0u; + Wait.LgkmCnt = ~0u; + Wait.ExpCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr + << '\n'); + } else { + WaitcntInstr->eraseFromParent(); + Modified = true; + } + } + + if (WaitcntVsCntInstr) { + if (Wait.hasWaitVsCnt()) { + assert(ST->hasVscnt()); + unsigned OldVSCnt = + TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16) + ->getImm(); + if (Wait.VsCnt != OldVSCnt) { + TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16) + ->setImm(Wait.VsCnt); + Modified = true; + } + ScoreBrackets.applyWaitcnt(Wait); + Wait.VsCnt = ~0u; + + LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" + << "Old Instr: " << MI + << "New Instr: " << *WaitcntVsCntInstr << '\n'); + } else { + WaitcntVsCntInstr->eraseFromParent(); + Modified = true; + } + } + + return Modified; +} + static bool readsVCCZ(const MachineInstr &MI) { unsigned Opc = MI.getOpcode(); return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && @@ -833,12 +934,12 @@ MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr) { setForceEmitWaitcnt(); - bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); if (MI.isMetaInstruction()) return false; AMDGPU::Waitcnt Wait; + bool Modified = false; // See if this instruction has a forced S_WAITCNT VM. // TODO: Handle other cases of NeedsWaitcntVmBefore() @@ -1053,32 +1154,8 @@ } } - // Early-out if no wait is indicated. - if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { - bool Modified = false; - if (OldWaitcntInstr) { - for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); - &*II != &MI; II = NextI, ++NextI) { - if (II->isDebugInstr()) - continue; - - if (TrackedWaitcntSet.count(&*II)) { - TrackedWaitcntSet.erase(&*II); - II->eraseFromParent(); - Modified = true; - } else if (II->getOpcode() == AMDGPU::S_WAITCNT) { - int64_t Imm = II->getOperand(0).getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); - } else { - assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); - assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - auto W = TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm(); - ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt(~0u, ~0u, ~0u, W)); - } - } - } - return Modified; - } + // Verify that the wait is actually needed. + ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt()); @@ -1092,57 +1169,19 @@ if (ForceEmitWaitcnt[VS_CNT]) Wait.VsCnt = 0; - ScoreBrackets.applyWaitcnt(Wait); - - AMDGPU::Waitcnt OldWait; - bool Modified = false; - if (OldWaitcntInstr) { - for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II); - &*II != &MI; II = NextI, NextI++) { - if (II->isDebugInstr()) - continue; - - if (II->getOpcode() == AMDGPU::S_WAITCNT) { - unsigned IEnc = II->getOperand(0).getImm(); - AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc); - OldWait = OldWait.combined(IWait); - if (!TrackedWaitcntSet.count(&*II)) - Wait = Wait.combined(IWait); - unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait); - if (IEnc != NewEnc) { - II->getOperand(0).setImm(NewEnc); - Modified = true; - } - Wait.VmCnt = ~0u; - Wait.LgkmCnt = ~0u; - Wait.ExpCnt = ~0u; - } else { - assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT); - assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL); - - unsigned ICnt = TII->getNamedOperand(*II, AMDGPU::OpName::simm16) - ->getImm(); - OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt); - if (!TrackedWaitcntSet.count(&*II)) - Wait.VsCnt = std::min(Wait.VsCnt, ICnt); - if (Wait.VsCnt != ICnt) { - TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->setImm(Wait.VsCnt); - Modified = true; - } - Wait.VsCnt = ~0u; - } - - LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n" - << "Old Instr: " << MI - << "New Instr: " << *II << '\n'); - - if (!Wait.hasWait()) - return Modified; - } + // Try to merge the required wait with preexisting waitcnt instructions. + // Also erase redundant waitcnt. + Modified = + applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI); + } else { + // Update waitcnt brackets after determining the required wait. + ScoreBrackets.applyWaitcnt(Wait); } - if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) { + // Build new waitcnt instructions unless no wait is needed or the old waitcnt + // instruction was modified to handle the required wait. + if (Wait.hasWaitExceptVsCnt()) { unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) @@ -1155,7 +1194,7 @@ << "New Instr: " << *SWaitInst << '\n'); } - if (Wait.VsCnt != ~0u) { + if (Wait.hasWaitVsCnt()) { assert(ST->hasVscnt()); auto SWaitInst = @@ -1430,7 +1469,8 @@ Iter != E;) { MachineInstr &Inst = *Iter; - // Track pre-existing waitcnts from earlier iterations. + // Track pre-existing waitcnts that were added in earlier iterations or by + // the memory legalizer. if (Inst.getOpcode() == AMDGPU::S_WAITCNT || (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() && @@ -1556,6 +1596,28 @@ TrackedWaitcntSet.clear(); BlockInfos.clear(); + bool Modified = false; + + if (!MFI->isEntryFunction()) { + // Wait for any outstanding memory operations that the input registers may + // depend on. We can't track them and it's better to the wait after the + // costly call sequence. + + // TODO: Could insert earlier and schedule more liberally with operations + // that only use caller preserved registers. + MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + for (MachineBasicBlock::iterator E = EntryBB.end(); + I != E && (I->isPHI() || I->isMetaInstruction()); ++I) + ; + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); + if (ST->hasVscnt()) + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + + Modified = true; + } // Keep iterating over the blocks in reverse post order, inserting and // updating s_waitcnt where needed, until a fix point is reached. @@ -1563,7 +1625,6 @@ BlockInfos.insert({MBB, BlockInfo(MBB)}); std::unique_ptr Brackets; - bool Modified = false; bool Repeat; do { Repeat = false; @@ -1662,27 +1723,5 @@ } } } - - if (!MFI->isEntryFunction()) { - // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to the wait after the - // costly call sequence. - - // TODO: Could insert earlier and schedule more liberally with operations - // that only use caller preserved registers. - MachineBasicBlock &EntryBB = MF.front(); - MachineBasicBlock::iterator I = EntryBB.begin(); - for (MachineBasicBlock::iterator E = EntryBB.end(); - I != E && (I->isPHI() || I->isMetaInstruction()); ++I) - ; - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); - if (ST->hasVscnt()) - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - - Modified = true; - } - return Modified; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -477,6 +477,14 @@ return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u; } + bool hasWaitExceptVsCnt() const { + return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u; + } + + bool hasWaitVsCnt() const { + return VsCnt != ~0u; + } + bool dominates(const Waitcnt &Other) const { return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -184,7 +184,6 @@ ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x104, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 @@ -357,7 +356,6 @@ ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -72,7 +72,6 @@ ; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -788,7 +788,6 @@ ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x104, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 @@ -1419,7 +1418,6 @@ ; GFX9-NEXT: s_lshl_b32 s1, s0, 2 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 ; GFX9-NEXT: scratch_store_dword off, v0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -630,7 +630,6 @@ ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] @@ -706,7 +705,6 @@ ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] @@ -731,7 +729,6 @@ ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -192,7 +192,6 @@ ; NOLOOP: s_mov_b32 m0, 0{{$}} ; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; NOLOOP-NEXT: load_dword define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 { @@ -220,7 +219,6 @@ ; NOLOOP: s_mov_b32 m0, 0 ; NOLOOP: ds_gws_init v0 offset:7 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; NOLOOP-NEXT: ds_gws_barrier v0 offset:7 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -188,7 +188,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[2:3], v0 ; GFX7-NEXT: s_endpgm ; @@ -204,7 +203,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -220,7 +218,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -248,7 +245,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -285,7 +281,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[2:3], v0 ; GFX7-NEXT: s_endpgm ; @@ -303,7 +298,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -321,7 +315,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -351,7 +344,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1121,7 +1113,6 @@ ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1138,7 +1129,6 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1155,7 +1145,6 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1182,7 +1171,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1218,7 +1206,6 @@ ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1237,7 +1224,6 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1256,7 +1242,6 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1285,7 +1270,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1322,7 +1306,6 @@ ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1341,7 +1324,6 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1360,7 +1342,6 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1389,7 +1370,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2456,7 +2436,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2478,7 +2457,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2500,7 +2478,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2532,7 +2509,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2575,7 +2551,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2599,7 +2574,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2623,7 +2597,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2657,7 +2630,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2701,7 +2673,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2725,7 +2696,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2749,7 +2719,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2783,7 +2752,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2826,7 +2794,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2848,7 +2815,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2870,7 +2836,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2902,7 +2867,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2945,7 +2909,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2969,7 +2932,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2993,7 +2955,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3027,7 +2988,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3071,7 +3031,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -3095,7 +3054,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3119,7 +3077,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3153,7 +3110,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3197,7 +3153,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -3221,7 +3176,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3245,7 +3199,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3279,7 +3232,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3323,7 +3275,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -3347,7 +3298,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3371,7 +3321,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3405,7 +3354,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -188,7 +188,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[2:3], v0 ; GFX7-NEXT: s_endpgm ; @@ -204,7 +203,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -220,7 +218,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -248,7 +245,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -285,7 +281,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[2:3], v0 ; GFX7-NEXT: s_endpgm ; @@ -303,7 +298,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -321,7 +315,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -351,7 +344,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s3 -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[2:3], v0 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1121,7 +1113,6 @@ ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1138,7 +1129,6 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1155,7 +1145,6 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1182,7 +1171,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1218,7 +1206,6 @@ ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1237,7 +1224,6 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1256,7 +1242,6 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1285,7 +1270,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -1322,7 +1306,6 @@ ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -1341,7 +1324,6 @@ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1360,7 +1342,6 @@ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -1389,7 +1370,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2456,7 +2436,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2478,7 +2457,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2500,7 +2478,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2532,7 +2509,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2575,7 +2551,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2599,7 +2574,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2623,7 +2597,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2657,7 +2630,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2701,7 +2673,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2725,7 +2696,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2749,7 +2719,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2783,7 +2752,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2826,7 +2794,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2848,7 +2815,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2870,7 +2836,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -2902,7 +2867,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -2945,7 +2909,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -2969,7 +2932,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2993,7 +2955,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3027,7 +2988,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3071,7 +3031,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -3095,7 +3054,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3119,7 +3077,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3153,7 +3110,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3197,7 +3153,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -3221,7 +3176,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3245,7 +3199,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3279,7 +3232,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; @@ -3323,7 +3275,6 @@ ; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -3347,7 +3298,6 @@ ; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3371,7 +3321,6 @@ ; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 ; GFX10-CU-NEXT: s_endpgm ; @@ -3405,7 +3354,6 @@ ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -298,7 +298,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -202,7 +202,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -297,7 +296,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1103,7 +1101,6 @@ ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1194,7 +1191,6 @@ ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1289,7 +1285,6 @@ ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2363,7 +2358,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2479,7 +2473,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2599,7 +2592,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2716,7 +2708,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2832,7 +2823,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2952,7 +2942,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3072,7 +3061,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; @@ -3192,7 +3180,6 @@ ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 ; GFX10-WGP-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-debug.mir @@ -25,6 +25,7 @@ # VM-NEXT: S_NOP 0 # ZERO: S_WAITCNT 0 +# ZERO-NEXT: S_NOP 0 # ZERO-NEXT: S_WAITCNT 0 # ZERO-NEXT: S_NOP 0 # ZERO-NEXT: S_WAITCNT 0 @@ -32,6 +33,8 @@ name: waitcnt-debug liveins: +machineFunctionInfo: + isEntryFunction: true body: | bb.0: S_NOP 0 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting-vscnt.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s + +--- +name: test_waitcnt_preexisting_vscnt_unmodified +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_unmodified + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_BARRIER + ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_WAITCNT 112 + ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_ENDPGM 0 + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + S_WAITCNT_VSCNT undef $sgpr_null, 0 + S_BARRIER + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: test_waitcnt_preexisting_vscnt_needs_vscnt +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_needs_vscnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_BARRIER + ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_WAITCNT 112 + ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_ENDPGM 0 + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + S_WAITCNT_VSCNT undef $sgpr_null, 1 + S_BARRIER + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: test_waitcnt_preexisting_vscnt_with_other_waitcnt +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_with_other_waitcnt + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GFX10: S_WAITCNT 112 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_BARRIER + ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_WAITCNT 112 + ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_ENDPGM 0 + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + S_WAITCNT 112 + S_WAITCNT_VSCNT undef $sgpr_null, 0 + S_BARRIER + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: test_waitcnt_preexisting_vscnt_combined +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_BARRIER + ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_WAITCNT 112 + ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_ENDPGM 0 + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + S_WAITCNT_VSCNT undef $sgpr_null, 0 + S_WAITCNT_VSCNT undef $sgpr_null, 1 + S_WAITCNT_VSCNT undef $sgpr_null, 2 + S_BARRIER + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: test_waitcnt_preexisting_vscnt_combined_both_types +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX10-LABEL: name: test_waitcnt_preexisting_vscnt_combined_both_types + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + ; GFX10: S_WAITCNT 0 + ; GFX10: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10: S_BARRIER + ; GFX10: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_WAITCNT 112 + ; GFX10: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX10: S_ENDPGM 0 + GLOBAL_STORE_DWORD $vgpr0_vgpr1, $vgpr2, 0, 0, implicit $exec + S_WAITCNT 0 + S_WAITCNT_VSCNT undef $sgpr_null, 1 + S_WAITCNT 0 + S_WAITCNT_VSCNT undef $sgpr_null, 2 + S_WAITCNT 0 + S_BARRIER + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-preexisting.mir @@ -1,37 +1,192 @@ -# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GCN %s - -# GCN-LABEL: name: test{{$}} -# GCN: S_WAITCNT -16257 -# GCN: DS_READ2_B32 -# GCN: DS_READ2_B32 -# GCN: S_WAITCNT 383{{$}} -# GCN-NEXT: $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec -# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec -# GCN-NEXT: S_WAITCNT 127{{$}} -# GCN-NEXT: $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec ---- | - define amdgpu_cs void @test() { - ret void - } -... ---- -name: test -body: | - bb.0: - liveins: $sgpr0, $sgpr1, $vgpr0 - - renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 480, 0 - renamable $vgpr13 = V_LSHLREV_B32_e32 2, killed $vgpr0, implicit $exec - S_WAITCNT -16257 - renamable $vgpr0_vgpr1 = DS_READ2_B32 renamable $vgpr13, 0, 1, 0, implicit $m0, implicit $exec - renamable $vgpr2_vgpr3 = DS_READ2_B32 renamable $vgpr13, 2, 3, 0, implicit $m0, implicit $exec - renamable $vgpr1 = V_OR_B32_e32 1, killed $vgpr1, implicit $exec - renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr0, killed $vgpr1, implicit $exec - renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr2, killed $vgpr1, implicit $exec - renamable $vgpr1 = V_MAX_U32_e32 killed $vgpr3, killed $vgpr1, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $exec - IMAGE_STORE_V4_V2 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, -1, 1, 0, 0, 0, 0, 0, implicit $exec :: (store 16) +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefixes=GFX9 %s + +--- +name: test_waitcnt_preexisting_lgkmcnt_unmodified +body: | + bb.0: + liveins: $vgpr0 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_unmodified + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec + ; GFX9: S_WAITCNT 49279 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 112 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_ENDPGM 0 + $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec + S_WAITCNT 49279 + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: test_waitcnt_preexisting_vmcnt_unmodified +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_unmodified + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX9: S_WAITCNT 3952 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 112 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_ENDPGM 0 + $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 3952 + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +# Respect preexisting waitcnt and add required wait. + +--- +name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt +body: | + bb.0: + liveins: $vgpr0 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_vmcnt_needs_lgkmcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec + ; GFX9: S_WAITCNT 112 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 112 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_ENDPGM 0 + $vgpr0_vgpr1 = DS_READ2_B32 $vgpr0, 0, 1, 0, implicit $m0, implicit $exec + S_WAITCNT 3952 + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_lgkmcnt_needs_vmcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX9: S_WAITCNT 112 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 112 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_ENDPGM 0 + $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + S_WAITCNT 49279 + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +# Apply wait for all counters from preexisting waitcnt regardless of the wait +# required by the next instruction. + +--- +name: test_waitcnt_preexisting_apply_all_counters +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_apply_all_counters + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + ; GFX9: $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 112 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr4_vgpr5 = GLOBAL_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr6_vgpr7 = DS_READ2_B32 $vgpr2, 0, 1, 0, implicit $m0, implicit $exec + S_WAITCNT 0 + $vgpr6 = V_OR_B32_e32 1, killed $vgpr6, implicit $exec + $vgpr0 = FLAT_LOAD_DWORD $vgpr4_vgpr5, 0, 0, implicit $exec, implicit $flat_scr + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr +... + +--- +name: test_waitcnt_preexisting_combine_waitcnt +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 0 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAITCNT 0 + S_WAITCNT 0 + S_WAITCNT 0 + S_WAITCNT 0 + S_WAITCNT 0 + S_WAITCNT 0 + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr +... + +--- +name: test_waitcnt_preexisting_combine_waitcnt_diff_counters +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_combine_waitcnt_diff_counters + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 112 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAITCNT 49279 + S_WAITCNT 3952 + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr +... + +# Apply preexisting waitcnt when no wait is immediately needed. +# FIXME: Move waitcnt as late as possible. + +--- +name: test_waitcnt_preexisting_early_wait +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX9-LABEL: name: test_waitcnt_preexisting_early_wait + ; GFX9: S_WAITCNT 0 + ; GFX9: $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_WAITCNT 0 + ; GFX9: S_NOP 0 + ; GFX9: S_NOP 0 + ; GFX9: S_NOP 0 + ; GFX9: FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; GFX9: S_ENDPGM 0 + $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr + S_WAITCNT 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 + FLAT_STORE_DWORD $vgpr0_vgpr1, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +# Combine preexisting waitcnt with wait added to the start of a non-entry function. + +--- +name: test_waitcnt_preexisting_func_start +body: | + bb.0: + ; GFX9-LABEL: name: test_waitcnt_preexisting_func_start + ; GFX9: S_WAITCNT 0 + ; GFX9: S_ENDPGM 0 + S_WAITCNT 0 S_ENDPGM 0 ...