diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -458,6 +458,7 @@ #endif // NDEBUG } + bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, @@ -1194,12 +1195,50 @@ return Modified; } -// This is a flat memory operation. Check to see if it has memory -// tokens for both LDS and Memory, and if so mark it as a flat. +// This is a flat memory operation. Check to see if it has memory tokens other +// than LDS. Other address spaces supported by flat memory operations involve +// global memory. +bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { + assert(TII->isFLAT(MI)); + + // All flat instructions use the VMEM counter. + assert(TII->usesVM_CNT(MI)); + + // If there are no memory operands then conservatively assume the flat + // operation may access VMEM. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves VMEM. + // Flat operations only support FLAT, LOCAL (LDS), or address spaces + // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION + // (GDS) address space is not supported by flat operations. Therefore, simply + // return true unless only the LDS address space is found. + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + assert(AS != AMDGPUAS::REGION_ADDRESS); + if (AS != AMDGPUAS::LOCAL_ADDRESS) + return true; + } + + return false; +} + +// This is a flat memory operation. Check to see if it has memory tokens for +// either LDS or FLAT. 
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + assert(TII->isFLAT(MI)); + + // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter. + if (!TII->usesLGKM_CNT(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access LDS. if (MI.memoperands_empty()) return true; + // See if any memory operand specifies an address space that involves LDS. for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) @@ -1226,7 +1265,10 @@ } else if (TII->isFLAT(Inst)) { assert(Inst.mayLoadOrStore()); - if (TII->usesVM_CNT(Inst)) { + int FlatASCount = 0; + + if (mayAccessVMEMThroughFlat(Inst)) { + ++FlatASCount; if (!ST->hasVscnt()) ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); else if (Inst.mayLoad() && @@ -1236,15 +1278,19 @@ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst); } - if (TII->usesLGKM_CNT(Inst)) { + if (mayAccessLDSThroughFlat(Inst)) { + ++FlatASCount; ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); - - // This is a flat memory operation, so note it - it will require - // that both the VM and LGKM be flushed to zero if it is pending when - // a VM or LGKM dependency occurs. - if (mayAccessLDSThroughFlat(Inst)) - ScoreBrackets->setPendingFlat(); } + + // A Flat memory operation must access at least one address space. + assert(FlatASCount); + + // This is a flat memory operation that accesses both VMEM and LDS, so note + // it - it will require that both the VM and LGKM be flushed to zero if it + // is pending when a VM or LGKM dependency occurs. + if (FlatASCount > 1) + ScoreBrackets->setPendingFlat(); } else if (SIInstrInfo::isVMEM(Inst) && // TODO: get a better carve out. 
Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 && diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -103,7 +103,7 @@ ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 store i32 %lshr.8, i32 addrspace(1)* undef @@ -527,7 +527,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -628,13 +628,13 @@ ; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -711,7 +711,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: 
flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -758,7 +758,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xff00, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -805,7 +805,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -874,13 +874,13 @@ ; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -923,7 +923,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -969,7 +969,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1018,7 +1018,7 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xff -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 @@ -1064,7 +1064,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1111,7 +1111,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_add_f32_e32 v2, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -72,7 +72,7 @@ ; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1] ; GFX8-NEXT: s_lshl_b32 s0, s2, 1 ; 
GFX8-NEXT: s_lshl_b32 m0, s0, 1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_movrels_b32_e32 v1, v3 ; GFX8-NEXT: v_movrels_b32_e32 v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 @@ -180,13 +180,13 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v10, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v11, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 @@ -206,7 +206,7 @@ ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v0, v18, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v19, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 @@ -219,7 +219,7 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] @@ -577,7 +577,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i128_idx0: @@ -612,7 +612,7 @@ ; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: v_mov_b32_e32 v2, v6 @@ -655,7 +655,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v8 ; GFX8-NEXT: v_mov_b32_e32 v1, v9 ; GFX8-NEXT: v_mov_b32_e32 v2, v10 @@ -698,7 +698,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v12 ; GFX8-NEXT: v_mov_b32_e32 v1, v13 ; GFX8-NEXT: v_mov_b32_e32 v2, v14 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -41,7 +41,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 1 ; GFX8-NEXT: s_lshl_b32 s0, s1, 4 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -54,7 +54,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX7-NEXT: s_and_b32 s1, s2, 1 ; GFX7-NEXT: s_lshl_b32 s0, s1, 4 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -85,7 +85,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 
1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -98,7 +98,7 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -187,14 +187,14 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 0 @@ -214,7 +214,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -222,7 +222,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -243,7 +243,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -251,7 +251,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -272,7 +272,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -280,7 +280,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr @@ -333,7 +333,7 @@ ; GFX8-NEXT: s_lshr_b32 s0, s2, 1 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -394,7 +394,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc @@ -560,7 +560,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx0: @@ -590,7 +590,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -622,7 +622,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -654,7 +654,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -686,7 +686,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -718,7 +718,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -750,7 +750,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -782,7 +782,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -58,7 +58,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 @@ -129,7 +129,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -369,7 +369,7 @@ ; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 @@ -432,7 +432,7 @@ ; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 @@ -497,7 +497,7 @@ ; GFX8-NEXT: s_movk_i32 s4, 0xff ; 
GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 @@ -562,7 +562,7 @@ ; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v0 @@ -685,7 +685,7 @@ ; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_lshl_b32 s0, s1, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -793,7 +793,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1125,7 +1125,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -1188,7 +1188,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; 
GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -1253,7 +1253,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -1318,7 +1318,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -1382,7 +1382,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -1445,7 +1445,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -1510,7 +1510,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -1575,7 +1575,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: 
flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -1748,7 +1748,7 @@ ; GFX8-NEXT: s_lshr_b32 s0, s2, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: s_and_b32 s1, s2, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 @@ -1927,7 +1927,7 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v4 @@ -2140,7 +2140,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -2203,7 +2203,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -2268,7 +2268,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -2333,7 +2333,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 @@ -2397,7 +2397,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -2460,7 +2460,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -2525,7 +2525,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -2590,7 +2590,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 @@ -2654,7 +2654,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 @@ -2717,7 +2717,7 
@@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 @@ -2782,7 +2782,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 @@ -2847,7 +2847,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 @@ -2911,7 +2911,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 @@ -2974,7 +2974,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 @@ -3039,7 +3039,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: 
v_lshrrev_b32_e32 v2, 8, v3 @@ -3104,7 +3104,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_movk_i32 s4, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2511,7 +2511,7 @@ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; MOVREL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v7 ; MOVREL-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr @@ -2543,7 +2543,7 @@ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; MOVREL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 32 @@ -2575,7 +2575,7 @@ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; MOVREL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: v_mov_b32_e32 v0, v1 ; MOVREL-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr @@ -2610,7 +2610,7 @@ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; MOVREL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: 
v_mov_b32_e32 v0, v5 ; MOVREL-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -50,9 +50,9 @@ ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_med3_f32 v0, v0, v1, v2 ; VI-NEXT: flat_store_dword v[6:7], v0 ; VI-NEXT: s_endpgm @@ -145,14 +145,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e32 v4, 0x80000000, v7 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 @@ -253,9 +253,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e32 v4, s2, v7 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| ; VI-NEXT: v_med3_f32 v2, v4, |v2|, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -352,11 +352,11 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: 
s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_sub_f32_e64 v4, s2, |v7| -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_sub_f32_e64 v2, s2, |v2| -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_f32_e64 v3, s2, |v3| ; VI-NEXT: v_med3_f32 v2, v4, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -461,13 +461,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_f32_e32 v4, 1.0, v7 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_f32_e32 v2, 2.0, v2 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_f32_e32 v3, 4.0, v3 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 @@ -585,13 +585,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v5, v4, v2 ; VI-NEXT: v_max_f32_e32 v2, v4, v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; VI-NEXT: v_min_f32_e32 v2, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -52,9 +52,9 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; 
VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 ; VI-NEXT: v_rcp_f32_e32 v3, v3 ; VI-NEXT: v_mul_f32_e32 v1, v1, v3 @@ -112,7 +112,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f16_e32 v1, v0 ; VI-NEXT: v_mul_f16_e32 v1, v2, v1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 @@ -167,7 +167,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f16_e32 v1, v0 ; VI-NEXT: v_mul_f16_e32 v1, v2, v1 ; VI-NEXT: v_trunc_f16_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -92,7 +92,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -110,7 +110,7 @@ ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -324,7 +324,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; 
GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -342,7 +342,7 @@ ; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 ; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -382,7 +382,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -400,7 +400,7 @@ ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -439,7 +439,7 @@ ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 @@ -457,7 +457,7 @@ ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 ; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -564,7 +564,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v2, 
v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -588,7 +588,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, s3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 @@ -902,7 +902,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 @@ -926,7 +926,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -977,7 +977,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1001,7 +1001,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1053,7 +1053,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1077,7 +1077,7 @@ ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1253,7 +1253,7 @@ ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] @@ -1704,7 +1704,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] @@ -1799,7 +1799,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] @@ -1895,7 +1895,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: 
v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] @@ -2201,11 +2201,11 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[8:9] @@ -2912,11 +2912,11 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] @@ -3059,11 +3059,11 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] @@ -3207,11 +3207,11 @@ ; 
GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -31,7 +31,7 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -96,7 +96,7 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 @@ -162,7 +162,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: flat_load_ushort v1, v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 @@ -230,7 +230,7 @@ ; GFX8-NEXT: flat_load_ushort v1, v[1:2] ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -297,7 +297,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 @@ -361,7 +361,7 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -423,7 +423,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -484,7 +484,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 @@ -628,7 +628,7 @@ ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 @@ -1122,7 +1122,7 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; 
GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 @@ -1241,7 +1241,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, 8 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 @@ -1361,7 +1361,7 @@ ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_mov_b32_e32 v5, 8 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xff -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 @@ -1734,7 +1734,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 @@ -2577,7 +2577,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 @@ -2767,7 +2767,7 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt 
vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 @@ -2960,7 +2960,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v1 @@ -3592,7 +3592,7 @@ ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 @@ -5035,7 +5035,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v4 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -5351,7 +5351,7 @@ ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v4 @@ -5670,7 +5670,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v10 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -41,7 +41,7 @@ ; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: flat_store_dword v[0:1], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: func_use_lds_global_constexpr_cast: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -200,7 +200,7 @@ ; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -214,7 +214,7 @@ ; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i32: @@ -248,7 +248,7 @@ ; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -264,7 +264,7 @@ ; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: 
global_atomic_dec_ret_i32_offset: @@ -379,7 +379,7 @@ ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -400,7 +400,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: @@ -1366,7 +1366,7 @@ ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -1381,7 +1381,7 @@ ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i64: @@ -1417,7 +1417,7 @@ ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -1434,7 +1434,7 @@ ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i64_offset: @@ -1557,7 +1557,7 @@ ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] 
glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; @@ -1579,7 +1579,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -196,7 +196,7 @@ ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -210,7 +210,7 @@ ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -245,7 +245,7 @@ ; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -261,7 +261,7 @@ ; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -379,7 +379,7 @@ ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -400,7 +400,7 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -720,7 +720,7 @@ ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -735,7 +735,7 @@ ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -772,7 +772,7 @@ ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; @@ -789,7 +789,7 @@ ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -915,7 +915,7 @@ ; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: v_mov_b32_e32 v5, 0 ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; @@ -937,7 +937,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: v_mov_b32_e32 v5, 0 ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -1493,7 +1493,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s3 ; CI-NEXT: s_waitcnt lgkmcnt(1) ; CI-NEXT: flat_store_dword v[0:1], v4 -; CI-NEXT: s_waitcnt lgkmcnt(1) +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dword v[2:3], v5 ; CI-NEXT: s_endpgm ; @@ -1513,7 +1513,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt lgkmcnt(1) ; VI-NEXT: flat_store_dword v[0:1], v4 -; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dword v[2:3], v5 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -888,7 +888,7 @@ ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1027,7 +1027,7 @@ ; GFX8-NEXT: s_and_b32 s2, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_nop 3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -36,7 +36,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; 
GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -103,7 +103,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -153,7 +153,7 @@ ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -174,7 +174,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -225,7 +225,7 @@ ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -246,7 +246,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -312,7 +312,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -374,7 +374,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -436,7 +436,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -498,7 +498,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -544,7 +544,7 @@ ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm @@ -562,7 +562,7 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ 
-606,7 +606,7 @@ ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm @@ -624,7 +624,7 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -668,7 +668,7 @@ ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm @@ -686,7 +686,7 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -730,7 +730,7 @@ ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm @@ -748,7 +748,7 @@ ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], 
v[0:1], s[0:1], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm @@ -994,7 +994,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1052,7 +1052,7 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1116,9 +1116,9 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1190,7 +1190,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: flat_load_dword v1, v[2:3] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -49,7 +49,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NEXT: 
s_nop 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -237,7 +237,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v4 ; GFX8-NEXT: v_mul_i32_i24_e32 v0, -7, v0 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -18,7 +18,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -52,7 +52,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i8_to_i64: @@ -84,7 +84,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: 
zextload_global_i16_to_i64: @@ -116,7 +116,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i32_to_i64: @@ -150,7 +150,7 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i32_to_i96: @@ -187,7 +187,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i32_to_i128: diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -143,7 +143,7 @@ ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dword v0, v[0:1] -; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm @@ -219,7 +219,7 @@ ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -397,7 +397,7 @@ ; FLAT-NEXT: s_mov_b32 s0, 0x10203 ; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f ; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_perm_b32 v2, 
0, v0, s0 ; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0 ; FLAT-NEXT: v_and_b32_e32 v1, s1, v2 @@ -691,7 +691,7 @@ ; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0 ; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0 ; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -37,7 +37,7 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -88,7 +88,7 @@ ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm @@ -145,7 +145,7 @@ ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_mov_b32 s12, s2 ; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -297,7 +297,7 @@ ; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_mov_b32 s8, s6 ; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v4, s12, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 @@ -388,7 +388,7 @@ ; VI-NEXT: s_mov_b32 s13, s3 ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v4, s16, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 @@ -448,7 +448,7 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -95,7 +95,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc @@ -165,7 +165,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v1 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; VI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc @@ -247,7 +247,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v4, v3 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; VI-NEXT: v_cndmask_b32_e32 v3, 32, v4, vcc @@ -539,7 +539,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v5, v0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 32, v5 ; VI-NEXT: v_ffbh_u32_e32 v6, v1 @@ -626,7 +626,7 @@ ; VI-NEXT: 
flat_load_dwordx2 v[1:2], v[1:2] ; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_ffbh_u32_e32 v5, v2 @@ -705,7 +705,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -772,7 +772,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -844,7 +844,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc @@ -921,7 +921,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc @@ -992,7 +992,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1135,7 +1135,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; 
VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -409,7 +409,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -452,7 +452,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -498,7 +498,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 @@ -545,7 +545,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 @@ -613,13 +613,13 @@ ; VI-NEXT: flat_load_ubyte v5, v[6:7] ; VI-NEXT: flat_load_ubyte v6, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; 
VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v3, v5 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v6 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -692,7 +692,7 @@ ; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_mov_b32 s7, s11 ; VI-NEXT: s_movk_i32 s0, 0x900 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -792,19 +792,19 @@ ; VI-NEXT: flat_load_ubyte v7, v[4:5] ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 -; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v6 -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v7 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 @@ -856,7 +856,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] -; VI-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v7 @@ -907,7 +907,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -952,7 +952,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -996,7 +996,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1058,10 +1058,10 @@ ; VI-NEXT: flat_load_ubyte v5, v[6:7] ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v1 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1 @@ -1107,7 +1107,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1150,7 +1150,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1194,7 +1194,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1238,7 +1238,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1286,7 +1286,7 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -12,7 +12,7 @@ ; GFX7-ALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-ALIGNED-NEXT: flat_load_ushort v0, v[0:1] ; GFX7-ALIGNED-NEXT: flat_load_ushort v1, v[2:3] -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; 
GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] @@ -24,7 +24,7 @@ ; GFX7-UNALIGNED-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; GFX7-UNALIGNED-NEXT: flat_load_ushort v0, v[0:1] ; GFX7-UNALIGNED-NEXT: flat_load_ushort v1, v[2:3] -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] @@ -113,11 +113,11 @@ ; GFX7-ALIGNED-NEXT: flat_load_ubyte v1, v[6:7] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v4, v[4:5] ; GFX7-ALIGNED-NEXT: flat_load_ubyte v2, v[2:3] -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -128,7 +128,7 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1] -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_load_2xi16_align1: @@ -218,14 +218,14 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: flat_load_dword v0, v[0:1] -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1] -; GFX7-UNALIGNED-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_load_2xi16_align4: diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -37,7 +37,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -89,7 +89,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -141,7 +141,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -193,7 +193,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -35,7 +35,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: 
v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -87,7 +87,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -139,7 +139,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -191,7 +191,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -243,7 +243,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -295,7 +295,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -347,7 +347,7 @@ ; VI-NEXT: 
flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -399,7 +399,7 @@ ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] ; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -101,9 +101,9 @@ ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 ; VI-NEXT: v_rcp_f32_e32 v5, v5 ; VI-NEXT: v_mul_f32_e32 v3, v3, v5 @@ -196,7 +196,7 @@ ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f16_e32 v3, v2 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3 ; VI-NEXT: v_trunc_f16_e32 v3, v3 @@ -286,7 +286,7 @@ ; VI-NEXT: flat_load_ushort v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f16_e32 v3, v2 ; VI-NEXT: v_mul_f16_e32 v3, v4, v3 ; VI-NEXT: v_trunc_f16_e32 v3, v3 @@ -390,7 +390,7 @@ ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 ; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 ; VI-NEXT: v_rcp_f32_e32 v6, v5 @@ -483,7 +483,7 @@ ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 ; VI-NEXT: v_trunc_f32_e32 v3, v3 @@ -565,7 +565,7 @@ ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 ; VI-NEXT: v_trunc_f32_e32 v3, v3 @@ -684,7 +684,7 @@ ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 @@ -812,7 +812,7 @@ ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 @@ -916,7 +916,7 @@ ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_mul_f64 v[6:7], v[2:3], v[6:7] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] @@ -1077,10 +1077,10 @@ ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_lshrrev_b32_e32 v3, 16, v4 ; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: v_rcp_f32_e32 v7, v7 @@ -1336,10 +1336,10 @@ ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 ; VI-NEXT: v_rcp_f32_e32 v9, v9 @@ -1513,7 +1513,7 @@ ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 ; VI-NEXT: v_rcp_f32_e32 v8, v7 @@ -1739,7 +1739,7 @@ ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 ; VI-NEXT: v_rcp_f32_e32 v12, v11 @@ -1962,7 +1962,7 @@ ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2731,7 +2731,7 @@ ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: 
flat_load_ushort v1, v[2:3] ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(1) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2739,6 +2739,7 @@ ; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s2 ; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -1413,7 +1413,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x3800, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x38,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_0.5_i16: @@ -1449,7 +1449,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0xb800, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xb8,0xff,0xff] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_neg_0.5_i16: @@ -1485,7 +1485,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x3c00, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x3c,0x00,0x00] 
; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_1.0_i16: @@ -1521,7 +1521,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0xbc00, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xbc,0xff,0xff] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_neg_1.0_i16: @@ -1558,7 +1558,7 @@ ; VI-NEXT: s_movk_i32 s4, 0x4000 ; encoding: [0x00,0x40,0x04,0xb0] ; VI-NEXT: v_lshlrev_b16_e64 v2, v2, s4 ; encoding: [0x02,0x00,0x2a,0xd1,0x02,0x09,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: shl_inline_imm_2.0_i16: @@ -1595,7 +1595,7 @@ ; VI-NEXT: s_movk_i32 s4, 0xc000 ; encoding: [0x00,0xc0,0x04,0xb0] ; VI-NEXT: v_lshlrev_b16_e64 v2, v2, s4 ; encoding: [0x02,0x00,0x2a,0xd1,0x02,0x09,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: shl_inline_imm_neg_2.0_i16: @@ -1631,7 +1631,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x4400, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x44,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_4.0_i16: @@ -1667,7 +1667,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0xc400, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xc4,0xff,0xff] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_neg_4.0_i16: @@ -1703,7 +1703,7 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x3118, v2 ; encoding: [0xff,0x04,0x04,0x52,0x18,0x31,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] +; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_inv2pi_i16: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -535,7 +535,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -553,7 +553,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -597,7 +597,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshr_b32 s0, s4, 16 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v0, s0, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -617,7 +617,7 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; CI-NEXT: v_or_b32_e32 v2, s0, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -659,7 +659,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v0, 53, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -677,7 +677,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; CI-NEXT: v_or_b32_e32 v0, 53, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -720,7 +720,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -737,7 +737,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -778,7 +778,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -795,7 +795,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -836,7 +836,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -854,7 +854,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -894,7 +894,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v0, 53, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -912,7 +912,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; CI-NEXT: v_or_b32_e32 v0, 53, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -954,7 +954,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -971,7 +971,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -1012,7 +1012,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -1029,7 +1029,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 @@ -1141,7 +1141,7 @@ ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; 
VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -1162,7 +1162,7 @@ ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 ; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm @@ -1214,10 +1214,10 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_mov_b32 s0, 0x12341234 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v2, v2, s0, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1240,10 +1240,10 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_mov_b32 s0, 0x12341234 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v2, v2, s0, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -1288,7 +1288,7 @@ ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v0, s0, v4, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1308,7 +1308,7 @@ ; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; 
CI-NEXT: v_bfi_b32 v0, s0, v4, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1353,7 +1353,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1372,7 +1372,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1418,7 +1418,7 @@ ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1438,7 +1438,7 @@ ; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1483,7 +1483,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1502,7 +1502,7 @@ ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 ; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1548,7 +1548,7 @@ ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1568,7 +1568,7 @@ ; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: v_mov_b32_e32 v4, s4 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v1, s0, v4, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -1625,10 +1625,10 @@ ; VI-NEXT: s_lshl_b32 s0, s1, 16 ; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1653,10 +1653,10 @@ ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_or_b32 s0, s4, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4 -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1717,7 +1717,7 @@ ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0 ; VI-NEXT: flat_store_dwordx2 
v[2:3], v[0:1] @@ -1745,7 +1745,7 @@ ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v1, s1, v4, v1 ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -128,7 +128,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -186,7 +186,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -240,7 +240,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -302,7 +302,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -369,7 +369,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -436,7 +436,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm @@ -504,7 +504,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 ; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -60,7 +60,7 @@ ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, v2 ; TONGA-NEXT: flat_store_dword v[4:5], v3 -; TONGA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; ; GFX81-LABEL: image_sample_2d_f16_tfe: @@ -76,7 +76,7 @@ ; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: v_mov_b32_e32 v0, v2 ; GFX81-NEXT: flat_store_dword v[4:5], v3 -; GFX81-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: image_sample_2d_f16_tfe: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -34,7 +34,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 ; GFX8-NEXT: v_fract_f16_e32 v0, v0 ; GFX8-NEXT: v_cos_f16_e32 v2, v0 @@ -102,7 +102,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 ; GFX8-NEXT: v_fract_f16_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -34,7 +34,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 ; GFX8-NEXT: v_fract_f16_e32 v0, v0 ; GFX8-NEXT: v_sin_f16_e32 v2, v0 @@ -102,7 +102,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0 ; GFX8-NEXT: v_fract_f16_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -99,7 +99,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -205,7 +205,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -247,7 +247,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load half, half addrspace(3)* %in @@ -288,7 +288,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -331,7 +331,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -372,7 +372,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -415,7 +415,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -458,7 +458,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: 
flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -502,7 +502,7 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -694,10 +694,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -737,10 +737,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -780,10 +780,10 @@ ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 
%reg to <2 x i16> @@ -823,10 +823,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_sbyte v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -868,10 +868,10 @@ ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -913,10 +913,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_sbyte v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -958,7 +958,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -998,7 +998,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] ; FIXME: the and above should be removable @@ -1040,7 +1040,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1080,7 +1080,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1122,7 +1122,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1164,7 +1164,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1205,7 +1205,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1247,7 +1247,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; 
GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 @@ -1288,7 +1288,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1328,7 +1328,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1367,7 +1367,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1407,7 +1407,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1447,7 +1447,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1488,7 +1488,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; 
GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1530,7 +1530,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1570,7 +1570,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1612,7 +1612,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1652,10 +1652,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1695,10 +1695,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_ushort v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1739,10 +1739,10 @@ ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1784,10 +1784,10 @@ ; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc ; GFX803-NEXT: flat_load_sbyte v0, v[0:1] ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1835,7 +1835,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -1885,7 +1885,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -1937,7 +1937,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 
-; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -1989,7 +1989,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -2043,7 +2043,7 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -93,7 +93,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -165,7 +165,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 @@ -234,7 +234,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0 ; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v4, v2 @@ -300,7 +300,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8 ; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -361,7 +361,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD @@ -423,7 +423,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 ; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 @@ -503,7 +503,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -21,7 +21,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, 
vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v0, v0, v1 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm @@ -69,7 +69,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v3, v5, v2 ; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -127,10 +127,10 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_max_i16_e32 v6, v5, v7 ; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v4, v4, v8 ; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: flat_store_short v[2:3], v4 @@ -187,7 +187,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v6, v1, v3 ; VI-NEXT: v_max_i16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_max_i16_e32 v3, v0, v2 @@ -241,7 +241,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_i16_e32 v0, v0, v1 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm @@ -289,7 +289,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, 
vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u16_e32 v0, v0, v1 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm @@ -337,7 +337,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u16_e32 v0, v0, v1 ; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm @@ -384,7 +384,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_max_u16_e32 v3, v5, v2 ; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -201,7 +201,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 ; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 @@ -368,7 +368,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc ; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] @@ -463,7 +463,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3 ; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 diff --git 
a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -92,7 +92,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -164,7 +164,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 @@ -233,7 +233,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v4, v2 @@ -299,7 +299,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -361,7 +361,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 ; VI-NEXT: 
v_and_b32_e32 v2, 0xff000000, v2 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 @@ -424,7 +424,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 @@ -505,7 +505,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -36,7 +36,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -107,9 +107,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: flat_store_dword v[0:1], v3 @@ -188,7 +188,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -254,7 +254,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -320,7 +320,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -386,7 +386,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -452,7 +452,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -518,7 +518,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -584,7 +584,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; 
VI-NEXT: s_endpgm @@ -696,7 +696,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 ; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm @@ -765,7 +765,7 @@ ; VI-NEXT: flat_load_ushort v0, v[1:2] ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm @@ -840,9 +840,9 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: flat_store_short v[0:1], v3 @@ -925,7 +925,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 64, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -997,7 +997,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, -7, v3 ; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1070,7 +1070,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; 
VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 64, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1142,7 +1142,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_add_u16_e32 v3, -7, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1212,7 +1212,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -1280,7 +1280,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -1349,7 +1349,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -1422,7 +1422,7 @@ ; VI-NEXT: 
v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1491,7 +1491,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 @@ -1560,7 +1560,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1633,7 +1633,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, -16, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1702,7 +1702,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: flat_store_dword 
v[2:3], v0 @@ -1771,7 +1771,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; VI-NEXT: v_add_u16_e32 v3, -16, v3 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 @@ -1844,7 +1844,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1918,7 +1918,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -1992,7 +1992,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -2066,7 +2066,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, s2, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 @@ -2137,7 +2137,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; 
VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm @@ -2204,7 +2204,7 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -37,7 +37,7 @@ ; VI-NEXT: flat_load_dword v1, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_e32 v2, v0, v1 ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -184,7 +184,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -229,7 +229,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0x3df ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -272,7 +272,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 1 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; 
VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 1, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v2, v0 @@ -314,7 +314,7 @@ ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -361,7 +361,7 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -410,7 +410,7 @@ ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_e32 v0, v1, v2 ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -466,7 +466,7 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_e32 v0, v4, v2 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -518,7 +518,7 @@ ; VI-NEXT: flat_load_dword v1, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -575,7 +575,7 @@ ; VI-NEXT: flat_load_dword v1, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -40,7 +40,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 4, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %load0 = load i32, i32 addrspace(1)* undef @@ -69,7 +69,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 4, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %load0 = load float, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir @@ -2,7 +2,7 @@ # GCN-LABEL: waitcnt-back-edge-loop # GCN: bb.2 -# GCN: S_WAITCNT 112 +# GCN: S_WAITCNT 3952 # GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $mode, implicit $exec --- diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll --- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll @@ -5,11 +5,11 @@ ; GCN-LABEL: {{^}}testKernel ; 
GCN: BB0_1: -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: v_cmp_eq_f32_e32 -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: v_cmp_eq_f32_e32 -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: v_cmp_eq_f32_e32 @data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 
0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: barrier_vmcnt_global: ; GFX8: flat_load_dword ; GFX9_10: global_load_dword -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX9_10: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) { @@ -28,7 +28,7 @@ ; GCN-LABEL: barrier_vscnt_global: ; GFX8: flat_store_dword ; GFX9_10: global_store_dword -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX9: s_waitcnt vmcnt(0){{$}} ; GFX10: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier @@ -54,7 +54,7 @@ ; GCN-LABEL: barrier_vmcnt_vscnt_global: ; GFX8: flat_load_dword ; GFX9_10: global_load_dword -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX9_10: s_waitcnt vmcnt(0){{$}} ; GFX10: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_barrier @@ -182,7 +182,7 @@ ; GCN-LABEL: load_vmcnt_global: ; GFX8: flat_load_dword ; GFX9_10: 
global_load_dword -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} ; GFX9_10: s_waitcnt vmcnt(0){{$}} ; GCN-NEXT: {{global|flat}}_store_dword define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir @@ -53,19 +53,35 @@ # CHECK: FLAT_LOAD_DWORD # CHECK: FLAT_LOAD_DWORDX4 # Global loads will return in order so we should: -# s_waitcnt vmcnt(1) lgkmcnt(1) -# CHECK-NEXT: S_WAITCNT 369 +# s_waitcnt vmcnt(1) +# CHECK-NEXT: S_WAITCNT 3953 # CHECK-LABEL: bb.1: # CHECK: FLAT_LOAD_DWORD -# GFX89: S_WAITCNT 112 +# s_waitcnt vmcnt(0) +# GFX89: S_WAITCNT 3952 # CHECK: FLAT_LOAD_DWORDX4 # CHECK-LABEL: bb.2: # CHECK: FLAT_LOAD_DWORD -# GFX89: S_WAITCNT 112 +# s_waitcnt vmcnt(0) +# GFX89: S_WAITCNT 3952 # CHECK: FLAT_LOAD_DWORDX4 +# CHECK-LABEL: bb.3: +# s_waitcnt vmcnt(0) +# GFX89: S_WAITCNT 3952 +# CHECK: FLAT_LOAD_DWORD +# CHECK: FLAT_LOAD_DWORD +# s_waitcnt vmcnt(0) lgkmcnt(0) +# GFX89: S_WAITCNT 112 + +# CHECK-LABEL: bb.4: +# GFX89-NOT: S_WAITCNT +# CHECK: FLAT_LOAD_DWORD +# s_waitcnt vmcnt(0) lgkmcnt(0) +# GFX89: S_WAITCNT 112 + name: flat_zero_waitcnt body: | @@ -84,9 +100,22 @@ S_BRANCH %bb.2 bb.2: + successors: %bb.3 $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) $vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 $vgpr7_vgpr8, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16 from %ir.flat16) $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec + S_BRANCH %bb.3 + + bb.3: + successors: %bb.4 + $vgpr3 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.global4) + $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec + S_BRANCH %bb.4 + + bb.4: + 
$vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4 from %ir.flat4) + $vgpr0 = V_MOV_B32_e32 $vgpr5, implicit $exec S_ENDPGM 0 ... --- @@ -291,7 +320,7 @@ --- # CHECK-LABEL: name: subregs16bit # CHECK: S_WAITCNT 112 -# CHECK-NEXT: V_NOP_e32 +# CHECK-NEXT: V_NOP_e32 name: subregs16bit machineFunctionInfo: diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -282,7 +282,7 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0 ; VI-NEXT: v_or_b32_e32 v2, 4, v0 ; VI-NEXT: v_mov_b32_e32 v0, 0