diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -462,6 +462,9 @@ bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr); + bool generateWaitcntInstAfter(MachineInstr &MI, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); void updateEventWaitcntAfter(MachineInstr &Inst, WaitcntBrackets *ScoreBrackets); bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, @@ -824,8 +827,17 @@ !MI.getOperand(1).isUndef(); } +// For jumps like function calls and returns, we insert waitcnts at the jump +// destination. The jump adds latency because new instructions need to be +// fetched; waiting for outstanding memory operations after the jump means +// that the memory latency can be overlapped with the jump latency. + +// FIXME: callWaitsOnFunctionEntry and callWaitsOnFunctionReturn should depend +// on the calling convention. We need to track the calling convention for +// call instructions, so it is available here. + /// \returns true if the callee inserts an s_waitcnt 0 on function entry. -static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { +static bool callWaitsOnFunctionEntry() { // Currently all conventions wait, but this may not always be the case. // // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make @@ -835,8 +847,14 @@ /// \returns true if the callee is expected to wait for any outstanding waits /// before returning. -static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { - return true; +static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return false; } + +/// \returns true if the instruction is an s_sendmsg of gs-done. 
+static bool isSendGsDoneMessage(const MachineInstr &MI) { + return (MI.getOpcode() == AMDGPU::S_SENDMSG || + MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && + (MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == + AMDGPU::SendMsg::ID_GS_DONE; } /// Generate s_waitcnt instruction to be placed before cur_Inst. @@ -870,19 +888,16 @@ Wait.VmCnt = 0; } - // All waits must be resolved at call return. - // NOTE: this could be improved with knowledge of all call sites or - // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || - (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); - } - // Resolve vm waits before gs-done. - else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || - MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && - ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == - AMDGPU::SendMsg::ID_GS_DONE)) { + (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry())) { + // All waits must be resolved at call return. + // NOTE: this could be improved with knowledge of all call sites or + // with knowledge of the called routines. + if (callWaitsOnFunctionReturn(MI)) + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); + } else if (isSendGsDoneMessage(MI)) { + // Resolve vm waits before gs-done. Wait.VmCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. @@ -955,7 +970,7 @@ } } - if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { + if (MI.isCall() && callWaitsOnFunctionEntry()) { // The function is going to insert a wait on everything in its prolog. // This still needs to be careful if the call target is a load (e.g. a GOT // load). We also need to check WAW depenancy with saved PC. 
@@ -1194,6 +1209,27 @@ return Modified; } +bool SIInsertWaitcnts::generateWaitcntInstAfter(MachineInstr &MI, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { + // Insert waitcnts after function calls (that are not tail calls) + if (!MI.isCall() || MI.isTerminator() || callWaitsOnFunctionReturn(MI)) + return false; + + auto I = ++MI.getIterator(); + // Don't insert waitcnt if this function returns immediately + if (I.isEnd() || I->isReturn()) + return false; + + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), I, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(0); + if (ST->hasVscnt()) + BuildMI(*MI.getParent(), I, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); + return true; +} + // This is a flat memory operation. Check to see if it has memory // tokens for both LDS and Memory, and if so mark it as a flat. bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { @@ -1269,13 +1305,8 @@ } else if (TII->isSMRD(Inst)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst); } else if (Inst.isCall()) { - if (callWaitsOnFunctionReturn(Inst)) { - // Act as a wait on everything - ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); - } else { - // May need to way wait for anything. - ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); - } + // Act as a wait on everything. Either the callee waits or we insert a wait. + ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: @@ -1452,6 +1483,7 @@ // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. 
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + Modified |= generateWaitcntInstAfter(Inst, ScoreBrackets, OldWaitcntInstr); OldWaitcntInstr = nullptr; updateEventWaitcntAfter(Inst, &ScoreBrackets); @@ -1629,7 +1661,7 @@ } } - if (!MFI->isEntryFunction()) { + if (!MFI->isEntryFunction() && callWaitsOnFunctionEntry()) { // Wait for any outstanding memory operations that the input registers may // depend on. We can't track them and it's better to the wait after the // costly call sequence. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -94,7 +94,6 @@ ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: @@ -103,7 +102,6 @@ ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 store i32 %lshr.8, i32 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -15,7 +15,6 @@ ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB0_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -42,7 +41,6 @@ ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB1_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; 
CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = icmp ne i32 %value, 0 @@ -71,7 +69,6 @@ ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB2_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %c = trunc i32 %value to i1 @@ -102,7 +99,6 @@ ; CHECK-NEXT: global_load_dword v0, v[0:1], off ; CHECK-NEXT: BB3_2: ; %endif ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: %value = load i32, i32 addrspace(1)* %ptr @@ -167,7 +163,6 @@ ; CHECK-NEXT: BB4_5: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: BB4_6: ; %bb12 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: %tmp = load i32, i32 addrspace(4)* @external_constant diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -73,7 +73,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align4: @@ -100,7 +99,6 @@ ; GFX10-NEXT: s_sub_u32 s32, s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %n = load i32, i32 addrspace(4)* @gv, align 4 %alloca = alloca i32, i32 %n, addrspace(5) @@ -177,7 +175,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_sub_u32 s32, s32, 0x400 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align16: @@ -204,7 +201,6 @@ ; 
GFX10-NEXT: s_sub_u32 s32, s32, 0x200 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %n = load i32, i32 addrspace(4)* @gv, align 16 %alloca = alloca i32, i32 %n, addrspace(5) @@ -285,7 +281,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_sub_u32 s32, s32, 0x1000 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: func_dynamic_stackalloc_sgpr_align32: @@ -314,7 +309,6 @@ ; GFX10-NEXT: s_sub_u32 s32, s32, 0x800 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %n = load i32, i32 addrspace(4)* @gv %alloca = alloca i32, i32 %n, align 32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -280,7 +280,6 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 %idx @@ -568,7 +567,6 @@ ; GCN-NEXT: s_mov_b32 s33, s6 ; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <128 x i16>, <128 x i16> addrspace(1)* %ptr %elt = extractelement <128 x i16> %vec, i32 %idx @@ -853,7 +851,6 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, 
off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b32 s33, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %vec = load <32 x i64>, <32 x i64> addrspace(1)* %ptr %elt = extractelement <32 x i64> %vec, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -558,7 +558,6 @@ ; GCN-LABEL: extractelement_sgpr_v4i128_idx0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(4)* %ptr %element = extractelement <4 x i128> %vector, i32 0 @@ -615,14 +614,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i128_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i128_idx0: @@ -632,7 +629,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i16.ll @@ -132,7 +132,6 @@ ; GCN-LABEL: extractelement_sgpr_v4i16_idx0: ; GCN: ; %bb.0: 
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog %vector = load <4 x i16>, <4 x i16> addrspace(4)* %ptr %element = extractelement <4 x i16> %vector, i32 0 @@ -180,21 +179,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v4i16_idx0: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <4 x i16>, <4 x i16> addrspace(1)* %ptr %element = extractelement <4 x i16> %vector, i32 0 @@ -457,7 +453,6 @@ ; GCN-LABEL: extractelement_sgpr_v8i16_idx0: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog %vector = load <8 x i16>, <8 x i16> addrspace(4)* %ptr %element = extractelement <8 x i16> %vector, i32 0 @@ -553,14 +548,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: extractelement_vgpr_v8i16_idx0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: extractelement_vgpr_v8i16_idx0: @@ -570,7 +563,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 
-; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %vector = load <8 x i16>, <8 x i16> addrspace(1)* %ptr %element = extractelement <8 x i16> %vector, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2530,7 +2530,6 @@ ; GPRIDX-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GPRIDX-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GPRIDX-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; ; MOVREL-LABEL: v_extract_v64i32_32: @@ -2543,7 +2542,6 @@ ; MOVREL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; MOVREL-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; MOVREL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; MOVREL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; MOVREL-NEXT: s_setpc_b64 s[30:31] %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %elt = extractelement <64 x i32> %vec, i32 32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmax_legacy.ll @@ -213,7 +213,6 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_write_b32 v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_test_fmax_legacy_ule_f32_multi_use: @@ -224,7 +223,6 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_write_b32 v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ogt float %a, %b %val0 = select i1 %cmp, float %a, float %b diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmin_legacy.ll @@ -196,7 +196,6 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_write_b32 v0, v1 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_test_fmin_legacy_ule_f32_multi_use: @@ -207,7 +206,6 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_write_b32 v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] %cmp = fcmp ule float %a, %b %val0 = select i1 %cmp, float %a, float %b diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image_ls_mipmap_zero.ll @@ -14,7 +14,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1d: @@ -29,7 +28,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -48,7 +46,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2d: @@ -63,7 +60,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -82,7 +78,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_3d: @@ -97,7 +92,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -116,7 +110,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1darray: @@ -131,7 +124,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -150,7 +142,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2darray: @@ -165,7 +156,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 
15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) @@ -184,7 +174,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_cube: @@ -199,7 +188,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %u, i32 0, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -826,7 +826,6 @@ ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; ; MOVREL-LABEL: dyn_insertelement_v8f64_const_s_v_v: @@ -891,7 +890,6 @@ ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_setpc_b64 s[30:31] entry: %insert = insertelement <8 x double> , double %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll @@ -17,7 +17,6 @@ ; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: 
ds_write_b32 v0, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: func_use_lds_global: @@ -27,7 +26,6 @@ ; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX9-NEXT: s_trap 2 ; GFX9-NEXT: ds_write_b32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] store float 0.0, float addrspace(3)* @lds, align 4 ret void @@ -41,7 +39,6 @@ ; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 ; GFX8-NEXT: flat_store_dword v[0:1], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: func_use_lds_global_constexpr_cast: @@ -50,7 +47,6 @@ ; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX9-NEXT: s_trap 2 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] store i32 ptrtoint (float addrspace(3)* @lds to i32), i32 addrspace(1)* undef, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll @@ -9,7 +9,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ds_fadd_f32_ss: @@ -17,7 +16,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -30,7 +28,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ds_fadd_f32_ss_offset: @@ -38,7 +35,6 @@ ; GFX9-NEXT: 
v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) @@ -90,14 +86,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fadd_f32_vv: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -109,14 +103,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fadd_f32_vv_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) @@ -129,14 +121,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fadd_f32_vv_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float 
%val, i32 0, i32 0, i1 false) ret void @@ -148,14 +138,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fadd_f32_vv_offset_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) @@ -168,14 +156,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fadd_f32_vv_volatile: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true) ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll @@ -15,7 +15,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ds_fmax_f32_ss: @@ -23,7 +22,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss ; GFX8-MIR: bb.1 (%ir-block.0): @@ -57,7 +55,6 @@ ; 
GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ds_fmax_f32_ss_offset: @@ -65,7 +62,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset ; GFX8-MIR: bb.1 (%ir-block.0): @@ -176,14 +172,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmax_f32_vv: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv ; GFX8-MIR: bb.1 (%ir-block.0): @@ -216,14 +210,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmax_f32_vv_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset ; GFX8-MIR: bb.1 (%ir-block.0): @@ -257,14 +249,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmax_f32_vv_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX8-MIR-LABEL: name: 
ds_fmax_f32_vv_nortn ; GFX8-MIR: bb.1 (%ir-block.0): @@ -295,14 +285,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmax_f32_vv_offset_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_offset_nortn ; GFX8-MIR: bb.1 (%ir-block.0): @@ -334,14 +322,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmax_f32_vv_volatile: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX8-MIR-LABEL: name: ds_fmax_f32_vv_volatile ; GFX8-MIR: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll @@ -9,7 +9,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ds_fmin_f32_ss: @@ -17,7 +16,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -30,7 +28,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_mov_b32 m0, 
-1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: ds_fmin_f32_ss_offset: @@ -38,7 +35,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) @@ -90,14 +86,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmin_f32_vv: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret float %ret @@ -109,14 +103,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmin_f32_vv_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) @@ -129,14 +121,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmin_f32_vv_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 false) ret void @@ -148,14 +138,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmin_f32_vv_offset_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 offset:512 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(3)* %ptr, i32 128 %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false) @@ -168,14 +156,12 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: ds_fmin_f32_vv_volatile: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %ret = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr, float %val, i32 0, i32 0, i1 true) ret float %ret diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -9,7 +9,6 @@ ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc ; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 
addrspace(1)* %ptr, i32 %data) ret i32 %ret @@ -28,7 +27,6 @@ ; GCN-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v3 ; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) @@ -42,7 +40,6 @@ ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc ; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %ptr, i32 %data) ret void @@ -61,7 +58,6 @@ ; GCN-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v3 ; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo ; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll @@ -6,7 +6,6 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data) ret void @@ -23,7 +22,6 @@ ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 
s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 512 call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) @@ -41,7 +39,6 @@ ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511 call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data) @@ -71,7 +68,6 @@ ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) ret void @@ -88,7 +84,6 @@ ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc ; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off -; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511 call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -14,7 +14,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_swap_i32_1d: @@ -29,7 +28,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: 
image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -49,7 +47,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1d: @@ -64,7 +61,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -84,7 +80,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_sub_i32_1d: @@ -99,7 +94,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -119,7 +113,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smin_i32_1d: @@ -134,7 +127,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -154,7 +146,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umin_i32_1d: @@ -169,7 +160,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -189,7 +179,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smax_i32_1d: @@ -204,7 +193,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -224,7 +212,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umax_i32_1d: @@ -239,7 +226,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = 
call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -259,7 +245,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_and_i321d: @@ -274,7 +259,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -294,7 +278,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_or_i32_1d: @@ -309,7 +292,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -329,7 +311,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_xor_i32_1d: @@ -344,7 +325,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 
0) @@ -364,7 +344,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_inc_i32_1d: @@ -379,7 +358,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -399,7 +377,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_dec_i32_1d: @@ -414,7 +391,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -434,7 +410,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_cmpswap_i32_1d: @@ -449,7 +424,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i16(i32 %cmp, i32 %swap, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -472,7 +446,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; 
GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2d: @@ -489,7 +462,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 %data, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -514,7 +486,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_3d: @@ -534,7 +505,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i16(i32 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -559,7 +529,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_cube: @@ -579,7 +548,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i16(i32 %data, i16 %s, i16 %t, i16 %face, <8 x i32> %rsrc, i32 0, i32 0) @@ -602,7 +570,6 @@ ; GFX9-NEXT: s_mov_b32 
s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1darray: @@ -619,7 +586,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i16(i32 %data, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -644,7 +610,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darray: @@ -664,7 +629,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i16(i32 %data, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -689,7 +653,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s8 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: @@ -709,7 +672,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i16(i32 %data, i16 %s, i16 %t, i16 %fragid, <8 x i32> 
%rsrc, i32 0, i32 0) @@ -734,7 +696,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v3, v5, v2 ; GFX9-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: @@ -754,7 +715,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i16(i32 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -774,7 +734,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1d_slc: @@ -789,7 +748,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc slc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -809,7 +767,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_swap_i64_1d: @@ -824,7 +781,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i16(i64 %data, i16 
%s, <8 x i32> %rsrc, i32 0, i32 0) @@ -844,7 +800,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1d: @@ -859,7 +814,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -879,7 +833,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_sub v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_sub_i64_1d: @@ -894,7 +847,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_sub v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.sub.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -914,7 +866,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_smin v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smin_i64_1d: @@ -929,7 +880,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smin v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smin.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -949,7 +899,6 @@ ; 
GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_umin v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umin_i64_1d: @@ -964,7 +913,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umin v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umin.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -984,7 +932,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_smax v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smax_i64_1d: @@ -999,7 +946,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smax v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smax.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1019,7 +965,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_umax v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umax_i64_1d: @@ -1034,7 +979,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umax v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umax.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1054,7 +998,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, 
s9 ; GFX9-NEXT: image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_and_i64_1d: @@ -1069,7 +1012,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.and.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1089,7 +1031,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_or_i64_1d: @@ -1104,7 +1045,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.or.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1124,7 +1064,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_xor_i64_1d: @@ -1139,7 +1078,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.xor.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1159,7 +1097,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_inc v[0:1], v2, s[0:7] dmask:0x3 
unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_inc_i64_1d: @@ -1174,7 +1111,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_inc v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.inc.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1194,7 +1130,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_dec v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_dec_i64_1d: @@ -1209,7 +1144,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_dec v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.dec.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1229,7 +1163,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_cmpswap_i64_1d: @@ -1244,7 +1177,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i16(i64 %cmp, i64 %swap, i16 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1267,7 +1199,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2d: @@ -1284,7 +1215,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i16(i64 %data, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -1309,7 +1239,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_3d: @@ -1329,7 +1258,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.3d.i64.i16(i64 %data, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -1354,7 +1282,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_cube: @@ -1374,7 +1301,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.cube.i64.i16(i64 %data, i16 %s, i16 %t, i16 %face , <8 x i32> %rsrc, i32 0, i32 0) @@ -1397,7 +1323,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: image_atomic_add 
v[0:1], v2, s[0:7] dmask:0x3 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1darray: @@ -1414,7 +1339,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1darray.i64.i16(i64 %data, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1439,7 +1363,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darray: @@ -1459,7 +1382,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darray.i64.i16(i64 %data, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1484,7 +1406,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s8 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: @@ -1504,7 +1425,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2dmsaa.i64.i16(i64 %data, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1529,7 +1449,6 @@ ; 
GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX9-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: @@ -1549,7 +1468,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darraymsaa.i64.i16(i64 %data, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1569,7 +1487,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc slc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1d_slc: @@ -1584,7 +1501,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc slc a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i16(i64 %data, i16 %s, <8 x i32> %rsrc, i32 0, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -16,7 +16,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_swap_i32_1d: @@ -30,7 +29,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: 
s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_swap_i32_1d: @@ -45,7 +43,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -65,7 +62,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_1d: @@ -79,7 +75,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1d: @@ -94,7 +89,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -114,7 +108,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_sub_i32_1d: @@ -128,7 +121,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_sub_i32_1d: @@ -143,7 +135,6 @@ 
; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -163,7 +154,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_smin_i32_1d: @@ -177,7 +167,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smin_i32_1d: @@ -192,7 +181,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -212,7 +200,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_umin_i32_1d: @@ -226,7 +213,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umin_i32_1d: @@ -241,7 +227,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -261,7 +246,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_smax_i32_1d: @@ -275,7 +259,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smax_i32_1d: @@ -290,7 +273,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -310,7 +292,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_umax_i32_1d: @@ -324,7 +305,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umax_i32_1d: @@ -339,7 +319,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -359,7 +338,6 @@ ; GFX6-NEXT: 
s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_and_i32_1d: @@ -373,7 +351,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_and_i32_1d: @@ -388,7 +365,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -408,7 +384,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_or_i32_1d: @@ -422,7 +397,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_or_i32_1d: @@ -437,7 +411,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -457,7 +430,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: 
atomic_xor_i32_1d: @@ -471,7 +443,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_xor_i32_1d: @@ -486,7 +457,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -506,7 +476,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_inc_i32_1d: @@ -520,7 +489,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_inc_i32_1d: @@ -535,7 +503,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -555,7 +522,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_dec_i32_1d: @@ -569,7 +535,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; 
return to shader part epilog ; ; GFX10-LABEL: atomic_dec_i32_1d: @@ -584,7 +549,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -604,7 +568,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_cmpswap_i32_1d: @@ -618,7 +581,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_cmpswap_i32_1d: @@ -633,7 +595,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -653,7 +614,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_2d: @@ -667,7 +627,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2d: @@ -682,7 +641,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; 
GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -702,7 +660,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_3d: @@ -716,7 +673,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_3d: @@ -731,7 +687,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -751,7 +706,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_cube: @@ -765,7 +719,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_cube: @@ -780,7 +733,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to 
shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) @@ -800,7 +752,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_1darray: @@ -814,7 +765,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1darray: @@ -829,7 +779,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -849,7 +798,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_2darray: @@ -863,7 +811,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darray: @@ -878,7 +825,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 
%s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -898,7 +844,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_2dmsaa: @@ -912,7 +857,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: @@ -927,7 +871,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -947,7 +890,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_2darraymsaa: @@ -961,7 +903,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: @@ -976,7 +917,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -996,7 +936,6 
@@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i32_1d_slc: @@ -1010,7 +949,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 unorm glc slc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i32_1d_slc: @@ -1025,7 +963,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc slc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -1045,7 +982,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_swap_i64_1d: @@ -1059,7 +995,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_swap_i64_1d: @@ -1074,7 +1009,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1094,7 +1028,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_1d: @@ -1108,7 +1041,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1d: @@ -1123,7 +1055,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1143,7 +1074,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_sub v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_sub_i64_1d: @@ -1157,7 +1087,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_sub v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_sub_i64_1d: @@ -1172,7 +1101,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_sub v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.sub.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1192,7 +1120,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_smin v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_smin_i64_1d: @@ -1206,7 +1133,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; 
GFX8-NEXT: image_atomic_smin v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_smin_i64_1d: @@ -1221,7 +1147,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smin v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smin.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1241,7 +1166,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_umin v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_umin_i64_1d: @@ -1255,7 +1179,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_umin v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umin_i64_1d: @@ -1270,7 +1193,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umin v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umin.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1290,7 +1212,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_smax v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_smax_i64_1d: @@ -1304,7 +1225,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_smax v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
atomic_smax_i64_1d: @@ -1319,7 +1239,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_smax v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.smax.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1339,7 +1258,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_umax v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_umax_i64_1d: @@ -1353,7 +1271,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_umax v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_umax_i64_1d: @@ -1368,7 +1285,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_umax v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.umax.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1388,7 +1304,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_and_i64_1d: @@ -1402,7 +1317,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_and v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_and_i64_1d: @@ -1417,7 +1331,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_and v[0:1], v2, s[0:7] 
dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.and.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1437,7 +1350,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_or_i64_1d: @@ -1451,7 +1363,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_or_i64_1d: @@ -1466,7 +1377,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_or v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.or.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1486,7 +1396,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_xor_i64_1d: @@ -1500,7 +1409,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_xor_i64_1d: @@ -1515,7 +1423,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_xor v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 
@llvm.amdgcn.image.atomic.xor.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1535,7 +1442,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_inc v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_inc_i64_1d: @@ -1549,7 +1455,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_inc v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_inc_i64_1d: @@ -1564,7 +1469,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_inc v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.inc.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1584,7 +1488,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_dec v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_dec_i64_1d: @@ -1598,7 +1501,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_dec v[0:1], v2, s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_dec_i64_1d: @@ -1613,7 +1515,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_dec v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.dec.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1633,7 +1534,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; 
GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_cmpswap_i64_1d: @@ -1647,7 +1547,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_cmpswap_i64_1d: @@ -1662,7 +1561,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -1682,7 +1580,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_2d: @@ -1696,7 +1593,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2d: @@ -1711,7 +1607,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2d.i64.i32(i64 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -1731,7 +1626,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; 
GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_3d: @@ -1745,7 +1639,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_3d: @@ -1760,7 +1653,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.3d.i64.i32(i64 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -1780,7 +1672,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_cube: @@ -1794,7 +1685,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_cube: @@ -1809,7 +1699,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_CUBE unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.cube.i64.i32(i64 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) @@ -1829,7 +1718,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_1darray: @@ -1843,7 
+1731,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1darray: @@ -1858,7 +1745,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:3], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D_ARRAY unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1darray.i64.i32(i64 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1878,7 +1764,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_2darray: @@ -1892,7 +1777,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darray: @@ -1907,7 +1791,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_ARRAY unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darray.i64.i32(i64 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -1927,7 +1810,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_2dmsaa: @@ -1941,7 +1823,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, 
s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 unorm glc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: @@ -1956,7 +1837,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:4], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2dmsaa.i64.i32(i64 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1976,7 +1856,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v[2:5], s[0:7] dmask:0x3 unorm glc da -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_2darraymsaa: @@ -1990,7 +1869,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add v[0:1], v[2:5], s[0:7] dmask:0x3 unorm glc da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: @@ -2005,7 +1883,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v[2:5], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm glc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.2darraymsaa.i64.i32(i64 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -2025,7 +1902,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 unorm glc slc -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: atomic_add_i64_1d_slc: @@ -2039,7 +1915,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_atomic_add 
v[0:1], v2, s[0:7] dmask:0x3 unorm glc slc -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_add_i64_1d_slc: @@ -2054,7 +1929,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_atomic_add v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc slc -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call i64 @llvm.amdgcn.image.atomic.add.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -24,7 +24,6 @@ ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2d: @@ -48,7 +47,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v0, v0, 0xffff, v1 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -79,7 +77,6 @@ ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_cube: @@ -106,7 +103,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; 
GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -137,7 +133,6 @@ ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2darray: @@ -164,7 +159,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -193,7 +187,6 @@ ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_2d: @@ -217,7 +210,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -248,7 +240,6 @@ ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; 
GFX9-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_cl_2d: @@ -275,7 +266,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -306,7 +296,6 @@ ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_cl_2d: @@ -333,7 +322,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -362,7 +350,6 @@ ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_b_2d: @@ -386,7 +373,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v1, v1, 0xffff, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; 
GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -415,7 +401,6 @@ ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_b_2d: @@ -439,7 +424,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v2, v2, 0xffff, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -470,7 +454,6 @@ ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_b_cl_2d: @@ -497,7 +480,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -528,7 +510,6 @@ ; GFX9-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX9-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 a16 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: @@ -555,7 +536,6 @@ ; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -583,7 +563,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s12 ; GFX9-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_l_2d: @@ -607,7 +586,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -635,7 +613,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, s12 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_l_2d: @@ -659,7 +636,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -685,7 +661,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_lz_2d: @@ -706,7 +681,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -732,7 +706,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_lz_2d: @@ -753,7 +726,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f16(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -21,7 +21,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, 
exec, s[14:15] ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2d: @@ -43,7 +42,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -69,7 +67,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_cube: @@ -91,7 +88,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -117,7 +113,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2darray: @@ -139,7 +134,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -165,7 +159,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_2d: @@ -187,7 +180,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -213,7 +205,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_cl_2d: @@ -235,7 +226,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -261,7 +251,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_cl_2d: @@ -283,7 +272,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; 
GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -309,7 +297,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_b_2d: @@ -331,7 +318,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -357,7 +343,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_b_2d: @@ -379,7 +364,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -405,7 +389,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec 
; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_b_cl_2d: @@ -427,7 +410,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -453,7 +435,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_b_cl_2d: @@ -475,7 +456,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -498,7 +478,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_l_2d: @@ -517,7 +496,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: 
s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -540,7 +518,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_l_2d: @@ -559,7 +536,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -582,7 +558,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_lz_2d: @@ -601,7 +576,6 @@ ; GFX10NSA-NEXT: s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -624,7 +598,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_c_lz_2d: @@ -643,7 +616,6 @@ ; GFX10NSA-NEXT: 
s_mov_b32 s11, s13 ; GFX10NSA-NEXT: ; implicit-def: $vcc_hi ; GFX10NSA-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -669,7 +641,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2d_dmask_2: @@ -691,7 +662,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -717,7 +687,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2d_dmask_4: @@ -739,7 +708,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -765,7 +733,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: 
image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10NSA-LABEL: gather4_2d_dmask_8: @@ -787,7 +754,6 @@ ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D -; GFX10NSA-NEXT: s_waitcnt vmcnt(0) ; GFX10NSA-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.o.dim.ll @@ -21,7 +21,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_o_2d: @@ -43,7 +42,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -69,7 +67,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_o_2d: @@ -91,7 +88,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; 
GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -117,7 +113,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_cl_o_2d: @@ -139,7 +134,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_cl_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -165,7 +159,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_cl_o_2d: @@ -187,7 +180,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -213,7 +205,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; 
GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_b_o_2d: @@ -235,7 +226,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -261,7 +251,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_b_o_2d: @@ -283,7 +272,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -306,7 +294,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_b_cl_o_2d: @@ -325,7 +312,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader 
part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -351,7 +337,6 @@ ; GFX6-NEXT: s_wqm_b64 exec, exec ; GFX6-NEXT: s_and_b64 exec, exec, s[14:15] ; GFX6-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_b_cl_o_2d: @@ -373,7 +358,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10-NEXT: image_gather4_c_b_cl_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.2d.v4f32.f32.f32(i32 1, i32 %offset, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -396,7 +380,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_l_o_2d: @@ -415,7 +398,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_l_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -438,7 +420,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
gather4_c_l_o_2d: @@ -457,7 +438,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_c_l_o v[0:3], v[0:7], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -480,7 +460,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_lz_o_2d: @@ -499,7 +478,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -522,7 +500,6 @@ ; GFX6-NEXT: s_mov_b32 s10, s12 ; GFX6-NEXT: s_mov_b32 s11, s13 ; GFX6-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_lz_o_2d: @@ -541,7 +518,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 %offset, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.a16.ll @@ -14,7 +14,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1d: @@ -29,7 +28,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -48,7 +46,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2d: @@ -63,7 +60,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -82,7 +78,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_3d: @@ -97,7 +92,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_3D unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -116,7 +110,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_cube: @@ -131,7 +124,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -150,7 +142,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1darray: @@ -165,7 +156,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -184,7 +174,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darray: @@ -199,7 +188,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 -; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -218,7 +206,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2dmsaa: @@ -233,7 +220,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -252,7 +238,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darraymsaa: @@ -267,7 +252,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.getresinfo.ll @@ -15,7 +15,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_1d: @@ -29,7 +28,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1d: @@ -44,7 +42,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -63,7 +60,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_2d: @@ -77,7 +73,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2d: @@ -92,7 +87,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -111,7 +105,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_3d: @@ -125,7 +118,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf 
unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_3d: @@ -140,7 +132,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -159,7 +150,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_cube: @@ -173,7 +163,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_cube: @@ -188,7 +177,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -207,7 +195,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_1darray: @@ -221,7 +208,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1darray: @@ -236,7 +222,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; 
implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -255,7 +240,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_2darray: @@ -269,7 +253,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darray: @@ -284,7 +267,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -303,7 +285,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_2dmsaa: @@ -317,7 +298,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2dmsaa: @@ -332,7 +312,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to 
shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -351,7 +330,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: getresinfo_2darraymsaa: @@ -365,7 +343,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darraymsaa: @@ -380,7 +357,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -16,7 +16,6 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm d16 -; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_f16_x: @@ -30,7 +29,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_f16_x: @@ -44,7 +42,6 @@ ; GFX9-NEXT: 
s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f16_x: @@ -59,7 +56,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -77,7 +73,6 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm d16 -; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_f16_y: @@ -91,7 +86,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_f16_y: @@ -105,7 +99,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f16_y: @@ -120,7 +113,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -138,7 +130,6 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm d16 -; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: ; return to shader 
part epilog ; ; GFX8-PACKED-LABEL: load_1d_f16_z: @@ -152,7 +143,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_f16_z: @@ -166,7 +156,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f16_z: @@ -181,7 +170,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -199,7 +187,6 @@ ; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-UNPACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm d16 -; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: ; return to shader part epilog ; ; GFX8-PACKED-LABEL: load_1d_f16_w: @@ -213,7 +200,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_f16_w: @@ -227,7 +213,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f16_w: @@ -242,7 +227,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader 
part epilog %v = call half @llvm.amdgcn.image.load.1d.half.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret half %v @@ -277,7 +261,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x3 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v2f16_xy: @@ -291,7 +274,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x3 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f16_xy: @@ -306,7 +288,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -341,7 +322,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x5 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v2f16_xz: @@ -355,7 +335,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x5 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f16_xz: @@ -370,7 +349,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -405,7 +383,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; 
GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x9 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v2f16_xw: @@ -419,7 +396,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x9 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f16_xw: @@ -434,7 +410,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -469,7 +444,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v0, v0, s[0:7] dmask:0x6 unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: load_1d_v2f16_yz: @@ -483,7 +457,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x6 unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f16_yz: @@ -498,7 +471,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x half> @llvm.amdgcn.image.load.1d.v2f16.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x half> %v @@ -543,7 +515,6 @@ ; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 ; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 ; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf unorm d16 -; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-PACKED-NEXT: ; return to shader part 
epilog ; ; GFX9-LABEL: load_1d_v4f16_xyzw: @@ -557,7 +528,6 @@ ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf unorm d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v4f16_xyzw: @@ -572,7 +542,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x half> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.ll @@ -15,7 +15,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_f32_x: @@ -29,7 +28,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f32_x: @@ -44,7 +42,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 1, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -62,7 +59,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part 
epilog ; ; GFX8-LABEL: load_1d_f32_y: @@ -76,7 +72,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x2 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f32_y: @@ -91,7 +86,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -109,7 +103,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_f32_z: @@ -123,7 +116,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x4 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f32_z: @@ -138,7 +130,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 4, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -156,7 +147,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_f32_w: @@ -170,7 +160,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_f32_w: @@ -185,7 +174,6 @@ ; GFX10-NEXT: 
s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret float %v @@ -203,7 +191,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v2f32_xy: @@ -217,7 +204,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f32_xy: @@ -232,7 +218,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 3, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -250,7 +235,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v2f32_xz: @@ -264,7 +248,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f32_xz: @@ -279,7 +262,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x5 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x float> 
@llvm.amdgcn.image.load.1d.v2f32.i32(i32 5, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -297,7 +279,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v2f32_xw: @@ -311,7 +292,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f32_xw: @@ -326,7 +306,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -344,7 +323,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v2f32_yz: @@ -358,7 +336,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v2f32_yz: @@ -373,7 +350,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <2 x float> %v @@ -391,7 +367,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm -; GFX6-NEXT: 
s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v3f32_xyz: @@ -405,7 +380,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v3f32_xyz: @@ -420,7 +394,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <3 x float> @llvm.amdgcn.image.load.1d.v3f32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x float> %v @@ -438,7 +411,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: load_1d_v4f32_xyzw: @@ -452,7 +424,6 @@ ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 ; GFX8-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_v4f32_xyzw: @@ -467,7 +438,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -14,7 +14,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:3], 
v[0:1], s[0:7] dmask:0xf unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d_v4f32_xyzw: @@ -29,7 +28,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -53,7 +51,6 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d_v4f32_xyzw_tfe: @@ -72,7 +69,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 @@ -99,7 +95,6 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d_v4f32_xyzw_tfe_lwe: @@ -118,7 +113,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -19,7 +19,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: @@ -39,7 +38,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -66,7 +64,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[5:6], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: @@ -90,7 +87,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 @@ -120,7 +116,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[5:6], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: @@ -144,7 +139,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -14,7 +14,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: @@ -29,7 +28,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -53,7 +51,6 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: @@ -72,7 +69,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 
%slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 @@ -99,7 +95,6 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: @@ -118,7 +113,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2darraymsaa.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -19,7 +19,6 @@ ; GFX9-NEXT: s_mov_b32 s7, s9 ; GFX9-NEXT: v_and_or_b32 v1, v2, v3, s8 ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw: @@ -39,7 +38,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -66,7 +64,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[5:6], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; 
GFX10-LABEL: load_3d_v4f32_xyzw_tfe: @@ -90,7 +87,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 @@ -120,7 +116,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[5:6], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: @@ -144,7 +139,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -14,7 +14,6 @@ ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 ; GFX6-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw: @@ -29,7 +28,6 @@ ; GFX10-NEXT: s_mov_b32 s7, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, 
<8 x i32> %rsrc, i32 0, i32 0) ret <4 x float> %v @@ -53,7 +51,6 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: @@ -72,7 +69,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 1, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 @@ -99,7 +95,6 @@ ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: @@ -118,7 +113,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[5:6], v4, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog %v = call { <4 x float>, i32 } @llvm.amdgcn.image.load.3d.sl_v4f32i32s.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) %v.vec = extractvalue { <4 x float>, i32 } %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -10,7 +10,6 @@ ; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 ; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = 
call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -27,7 +26,6 @@ ; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1 ; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -47,7 +45,6 @@ ; GFX10-NEXT: v_and_or_b32 v2, v3, v11, v4 ; GFX10-NEXT: v_and_or_b32 v3, v5, v11, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -63,7 +60,6 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 ; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -80,7 +76,6 @@ ; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -96,7 +91,6 @@ ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -113,7 +107,6 @@ ; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1 ; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -129,7 +122,6 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 ; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -146,7 +138,6 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2 ; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = 
call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -162,7 +153,6 @@ ; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 ; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -179,7 +169,6 @@ ; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1 ; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -195,7 +184,6 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 ; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -212,7 +200,6 @@ ; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2 ; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -228,7 +215,6 @@ ; GFX10-NEXT: v_and_or_b32 v0, v0, v7, s12 ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -245,7 +231,6 @@ ; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1 ; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -261,7 +246,6 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, v7, s12 ; GFX10-NEXT: v_and_or_b32 v2, v2, v7, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -278,7 +262,6 @@ ; GFX10-NEXT: v_and_or_b32 v1, v1, v8, v2 ; GFX10-NEXT: v_and_or_b32 v2, v3, v8, v10 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v2, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog 
main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -295,7 +278,6 @@ ; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3 ; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -312,7 +294,6 @@ ; GFX10-NEXT: v_and_or_b32 v2, v2, v9, v3 ; GFX10-NEXT: v_and_or_b32 v3, v4, v9, v11 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v3, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.a16.ll @@ -18,7 +18,6 @@ ; GFX9-NEXT: s_mov_b32 s10, s12 ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_1d: @@ -37,7 +36,6 @@ ; GFX10-NEXT: s_mov_b32 
s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -63,7 +61,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_2d: @@ -84,7 +81,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half -0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -110,7 +106,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s12 ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_1d: @@ -131,7 +126,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half -2.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -157,7 +151,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_2d: @@ -178,7 +171,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -204,7 +196,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s12 ; GFX9-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_o_1d: @@ -225,7 +216,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f16(i32 15, i32 %offset, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -251,7 +241,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_o_2d: @@ -272,7 +261,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f16(i32 15, i32 %offset, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 
x i32> %samp, i1 false, i32 0, i32 0) @@ -298,7 +286,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v2, v3, s12 ; GFX9-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_o_1d: @@ -319,7 +306,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -345,7 +331,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_o_2d: @@ -366,7 +351,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -392,7 +376,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_l_2d: @@ -413,7 +396,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 15, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -439,7 +421,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_l_2d: @@ -460,7 +441,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -486,7 +466,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 ; GFX9-NEXT: image_gather4_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_l_o_2d: @@ -507,7 +486,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f16(i32 15, i32 %offset, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -533,7 +511,6 @@ ; GFX9-NEXT: s_mov_b32 s11, s13 ; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GFX9-NEXT: image_gather4_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to 
shader part epilog ; ; GFX10-LABEL: gather4_c_l_o_2d: @@ -554,7 +531,6 @@ ; GFX10-NEXT: s_mov_b32 s11, s13 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: image_gather4_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f16(i32 15, i32 %offset, float %zcompare, half %s, half %t, half 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.ltolz.ll @@ -17,7 +17,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -40,7 +39,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float -0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -63,7 +61,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float -2.000000e+00, 
<8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -86,7 +83,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -109,7 +105,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -132,7 +127,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -155,7 +149,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_c_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -178,7 +171,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: 
%v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -201,7 +193,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 15, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -224,7 +215,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -247,7 +237,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_gather4_lz_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -270,7 +259,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s13 ; GCN-NEXT: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.000000e+00, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -12,7 +12,6 @@ ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -75,7 +74,6 @@ ; GFX7-UNALIGNED-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-UNALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -150,7 +148,6 @@ ; GFX9-UNALIGNED: ; %bb.0: ; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -187,7 +184,6 @@ ; GFX7-UNALIGNED-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-UNALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -231,7 +227,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_constant_v3i32_align4: @@ -241,7 +236,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, 
align 4 ret <3 x i32> %load @@ -252,7 +246,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_constant_i96_align8: @@ -262,7 +255,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load i96, i96 addrspace(4)* %ptr, align 8 ret i96 %load @@ -273,7 +265,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_constant_v3i32_align8: @@ -283,7 +274,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 8 ret <3 x i32> %load @@ -294,7 +284,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_constant_v6i16_align8: @@ -368,7 +357,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_load_constant_v3i32_align16: @@ -378,7 +366,6 @@ ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 16 ret <3 x i32> %load @@ -691,7 +678,6 @@ ; GFX9-NEXT: s_mov_b32 s3, s1 ; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_v3i32_align4: @@ -700,7 +686,6 @@ ; GFX7-NEXT: s_mov_b32 s3, s1 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 4 ret <3 x i32> %load @@ -713,7 +698,6 @@ ; GFX9-NEXT: s_mov_b32 s3, s1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_i96_align8: @@ -722,7 +706,6 @@ ; GFX7-NEXT: s_mov_b32 s3, s1 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %load = load i96, i96 addrspace(4)* %ptr, align 8 ret i96 %load @@ -735,7 +718,6 @@ ; GFX9-NEXT: s_mov_b32 s3, s1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_v3i32_align8: @@ -744,7 +726,6 @@ ; GFX7-NEXT: s_mov_b32 s3, s1 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 8 ret <3 x i32> %load @@ -757,7 +738,6 @@ ; GFX9-NEXT: s_mov_b32 s3, s1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_load_dword s2, s[2:3], 0x8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_load_constant_v6i16_align8: @@ -766,7 +746,6 @@ ; GFX7-NEXT: s_mov_b32 s3, s1 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; 
GFX7-NEXT: s_load_dword s2, s[2:3], 0x2 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %load = load <6 x i16>, <6 x i16> addrspace(4)* %ptr, align 8 %cast = bitcast <6 x i16> %load to <3 x i32> @@ -817,7 +796,6 @@ ; GCN-LABEL: s_load_constant_v3i32_align16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog %load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 16 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.128.ll @@ -10,7 +10,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32: @@ -18,7 +17,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b128 v[0:3], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr ret <4 x i32> %load @@ -245,7 +243,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align4: @@ -255,7 +252,6 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 ret <4 x i32> %load @@ -266,7 +262,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; 
; GFX7-LABEL: load_lds_v4i32_align8: @@ -274,7 +269,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 ret <4 x i32> %load @@ -285,7 +279,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align16: @@ -293,7 +286,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b128 v[0:3], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 ret <4 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-local.96.ll @@ -10,7 +10,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32: @@ -18,7 +17,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b96 v[0:2], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr ret <3 x i32> %load @@ -203,7 +201,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align4: @@ -213,7 +210,6 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 ret <3 x i32> %load @@ -224,7 +220,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align8: @@ -234,7 +229,6 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b64 v[0:1], v0 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 ret <3 x i32> %load @@ -245,7 +239,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align16: @@ -253,7 +246,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b96 v[0:2], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -11,7 +11,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align1: @@ -99,7 +98,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align1: @@ -171,7 +169,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_write_b128 
v0, v[1:4] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: store_lds_v4i32_align1: @@ -206,7 +203,6 @@ ; GFX7-NEXT: ds_write_b8 v0, v1 offset:13 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:14 ; GFX7-NEXT: ds_write_b8 v0, v3 offset:15 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1 ret void @@ -217,7 +213,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_write_b96 v0, v[1:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: store_lds_v3i32_align1: @@ -245,7 +240,6 @@ ; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 ; GFX7-NEXT: ds_write_b8 v0, v2 offset:10 ; GFX7-NEXT: ds_write_b8 v0, v4 offset:11 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -203,7 +203,6 @@ ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: BB2_4: ; %bb2 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] entry: br i1 %cond, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -469,7 +469,6 @@ ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr: @@ -479,7 +478,6 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, off, 
s[0:3], 0 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %val = load volatile float, float addrspace(1)* %ptr ret float %val @@ -494,7 +492,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095: @@ -505,7 +502,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4095 %val = load volatile float, float addrspace(1)* %gep @@ -524,7 +520,6 @@ ; GFX6-NEXT: s_mov_b32 s2, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967296: @@ -538,7 +533,6 @@ ; GFX7-NEXT: s_mov_b32 s2, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296 %val = load volatile float, float addrspace(1)* %gep @@ -557,7 +551,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4294967297: @@ -571,7 +564,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297 %val = load volatile float, float addrspace(1)* %gep @@ 
-587,7 +579,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x4000 ; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], s4 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4096: @@ -598,7 +589,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x4000 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s4 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4096 %val = load volatile float, float addrspace(1)* %gep @@ -613,7 +603,6 @@ ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4095: @@ -623,7 +612,6 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4095 %val = load volatile float, float addrspace(1)* %gep @@ -638,7 +626,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4294967296: @@ -648,7 +635,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967296 %val = load volatile float, float addrspace(1)* %gep @@ -663,7 +649,6 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt 
vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4294967297: @@ -673,7 +658,6 @@ ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4294967297 %val = load volatile float, float addrspace(1)* %gep @@ -688,7 +672,6 @@ ; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: s_movk_i32 s4, 0x4000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_offset4096: @@ -698,7 +681,6 @@ ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: s_movk_i32 s4, 0x4000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i64 4096 %val = load volatile float, float addrspace(1)* %gep @@ -717,7 +699,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_sgpr_offset: @@ -731,7 +712,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset %val = load volatile float, float addrspace(1)* %gep @@ -746,7 +726,6 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: @@ -756,7 +735,6 @@ ; GFX7-NEXT: s_mov_b32 s2, 
0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %soffset %val = load volatile float, float addrspace(1)* %gep @@ -771,7 +749,6 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: @@ -781,7 +758,6 @@ ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %soffset %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 256 @@ -803,7 +779,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: @@ -819,7 +794,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256 %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 %soffset @@ -837,7 +811,6 @@ ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_vgpr_offset: @@ -849,7 +822,6 @@ ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; 
GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset %val = load volatile float, float addrspace(1)* %gep @@ -867,7 +839,6 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_vgpr_offset_offset4095: @@ -880,7 +851,6 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset %gep1 = getelementptr float, float addrspace(1)* %gep0, i64 4095 @@ -897,7 +867,6 @@ ; GFX6-NEXT: s_addc_u32 s5, s3, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset: @@ -909,7 +878,6 @@ ; GFX7-NEXT: s_addc_u32 s5, s3, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095 %gep1 = getelementptr float, float addrspace(1)* %gep0, i32 %voffset @@ -930,7 +898,6 @@ ; GFX6-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4095: @@ -968,7 +935,6 @@ ; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; 
GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: @@ -1006,7 +972,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4095: @@ -1041,7 +1006,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_vgpr_ptr_offset4294967296: @@ -1078,7 +1042,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: @@ -1116,7 +1079,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v1 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4095: @@ -1157,7 +1119,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v1 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_cmpxchg_sgpr_ptr_offset4294967296: @@ -1197,7 +1158,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4095: @@ -1233,7 +1193,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v3 -; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_cmpxchg_vgpr_ptr_offset4294967296: @@ -1271,7 +1230,6 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v0, v2 
-; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_cmpxchg_sgpr_ptr_vgpr_offset: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -199,7 +199,6 @@ ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_mov_b32 s33, s8 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -263,7 +262,6 @@ ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 ; GCN-NEXT: s_mov_b32 s33, s8 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -44,7 +44,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: zextload_global_i8_to_i64: @@ -52,7 +51,6 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i8_to_i64: @@ -63,7 +61,6 @@ ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i8, i8 addrspace(1)* %ptr %ext = zext i8 %load to i64 @@ -76,7 +73,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 
v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: zextload_global_i16_to_i64: @@ -84,7 +80,6 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i16_to_i64: @@ -95,7 +90,6 @@ ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i16, i16 addrspace(1)* %ptr %ext = zext i16 %load to i64 @@ -108,7 +102,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: zextload_global_i32_to_i64: @@ -116,7 +109,6 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i32_to_i64: @@ -127,7 +119,6 @@ ; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i64 @@ -141,7 +132,6 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: zextload_global_i32_to_i96: @@ -150,7 +140,6 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i32_to_i96: @@ -162,7 +151,6 @@ ; 
GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i96 @@ -177,7 +165,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: zextload_global_i32_to_i128: @@ -187,7 +174,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: zextload_global_i32_to_i128: @@ -200,7 +186,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, 0 -; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load i32, i32 addrspace(1)* %ptr %ext = zext i32 %load to i128 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_local.ll @@ -6,7 +6,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b32 v0, v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define i32 @atomic_load_monotonic_i32(i32 addrspace(3)* %ptr) { %load = load atomic i32, i32 addrspace(3)* %ptr monotonic, align 4 @@ -18,7 +17,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define i32 @atomic_load_monotonic_i32_offset(i32 addrspace(3)* %ptr) { %gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16 @@ -31,7 +29,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b64 v[0:1], v0{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define i64 
@atomic_load_monotonic_i64(i64 addrspace(3)* %ptr) { %load = load atomic i64, i64 addrspace(3)* %ptr monotonic, align 8 @@ -43,7 +40,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define i64 @atomic_load_monotonic_i64_offset(i64 addrspace(3)* %ptr) { %gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i32 16 @@ -56,7 +52,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define float @atomic_load_monotonic_f32_offset(float addrspace(3)* %ptr) { %gep = getelementptr inbounds float, float addrspace(3)* %ptr, i32 16 @@ -69,7 +64,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define double @atomic_load_monotonic_f64_offset(double addrspace(3)* %ptr) { %gep = getelementptr inbounds double, double addrspace(3)* %ptr, i32 16 @@ -82,7 +76,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define i8* @atomic_load_monotonic_p0i8_offset(i8* addrspace(3)* %ptr) { %gep = getelementptr inbounds i8*, i8* addrspace(3)* %ptr, i32 16 @@ -95,7 +88,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define i8 addrspace(3)* @atomic_load_monotonic_p3i8_offset(i8 addrspace(3)* addrspace(3)* %ptr) { %gep = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %ptr, i32 16 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_store_local.ll @@ -6,7 +6,6 @@ ; GFX9-NOT: 
s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_write_b32 v0, v1{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i32(i32 addrspace(3)* %ptr, i32 %val) { store atomic i32 %val, i32 addrspace(3)* %ptr monotonic, align 4 @@ -18,7 +17,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_write_b32 v0, v1 offset:64{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i32(i32 addrspace(3)* %ptr, i32 %val) { %gep = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 16 @@ -31,7 +29,6 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_write_b64 v0, v[1:2]{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_i64(i64 addrspace(3)* %ptr, i64 %val) { store atomic i64 %val, i64 addrspace(3)* %ptr monotonic, align 8 @@ -43,11 +40,9 @@ ; GFX9-NOT: s_mov_b32 m0 ; CI-NEXT: s_mov_b32 m0 ; GCN-NEXT: ds_write_b64 v0, v[1:2] offset:128{{$}} -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @atomic_store_monotonic_offset_i64(i64 addrspace(3)* %ptr, i64 %val) { %gep = getelementptr inbounds i64, i64 addrspace(3)* %ptr, i64 16 store atomic i64 %val, i64 addrspace(3)* %gep monotonic, align 8 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -630,7 +630,7 @@ ; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm: ; GCN: v_mov_b32_e32 v0, 42 ; GCN: s_swappc_b64 s[30:31], -; GCN-NOT: s_waitcnt +; GCN: s_waitcnt ; GCN: buffer_store_dword v0, off, s[36:39], 0 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { %val = call i32 @external_i32_func_i32(i32 42) diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll 
b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -10,6 +10,7 @@ ; GCN-NEXT: s_addc_u32 s35, s35, ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 s[30:31], s[34:35] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: #ASMSTART ; GCN-NEXT: #ASMEND @@ -30,6 +31,7 @@ ; GCN: v_writelane_b32 v40, s31, 3 ; GCN: s_swappc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 @@ -55,6 +57,7 @@ ; GCN: s_mov_b32 s33, s32 ; GCN: s_add_u32 s32, s32, 0x400 ; GCN: s_swappc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 ; GCN: v_readlane_b32 s33, v40, 4 @@ -106,6 +109,7 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: ; GCN: s_mov_b32 s33, s31 ; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 { %s31 = call i32 asm sideeffect "; def $0", "={s31}"() @@ -117,6 +121,7 @@ ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: ; GCN: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() @@ -163,6 +168,7 @@ ; GCN-NOT: s34 ; GCN: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NOT: s34 diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -66,6 +66,7 @@ ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 ; GCN-NEXT: global_store_dword v[0:1], v40, off @@ -89,6 +90,7 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+12 ; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NEXT: global_store_dword v[1:2], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -33,7 +33,6 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { %alloca = alloca i32, addrspace(5) @@ -53,7 +52,6 @@ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} ; GCN-NEXT: s_sub_u32 s32, s32, 0x200 ; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { %alloca = alloca i32, addrspace(5) @@ -66,7 +64,6 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_non_leaf() #2 { %alloca = alloca i32, addrspace(5) @@ -99,8 +96,6 @@ ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt vmcnt(0) - ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_and_call() #0 { %alloca = alloca i32, addrspace(5) @@ -135,7 +130,6 @@ ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword 
[[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_with_call() #0 { call void @external_void_func_void() @@ -161,7 +155,6 @@ ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 @@ -219,7 +212,6 @@ ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { %alloca = alloca i32, addrspace(5) @@ -242,7 +234,6 @@ ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: v_readlane_b32 s33, v1, 63 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @last_lane_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) @@ -276,7 +267,6 @@ ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @no_new_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) @@ -304,7 +294,6 @@ ; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 ; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 ; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { %alloca = alloca i32, align 8192, addrspace(5) @@ -326,7 +315,6 @@ ; GCN-NEXT: v_readlane_b32 s5, v1, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200 ; GCN-NEXT: v_readlane_b32 s33, v1, 2 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] define void @no_unused_non_csr_sgpr_for_fp() #1 { %alloca = 
alloca i32, addrspace(5) @@ -365,7 +353,6 @@ ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { %alloca = alloca i32, addrspace(5) @@ -412,7 +399,6 @@ ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #1 { %alloca = alloca i32, addrspace(5) @@ -436,7 +422,8 @@ } ; GCN-LABEL: {{^}}local_empty_func: -; GCN: s_waitcnt +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define internal void @local_empty_func() #0 { ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -6,7 +6,6 @@ ; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0 ; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_x() #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() @@ -19,7 +18,6 @@ ; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10 ; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_y() #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() @@ -32,7 +30,6 @@ ; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10 ; FIXEDABI: v_bfe_u32 
[[ID:v[0-9]+]], v31, 20, 10 ; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_z() #1 { %val = call i32 @llvm.amdgcn.workitem.id.z() @@ -50,7 +47,6 @@ ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xy() #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() @@ -75,7 +71,6 @@ ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xyz() #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() @@ -97,7 +92,6 @@ ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_xz() #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() @@ -117,7 +111,6 @@ ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]] ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]] -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @use_workitem_id_yz() #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.y() @@ -400,8 +393,7 @@ ; VARABI: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; VARABI: s_waitcnt -; VARABI-NEXT: s_setpc_b64 +; VARABI: s_setpc_b64 ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 ; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} @@ -708,8 +700,7 @@ ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] -; VARABI: s_waitcnt -; VARABI-NEXT: s_setpc_b64 +; VARABI: 
s_setpc_b64 ; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31 @@ -813,8 +804,7 @@ ; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10 ; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]] -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 +; GCN: s_setpc_b64 ; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -8,7 +8,6 @@ ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 @@ -29,7 +28,6 @@ ; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(5)* %base_lo @@ -68,7 +66,6 @@ ; GCN-NEXT: ds_read_u16 v0, v1 offset:2 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 @@ -89,7 +86,6 @@ ; GCN-NEXT: ds_read_u16 v0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_read_u16_d16_hi v0, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(3)* %base_lo @@ -112,7 +108,6 @@ ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(1)* null, 
i64 1 @@ -133,7 +128,6 @@ ; GCN-NEXT: global_load_ushort v0, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(1)* %base_lo @@ -156,7 +150,6 @@ ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2] -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half* null, i64 1 @@ -177,7 +170,6 @@ ; GCN-NEXT: flat_load_ushort v0, v[0:1] ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half* %base_lo diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -39,6 +39,7 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 @@ -46,7 +47,6 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call <2 x float> @func_v2f32() @@ -73,6 +73,7 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 
s32, s32, 0x400 @@ -80,7 +81,6 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call <3 x float> @func_v3f32() @@ -107,6 +107,7 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 @@ -114,7 +115,6 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call <4 x half> @func_v4f16() @@ -141,6 +141,7 @@ ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+12 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v40, 0 ; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: v_mov_b32_e32 v1, v4 @@ -149,7 +150,6 @@ ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] bb0: %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct() @@ -184,6 +184,7 @@ ; GCN-NEXT: s_add_u32 s4, s4, func_v3i16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3i16@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_branch BB4_3 ; GCN-NEXT: BB4_2: ; GCN-NEXT: s_mov_b32 s4, 0 @@ -229,6 +230,7 @@ ; GCN-NEXT: s_add_u32 s4, s4, func_v3f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f16@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; 
GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_branch BB5_3 ; GCN-NEXT: BB5_2: ; GCN-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -53,27 +53,15 @@ } define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind { -; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GCN-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 store i32 %lshr.8, i32 addrspace(1)* undef %masked = and i32 %lshr.8, 255 diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -128,7 +128,6 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1] -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_load_2xi16_align1: @@ -218,14 +217,12 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: flat_load_dword v0, v[0:1] -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: flat_load_dword v0, v[0:1] -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_load_2xi16_align4: diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -55,7 +55,6 @@ ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 ; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2: @@ -66,7 +65,6 @@ ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 ; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_store_2xi16_align2: @@ -76,7 +74,6 @@ ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = 
getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 2 @@ -111,7 +108,6 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align1: @@ -149,7 +145,6 @@ ; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1: @@ -157,7 +152,6 @@ ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_store_2xi16_align1: @@ -165,7 +159,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 1 @@ -186,14 +179,12 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align4: @@ 
-233,7 +224,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x20001 ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 store i16 1, i16 addrspace(5)* %r, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1020,7 +1020,6 @@ ; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen ; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen ; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i24: @@ -1063,7 +1062,6 @@ ; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen ; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen ; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i24: @@ -1101,7 +1099,6 @@ ; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX9-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v2i24: diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -753,7 +753,6 @@ ; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 ; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 ; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 { %arg0.0 = extractelement <3 x float> %arg0, i32 0 @@ -772,7 +771,6 @@ ; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 ; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 ; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 -; GCN-NEXT: s_waitcnt 
; GCN-NEXT: s_setpc_b64 define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 { %arg0.0 = extractelement <3 x i32> %arg0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -4,7 +4,6 @@ ; GCN-LABEL: {{^}}i1_func_void: ; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define i1 @i1_func_void() #0 { %val = load i1, i1 addrspace(1)* undef @@ -14,7 +13,6 @@ ; FIXME: Missing and? ; GCN-LABEL: {{^}}i1_zeroext_func_void: ; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define zeroext i1 @i1_zeroext_func_void() #0 { %val = load i1, i1 addrspace(1)* undef @@ -33,7 +31,6 @@ ; GCN-LABEL: {{^}}i8_func_void: ; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define i8 @i8_func_void() #0 { %val = load i8, i8 addrspace(1)* undef @@ -42,7 +39,6 @@ ; GCN-LABEL: {{^}}i8_zeroext_func_void: ; GCN: buffer_load_ubyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define zeroext i8 @i8_zeroext_func_void() #0 { %val = load i8, i8 addrspace(1)* undef @@ -51,7 +47,6 @@ ; GCN-LABEL: {{^}}i8_signext_func_void: ; GCN: buffer_load_sbyte v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define signext i8 @i8_signext_func_void() #0 { %val = load i8, i8 addrspace(1)* undef @@ -60,7 +55,6 @@ ; GCN-LABEL: {{^}}i16_func_void: ; GCN: buffer_load_ushort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define i16 @i16_func_void() #0 { %val = load i16, i16 addrspace(1)* undef @@ -69,7 +63,6 @@ ; GCN-LABEL: {{^}}i16_zeroext_func_void: ; GCN: buffer_load_ushort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define zeroext i16 @i16_zeroext_func_void() #0 { %val = load i16, i16 addrspace(1)* undef @@ -78,7 +71,6 @@ ; GCN-LABEL: {{^}}i16_signext_func_void: ; GCN: 
buffer_load_sshort v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define signext i16 @i16_signext_func_void() #0 { %val = load i16, i16 addrspace(1)* undef @@ -87,7 +79,6 @@ ; GCN-LABEL: {{^}}i32_func_void: ; GCN: buffer_load_dword v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define i32 @i32_func_void() #0 { %val = load i32, i32 addrspace(1)* undef @@ -97,7 +88,6 @@ ; GCN-LABEL: {{^}}i48_func_void: ; GCN: buffer_load_dword v0, off ; GCN-NEXT: buffer_load_ushort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define i48 @i48_func_void() #0 { %val = load i48, i48 addrspace(1)* undef, align 8 @@ -107,7 +97,6 @@ ; GCN-LABEL: {{^}}i48_zeroext_func_void: ; GCN: buffer_load_dword v0, off ; GCN-NEXT: buffer_load_ushort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define zeroext i48 @i48_zeroext_func_void() #0 { %val = load i48, i48 addrspace(1)* undef, align 8 @@ -117,7 +106,6 @@ ; GCN-LABEL: {{^}}i48_signext_func_void: ; GCN: buffer_load_dword v0, off ; GCN-NEXT: buffer_load_sshort v1, off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define signext i48 @i48_signext_func_void() #0 { %val = load i48, i48 addrspace(1)* undef, align 8 @@ -125,8 +113,6 @@ } ; GCN-LABEL: {{^}}i63_func_void: -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 define i63 @i63_func_void(i63 %val) #0 { ret i63 %val } @@ -154,7 +140,6 @@ ; GCN-LABEL: {{^}}i64_func_void: ; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define i64 @i64_func_void() #0 { %val = load i64, i64 addrspace(1)* undef @@ -164,7 +149,6 @@ ; GCN-LABEL: {{^}}i65_func_void: ; GCN-DAG: buffer_load_dwordx2 v[0:1], off ; GCN-DAG: buffer_load_ubyte v2, off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define i65 @i65_func_void() #0 { %val = load i65, i65 addrspace(1)* undef @@ -173,7 +157,6 @@ ; GCN-LABEL: {{^}}f32_func_void: ; GCN: buffer_load_dword v0, off, s[4:7], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: 
s_setpc_b64 define float @f32_func_void() #0 { %val = load float, float addrspace(1)* undef @@ -182,7 +165,6 @@ ; GCN-LABEL: {{^}}f64_func_void: ; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define double @f64_func_void() #0 { %val = load double, double addrspace(1)* undef @@ -191,7 +173,6 @@ ; GCN-LABEL: {{^}}v2f64_func_void: ; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <2 x double> @v2f64_func_void() #0 { %val = load <2 x double>, <2 x double> addrspace(1)* undef @@ -200,7 +181,6 @@ ; GCN-LABEL: {{^}}v2i32_func_void: ; GCN: buffer_load_dwordx2 v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <2 x i32> @v2i32_func_void() #0 { %val = load <2 x i32>, <2 x i32> addrspace(1)* undef @@ -209,7 +189,6 @@ ; GCN-LABEL: {{^}}v3i32_func_void: ; GCN: buffer_load_dwordx3 v[0:2], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <3 x i32> @v3i32_func_void() #0 { %val = load <3 x i32>, <3 x i32> addrspace(1)* undef @@ -218,7 +197,6 @@ ; GCN-LABEL: {{^}}v4i32_func_void: ; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <4 x i32> @v4i32_func_void() #0 { %val = load <4 x i32>, <4 x i32> addrspace(1)* undef @@ -228,7 +206,6 @@ ; GCN-LABEL: {{^}}v5i32_func_void: ; GCN-DAG: buffer_load_dword v4, off ; GCN-DAG: buffer_load_dwordx4 v[0:3], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <5 x i32> @v5i32_func_void() #0 { %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef @@ -238,7 +215,6 @@ ; GCN-LABEL: {{^}}v8i32_func_void: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <8 x i32> @v8i32_func_void() #0 { %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef @@ -251,7 +227,6 @@ ; GCN-DAG: buffer_load_dwordx4 v[4:7], off ; GCN-DAG: 
buffer_load_dwordx4 v[8:11], off ; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <16 x i32> @v16i32_func_void() #0 { %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef @@ -268,7 +243,6 @@ ; GCN-DAG: buffer_load_dwordx4 v[20:23], off ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <32 x i32> @v32i32_func_void() #0 { %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef @@ -278,7 +252,6 @@ ; GCN-LABEL: {{^}}v2i64_func_void: ; GCN: buffer_load_dwordx4 v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <2 x i64> @v2i64_func_void() #0 { %val = load <2 x i64>, <2 x i64> addrspace(1)* undef @@ -288,7 +261,6 @@ ; GCN-LABEL: {{^}}v3i64_func_void: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <3 x i64> @v3i64_func_void() #0 { %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef @@ -299,7 +271,6 @@ ; GCN-LABEL: {{^}}v4i64_func_void: ; GCN: buffer_load_dwordx4 v[0:3], off ; GCN: buffer_load_dwordx4 v[4:7], off -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <4 x i64> @v4i64_func_void() #0 { %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef @@ -311,7 +282,6 @@ ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off ; GCN-DAG: buffer_load_dwordx4 v[8:11], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <5 x i64> @v5i64_func_void() #0 { %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef @@ -324,7 +294,6 @@ ; GCN-DAG: buffer_load_dwordx4 v[4:7], off ; GCN-DAG: buffer_load_dwordx4 v[8:11], off ; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt 
vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <8 x i64> @v8i64_func_void() #0 { %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef @@ -341,7 +310,6 @@ ; GCN-DAG: buffer_load_dwordx4 v[20:23], off ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define <16 x i64> @v16i64_func_void() #0 { %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef @@ -351,7 +319,6 @@ ; GCN-LABEL: {{^}}v2i16_func_void: ; GFX9: buffer_load_dword v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <2 x i16> @v2i16_func_void() #0 { %val = load <2 x i16>, <2 x i16> addrspace(1)* undef @@ -360,7 +327,6 @@ ; GCN-LABEL: {{^}}v3i16_func_void: ; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <3 x i16> @v3i16_func_void() #0 { %val = load <3 x i16>, <3 x i16> addrspace(1)* undef @@ -369,7 +335,6 @@ ; GCN-LABEL: {{^}}v4i16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <4 x i16> @v4i16_func_void() #0 { %val = load <4 x i16>, <4 x i16> addrspace(1)* undef @@ -378,7 +343,6 @@ ; GCN-LABEL: {{^}}v4f16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <4 x half> @v4f16_func_void() #0 { %val = load <4 x half>, <4 x half> addrspace(1)* undef @@ -390,7 +354,6 @@ ; GCN-LABEL: {{^}}v5i16_func_void: ; GFX9: buffer_load_dwordx2 v[0:1] ; GFX9-NEXT: global_load_short_d16 v2 -; GFX9-NEXT: s_waitcnt ; GFX9-NEXT: s_setpc_b64 define <5 x i16> @v5i16_func_void() #0 { %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef @@ -400,7 +363,6 @@ ; GCN-LABEL: {{^}}v8i16_func_void: ; GFX9-DAG: buffer_load_dwordx4 v[0:3], off -; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <8 x i16> 
@v8i16_func_void() #0 { %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef @@ -411,7 +373,6 @@ ; GCN-LABEL: {{^}}v16i16_func_void: ; GFX9: buffer_load_dwordx4 v[0:3], off ; GFX9: buffer_load_dwordx4 v[4:7], off -; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <16 x i16> @v16i16_func_void() #0 { %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef @@ -447,7 +408,6 @@ ; GCN-LABEL: {{^}}struct_i8_i32_func_void: ; GCN-DAG: buffer_load_dword v1 ; GCN-DAG: buffer_load_ubyte v0 -; GCN: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define {i8, i32} @struct_i8_i32_func_void() #0 { %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef @@ -507,7 +467,6 @@ ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef @@ -549,7 +508,6 @@ ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} -; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef @@ -591,7 +549,6 @@ ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}} ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}} ; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}} -; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } 
@struct_i32_v32i32_func_void() #0 { %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll @@ -10,7 +10,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc ; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -25,7 +24,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc ; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -2651,7 +2651,6 @@ ; GCN-LABEL: global_inc_saddr_i32_rtn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2665,7 +2664,6 @@ ; GCN-LABEL: global_inc_saddr_i32_rtn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] offset:-128 glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2705,7 +2703,6 @@ ; GCN-LABEL: global_inc_saddr_i64_rtn: ; GCN: 
; %bb.0: ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2719,7 +2716,6 @@ ; GCN-LABEL: global_inc_saddr_i64_rtn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2766,7 +2762,6 @@ ; GCN-LABEL: global_dec_saddr_i32_rtn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2780,7 +2775,6 @@ ; GCN-LABEL: global_dec_saddr_i32_rtn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] offset:-128 glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2820,7 +2814,6 @@ ; GCN-LABEL: global_dec_saddr_i64_rtn: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -2834,7 +2827,6 @@ ; GCN-LABEL: global_dec_saddr_i64_rtn_neg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset diff --git 
a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -13,7 +13,6 @@ ; GCN-LABEL: global_load_saddr_i8_zext_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -28,7 +27,6 @@ ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: @@ -38,7 +36,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -59,7 +56,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: @@ -69,7 +65,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x1000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -85,7 +80,6 @@ ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] 
offset:-4096 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: @@ -95,7 +89,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -116,7 +109,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: @@ -126,7 +118,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -142,7 +133,6 @@ ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -158,7 +148,6 @@ ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2048 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: @@ -168,7 +157,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; 
GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -184,7 +172,6 @@ ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -200,7 +187,6 @@ ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: @@ -210,7 +196,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -226,7 +211,6 @@ ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: @@ -236,7 +220,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 @@ -252,7 +235,6 @@ ; GCN-LABEL: 
global_load_saddr_i8_zext_vgpr_ptrtoint: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 @@ -269,7 +251,6 @@ ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 @@ -286,7 +267,6 @@ ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 @@ -304,7 +284,6 @@ ; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 @@ -334,7 +313,6 @@ ; GCN-NEXT: v_readfirstlane_b32 s1, v2 ; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -356,7 +334,6 @@ ; GCN-NEXT: v_readfirstlane_b32 s1, v2 ; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 @@ -374,7 +351,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: 
global_load_ubyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -390,7 +366,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -407,7 +382,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 @@ -425,7 +399,6 @@ ; GCN: ; %bb.0: ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 @@ -445,7 +418,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32: @@ -453,7 +425,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, v0, s2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset @@ -470,7 +441,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; 
return to shader part epilog ; ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: @@ -480,7 +450,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset @@ -507,7 +476,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_f32_natural_addressing: @@ -519,7 +487,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 @@ -535,7 +502,6 @@ ; GCN-NEXT: global_load_dword v0, v[0:1], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 @@ -554,7 +520,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 %zext.offset = zext i32 %voffset to i64 @@ -571,7 +536,6 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %voffset = load i32, i32 
addrspace(1)* %voffset.ptr, !range !0 %zext.offset = zext i32 %voffset to i64 @@ -593,7 +557,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: @@ -605,7 +568,6 @@ ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1 %zext.offset = zext i32 %voffset to i64 @@ -622,7 +584,6 @@ ; GCN-LABEL: global_load_saddr_i16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -636,7 +597,6 @@ ; GCN-LABEL: global_load_saddr_i16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -651,7 +611,6 @@ ; GCN-LABEL: global_load_saddr_f16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -664,7 +623,6 @@ ; GCN-LABEL: global_load_saddr_f16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* 
%sbase, i64 %zext.offset @@ -678,7 +636,6 @@ ; GCN-LABEL: global_load_saddr_i32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -692,7 +649,6 @@ ; GCN-LABEL: global_load_saddr_i32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -707,7 +663,6 @@ ; GCN-LABEL: global_load_saddr_f32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -720,7 +675,6 @@ ; GCN-LABEL: global_load_saddr_f32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -734,7 +688,6 @@ ; GCN-LABEL: global_load_saddr_v2i16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -748,7 +701,6 @@ ; GCN-LABEL: global_load_saddr_v2i16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -763,7 +715,6 @@ ; GCN-LABEL: 
global_load_saddr_v2f16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -776,7 +727,6 @@ ; GCN-LABEL: global_load_saddr_v2f16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -790,7 +740,6 @@ ; GCN-LABEL: global_load_saddr_p3: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -805,7 +754,6 @@ ; GCN-LABEL: global_load_saddr_p3_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -821,7 +769,6 @@ ; GCN-LABEL: global_load_saddr_f64: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -835,7 +782,6 @@ ; GCN-LABEL: global_load_saddr_f64_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -850,7 +796,6 @@ ; GCN-LABEL: global_load_saddr_i64: ; GCN: ; %bb.0: ; GCN-NEXT: 
global_load_dwordx2 v[0:1], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -864,7 +809,6 @@ ; GCN-LABEL: global_load_saddr_i64_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -879,7 +823,6 @@ ; GCN-LABEL: global_load_saddr_v2f32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -892,7 +835,6 @@ ; GCN-LABEL: global_load_saddr_v2f32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -906,7 +848,6 @@ ; GCN-LABEL: global_load_saddr_v2i32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -920,7 +861,6 @@ ; GCN-LABEL: global_load_saddr_v2i32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -935,7 +875,6 @@ ; GCN-LABEL: global_load_saddr_v4i16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, 
s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -949,7 +888,6 @@ ; GCN-LABEL: global_load_saddr_v4i16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -964,7 +902,6 @@ ; GCN-LABEL: global_load_saddr_v4f16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -978,7 +915,6 @@ ; GCN-LABEL: global_load_saddr_v4f16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -993,7 +929,6 @@ ; GCN-LABEL: global_load_saddr_p1: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1008,7 +943,6 @@ ; GCN-LABEL: global_load_saddr_p1_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1024,7 +958,6 @@ ; GCN-LABEL: global_load_saddr_v3f32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] -; GCN-NEXT: s_waitcnt 
vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1037,7 +970,6 @@ ; GCN-LABEL: global_load_saddr_v3f32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1051,7 +983,6 @@ ; GCN-LABEL: global_load_saddr_v3i32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1065,7 +996,6 @@ ; GCN-LABEL: global_load_saddr_v3i32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1080,7 +1010,6 @@ ; GCN-LABEL: global_load_saddr_v6f16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1093,7 +1022,6 @@ ; GCN-LABEL: global_load_saddr_v6f16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1107,7 +1035,6 @@ ; GCN-LABEL: global_load_saddr_v4f32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; 
return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1120,7 +1047,6 @@ ; GCN-LABEL: global_load_saddr_v4f32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1134,7 +1060,6 @@ ; GCN-LABEL: global_load_saddr_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1148,7 +1073,6 @@ ; GCN-LABEL: global_load_saddr_v4i32_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1163,7 +1087,6 @@ ; GCN-LABEL: global_load_saddr_v2i64: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1177,7 +1100,6 @@ ; GCN-LABEL: global_load_saddr_v2i64_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1192,7 +1114,6 @@ ; GCN-LABEL: global_load_saddr_i128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader 
part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1206,7 +1127,6 @@ ; GCN-LABEL: global_load_saddr_i128_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1221,7 +1141,6 @@ ; GCN-LABEL: global_load_saddr_v2p1: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1236,7 +1155,6 @@ ; GCN-LABEL: global_load_saddr_v2p1_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1252,7 +1170,6 @@ ; GCN-LABEL: global_load_saddr_v4p3: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1267,7 +1184,6 @@ ; GCN-LABEL: global_load_saddr_v4p3_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1287,7 +1203,6 @@ ; GCN-LABEL: global_sextload_saddr_i8: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1301,7 +1216,6 @@ ; GCN-LABEL: global_sextload_saddr_i8_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1316,7 +1230,6 @@ ; GCN-LABEL: global_sextload_saddr_i16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1331,7 +1244,6 @@ ; GCN-LABEL: global_sextload_saddr_i16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1347,7 +1259,6 @@ ; GCN-LABEL: global_zextload_saddr_i8: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1361,7 +1272,6 @@ ; GCN-LABEL: global_zextload_saddr_i8_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1376,7 +1286,6 @@ ; GCN-LABEL: global_zextload_saddr_i16: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = 
getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1391,7 +1300,6 @@ ; GCN-LABEL: global_zextload_saddr_i16_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1521,7 +1429,6 @@ ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1536,7 +1443,6 @@ ; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1694,7 +1600,6 @@ ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -1709,7 +1614,6 @@ ; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: ; GCN: ; %bb.0: ; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/hsa-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-func.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-func.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-func.ll @@ -27,7 +27,7 @@ ; ELF: Symbol { ; ELF: Name: 
simple -; ELF: Size: 36 +; ELF: Size: 32 ; ELF: Type: Function (0x2) ; ELF: } diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -1405,7 +1405,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_0.5_i16: @@ -1413,7 +1412,6 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x3800, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x38,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_0.5_i16: @@ -1426,7 +1424,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0x3800, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half 0.5 to i16) store i16 %y, i16 addrspace(1)* %out @@ -1441,7 +1438,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_neg_0.5_i16: @@ -1449,7 +1445,6 @@ ; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0xb800, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xb8,0xff,0xff] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_neg_0.5_i16: @@ -1462,7 +1457,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0xb800, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half -0.5 to i16) store i16 %y, i16 addrspace(1)* %out @@ -1477,7 +1471,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_1.0_i16: @@ -1485,7 +1478,6 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x3c00, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x3c,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_1.0_i16: @@ -1498,7 +1490,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0x3c00, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half 1.0 to i16) store i16 %y, i16 addrspace(1)* 
%out @@ -1513,7 +1504,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_neg_1.0_i16: @@ -1521,7 +1511,6 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0xbc00, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xbc,0xff,0xff] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_neg_1.0_i16: @@ -1534,7 +1523,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0xbc00, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half -1.0 to i16) store i16 %y, i16 addrspace(1)* %out @@ -1549,7 +1537,6 @@ ; GFX10-NEXT: v_lshlrev_b16_e64 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: shl_inline_imm_2.0_i16: @@ -1558,7 +1545,6 @@ ; VI-NEXT: s_movk_i32 s4, 0x4000 ; encoding: [0x00,0x40,0x04,0xb0] ; VI-NEXT: v_lshlrev_b16_e64 v2, v2, s4 ; encoding: [0x02,0x00,0x2a,0xd1,0x02,0x09,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: 
[0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: shl_inline_imm_2.0_i16: @@ -1571,7 +1557,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_lshl_b32_e32 v2, 0x4000, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = shl i16 bitcast (half 2.0 to i16), %x store i16 %y, i16 addrspace(1)* %out @@ -1586,7 +1571,6 @@ ; GFX10-NEXT: v_lshlrev_b16_e64 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: shl_inline_imm_neg_2.0_i16: @@ -1595,7 +1579,6 @@ ; VI-NEXT: s_movk_i32 s4, 0xc000 ; encoding: [0x00,0xc0,0x04,0xb0] ; VI-NEXT: v_lshlrev_b16_e64 v2, v2, s4 ; encoding: [0x02,0x00,0x2a,0xd1,0x02,0x09,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: shl_inline_imm_neg_2.0_i16: @@ -1608,7 +1591,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_lshl_b32_e32 v2, 0xffffc000, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = shl i16 bitcast (half -2.0 to i16), %x store i16 %y, i16 addrspace(1)* %out @@ -1623,7 +1605,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; 
encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_4.0_i16: @@ -1631,7 +1612,6 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x4400, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0x44,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_4.0_i16: @@ -1644,7 +1624,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0x4400, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half 4.0 to i16) store i16 %y, i16 addrspace(1)* %out @@ -1659,7 +1638,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_neg_4.0_i16: @@ -1667,7 +1645,6 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0xc400, v2 ; encoding: [0xff,0x04,0x04,0x52,0x00,0xc4,0xff,0xff] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_neg_4.0_i16: @@ -1680,7 +1657,6 @@ ; SI-NEXT: 
s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0xc400, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half -4.0 to i16) store i16 %y, i16 addrspace(1)* %out @@ -1695,7 +1671,6 @@ ; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe] ; ; VI-LABEL: mul_inline_imm_inv2pi_i16: @@ -1703,7 +1678,6 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf] ; VI-NEXT: v_mul_lo_u16_e32 v2, 0x3118, v2 ; encoding: [0xff,0x04,0x04,0x52,0x18,0x31,0x00,0x00] ; VI-NEXT: flat_store_short v[0:1], v2 ; encoding: [0x00,0x00,0x68,0xdc,0x00,0x02,0x00,0x00] -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; encoding: [0x70,0x00,0x8c,0xbf] ; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe] ; ; SI-LABEL: mul_inline_imm_inv2pi_i16: @@ -1716,7 +1690,6 @@ ; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_mul_u32_u24_e32 v2, 0x3118, v2 ; SI-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] %y = mul i16 %x, bitcast (half 0xH3118 to i16) store i16 %y, i16 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll b/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll --- a/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-uniform-load-shader.ll @@ -8,7 +8,6 @@ ; GCN-LABEL: ps_load_uniform_global_i32_align4: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog %load = load i32, i32 
addrspace(1)* %ptr, align 4 ret i32 %load @@ -18,7 +17,6 @@ ; GCN-LABEL: cs_load_uniform_global_i32_align4: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[2:3], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ; return to shader part epilog %load = load i32, i32 addrspace(1)* %ptr, align 4 ret i32 %load diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -271,7 +271,6 @@ ; GCN-NEXT: s_mov_b32 s10, s12 ; GCN-NEXT: s_mov_b32 s11, s12 ; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -17,7 +17,6 @@ ; GFX8-NEXT: ds_write_b32 v0, v0 ; GFX8-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX8-NEXT: s_trap 2 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: func_use_lds_global: @@ -27,7 +26,6 @@ ; GFX9-NEXT: ds_write_b32 v0, v0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[4:5] ; GFX9-NEXT: s_trap 2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] store float 0.0, float addrspace(3)* @lds, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll @@ -4,7 +4,6 @@ ;CHECK-LABEL: {{^}}buffer_load_format_immoffs_x3: ;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 ;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42 
-;CHECK: s_waitcnt define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) { main_body: %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) @@ -14,7 +13,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs_x3: ;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 -;CHECK: s_waitcnt define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) { main_body: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) @@ -24,7 +22,6 @@ ;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3: ;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40 -;CHECK: s_waitcnt define amdgpu_ps <3 x float> @buffer_raw_load_immoffs_x3(<4 x i32> inreg) { main_body: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %0, i32 40, i32 0, i32 0) @@ -34,7 +31,6 @@ ;CHECK-LABEL: {{^}}buffer_struct_load_format_immoffs_x3: ;SI: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 ;GCNX3: buffer_load_format_xyz v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 -;CHECK: s_waitcnt define amdgpu_ps <3 x float> @buffer_struct_load_format_immoffs_x3(<4 x i32> inreg) { main_body: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) @@ -44,7 +40,6 @@ ;CHECK-LABEL: {{^}}struct_buffer_load_immoffs_x3: ;SI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 ;GCNX3: buffer_load_dwordx3 v[0:2], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 -;CHECK: s_waitcnt define amdgpu_ps <3 x float> @struct_buffer_load_immoffs_x3(<4 x i32> inreg) { main_body: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) @@ -57,4 +52,3 @@ declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0 declare <3 x float> 
@llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0 declare <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32>, i32, i32, i32, i32) #0 - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -5,7 +5,6 @@ ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 ;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) @@ -19,7 +18,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) @@ -36,7 +34,6 @@ ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen ;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc ;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0) @@ -52,7 +49,6 @@ ;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68 ;VI-NOT: s_mov ;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84 -;VI: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, 
i32 0, i32 4160, i1 0, i1 0) @@ -63,7 +59,6 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) @@ -72,7 +67,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) @@ -81,7 +75,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 60 @@ -91,7 +84,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) @@ -101,7 +93,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both_reversed: ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) @@ -110,7 +101,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, off, s[0:3], 0 -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { main_body: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) @@ -119,7 +109,6 @@ ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: 
buffer_load_format_xy v[0:1], off, s[0:3], 0 -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -5,7 +5,6 @@ ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) @@ -19,7 +18,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0) @@ -30,7 +28,6 @@ ;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen ;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) @@ -39,7 +36,6 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) @@ -48,7 +44,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 
offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) @@ -57,7 +52,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 60 @@ -67,7 +61,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) @@ -77,7 +70,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both_reversed: ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) @@ -86,7 +78,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { main_body: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) @@ -95,7 +86,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) @@ -130,7 +120,6 @@ ;CHECK-NEXT: %bb. 
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -155,7 +144,6 @@ ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -178,7 +166,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -196,7 +183,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged: ;CHECK-NEXT: %bb. ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -213,7 +199,6 @@ ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { main_body: %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) @@ -230,7 +215,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged: ;CHECK-NEXT: %bb. 
;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { main_body: %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) @@ -246,7 +230,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged: ;CHECK-NEXT: %bb. ;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) { main_body: %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0) @@ -315,7 +298,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) { main_body: @@ -328,7 +310,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) { main_body: @@ -341,7 +322,6 @@ ;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) { main_body: @@ -354,7 +334,6 @@ ;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast: ;CHECK-NEXT: %bb. 
;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-NEXT: s_waitcnt vmcnt(0) ;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll @@ -50,7 +50,6 @@ ; GCN: s_mov_b32 m0, s0 ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define amdgpu_cs float @ds_ordered_add_cs(i32 addrspace(2)* inreg %gds) { %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float @@ -62,7 +61,6 @@ ; GCN: s_mov_b32 m0, 0{{$}} ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define float @ds_ordered_add_default_cc() { %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float @@ -74,7 +72,6 @@ ; GCN: s_mov_b32 m0, 0{{$}} ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define fastcc float @ds_ordered_add_fastcc() { %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float @@ -86,7 +83,6 @@ ; GCN: s_mov_b32 m0, 0{{$}} ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define float @ds_ordered_add_func() { %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* null, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = 
bitcast i32 %val to float @@ -98,7 +94,6 @@ ; GCN: s_mov_b32 m0, s0 ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:1796 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define amdgpu_ps float @ds_ordered_add_ps(i32 addrspace(2)* inreg %gds) { %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float @@ -110,7 +105,6 @@ ; GCN: s_mov_b32 m0, s0 ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:2820 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define amdgpu_vs float @ds_ordered_add_vs(i32 addrspace(2)* inreg %gds) { %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float @@ -122,7 +116,6 @@ ; GCN: s_mov_b32 m0, s0 ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:3844 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define amdgpu_gs float @ds_ordered_add_gs(i32 addrspace(2)* inreg %gds) { %val = call i32@llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll @@ -7,7 +7,6 @@ ; GCN: s_mov_b32 m0, s0 ; VIGFX9-NEXT: s_nop 0 ; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds -; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0) define amdgpu_cs float @ds_ordered_swap(i32 addrspace(2)* inreg %gds, i32 %value) { %val = call i32@llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true) %r = bitcast i32 %val to float @@ -26,7 +25,6 @@ ; // Wait for expcnt(0) before modifying EXEC ; GCN-NEXT: s_waitcnt 
expcnt(0) ; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]] -; GCN-NEXT: s_waitcnt lgkmcnt(0) define amdgpu_cs float @ds_ordered_swap_conditional(i32 addrspace(2)* inreg %gds, i32 %value) { entry: %c = icmp ne i32 %value, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll @@ -6,14 +6,12 @@ ; GFX9-LABEL: load_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -25,14 +23,12 @@ ; GFX9-LABEL: load_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -45,14 +41,12 @@ ; GFX9-LABEL: load_3d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: 
%s = extractelement <2 x i16> %coords_lo, i32 0 @@ -66,14 +60,12 @@ ; GFX9-LABEL: load_cube: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -87,14 +79,12 @@ ; GFX9-LABEL: load_1darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -107,14 +97,12 @@ ; GFX9-LABEL: load_2darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -128,14 +116,12 @@ ; GFX9-LABEL: load_2dmsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2dmsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: 
image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -149,14 +135,12 @@ ; GFX9-LABEL: load_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -171,14 +155,12 @@ ; GFX9-LABEL: load_mip_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -191,14 +173,12 @@ ; GFX9-LABEL: load_mip_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -212,14 +192,12 @@ ; GFX9-LABEL: load_mip_3d: ; GFX9: ; %bb.0: ; 
%main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -234,14 +212,12 @@ ; GFX9-LABEL: load_mip_cube: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -256,14 +232,12 @@ ; GFX9-LABEL: load_mip_1darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -277,14 +251,12 @@ ; GFX9-LABEL: load_mip_2darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -564,14 +536,12 @@ ; GFX9-LABEL: getresinfo_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -583,14 +553,12 @@ ; GFX9-LABEL: getresinfo_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -602,14 +570,12 @@ ; GFX9-LABEL: getresinfo_3d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -621,14 +587,12 @@ ; GFX9-LABEL: getresinfo_cube: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo 
v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -640,14 +604,12 @@ ; GFX9-LABEL: getresinfo_1darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -659,14 +621,12 @@ ; GFX9-LABEL: getresinfo_2darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -678,14 +638,12 @@ ; GFX9-LABEL: getresinfo_2dmsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2dmsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; 
GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -697,14 +655,12 @@ ; GFX9-LABEL: getresinfo_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -716,14 +672,12 @@ ; GFX9-LABEL: load_1d_V1: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -735,14 +689,12 @@ ; GFX9-LABEL: load_1d_V2: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -788,14 +740,12 @@ ; GFX9-LABEL: load_1d_glc: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) 
; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_glc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -807,14 +757,12 @@ ; GFX9-LABEL: load_1d_slc: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -826,14 +774,12 @@ ; GFX9-LABEL: load_1d_glc_slc: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_glc_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll @@ -6,14 +6,12 @@ ; GFX9-LABEL: load_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; 
return to shader part epilog ; ; GFX10-LABEL: load_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -25,14 +23,12 @@ ; GFX9-LABEL: load_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -45,14 +41,12 @@ ; GFX9-LABEL: load_3d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -66,14 +60,12 @@ ; GFX9-LABEL: load_cube: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; 
encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -87,14 +79,12 @@ ; GFX9-LABEL: load_1darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -107,14 +97,12 @@ ; GFX9-LABEL: load_2darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = 
extractelement <2 x i16> %coords_lo, i32 0 @@ -128,14 +116,12 @@ ; GFX9-LABEL: load_2dmsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2dmsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x30,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -149,14 +135,12 @@ ; GFX9-LABEL: load_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x38,0x1f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -171,14 +155,12 @@ ; GFX9-LABEL: load_mip_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: 
[0x00,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -191,14 +173,12 @@ ; GFX9-LABEL: load_mip_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -212,14 +192,12 @@ ; GFX9-LABEL: load_mip_3d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -234,14 +212,12 @@ ; GFX9-LABEL: load_mip_cube: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: 
; return to shader part epilog ; ; GFX10-LABEL: load_mip_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -256,14 +232,12 @@ ; GFX9-LABEL: load_mip_1darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -277,14 +251,12 @@ ; GFX9-LABEL: load_mip_2darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x04,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x04,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords_lo, i32 0 @@ -564,14 +536,12 @@ ; GFX9-LABEL: 
getresinfo_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -583,14 +553,12 @@ ; GFX9-LABEL: getresinfo_2d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x08,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -602,14 +570,12 @@ ; GFX9-LABEL: getresinfo_3d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x10,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -621,14 +587,12 @@ ; GFX9-LABEL: getresinfo_cube: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x18,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -640,14 +604,12 @@ ; GFX9-LABEL: getresinfo_1darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x20,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -659,14 +621,12 @@ ; GFX9-LABEL: getresinfo_2darray: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; 
GFX10-LABEL: getresinfo_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x28,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -678,14 +638,12 @@ ; GFX9-LABEL: getresinfo_2dmsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2dmsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x30,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -697,14 +655,12 @@ ; GFX9-LABEL: getresinfo_2darraymsaa: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da ; encoding: [0x00,0xdf,0x38,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x38,0x1f,0x38,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %mip = extractelement <2 x i16> %coords, i32 0 @@ -716,14 +672,12 @@ ; GFX9-LABEL: load_1d_V1: ; GFX9: ; %bb.0: ; 
%main_body ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm a16 ; encoding: [0x00,0x98,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x18,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -735,14 +689,12 @@ ; GFX9-LABEL: load_1d_V2: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm a16 ; encoding: [0x00,0x99,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x00,0x19,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -788,14 +740,12 @@ ; GFX9-LABEL: load_1d_glc: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc a16 ; encoding: [0x00,0xbf,0x00,0xf0,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_glc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x00,0x3f,0x00,0xf0,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader 
part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -807,14 +757,12 @@ ; GFX9-LABEL: load_1d_slc: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc a16 ; encoding: [0x00,0x9f,0x00,0xf2,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x00,0x1f,0x00,0xf2,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 @@ -826,14 +774,12 @@ ; GFX9-LABEL: load_1d_glc_slc: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc a16 ; encoding: [0x00,0xbf,0x00,0xf2,0x00,0x00,0x00,0x00] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_glc_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x00,0x3f,0x00,0xf2,0x00,0x00,0x00,0x40] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %s = extractelement <2 x i16> %coords, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -9,32 +9,27 @@ ; VERDE-LABEL: load_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d: ; FIJI: ; 
%bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -55,7 +50,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_tfe: @@ -71,7 +65,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_tfe: @@ -87,7 +80,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_tfe: @@ -98,7 +90,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
load_1d_tfe: @@ -115,7 +106,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) @@ -139,7 +129,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_lwe: @@ -155,7 +144,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_lwe: @@ -171,7 +159,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_lwe: @@ -182,7 +169,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_lwe: @@ -199,7 +185,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x 
i32> %rsrc, i32 2, i32 0) @@ -213,32 +198,27 @@ ; VERDE-LABEL: load_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -260,7 +240,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2d_tfe: @@ -277,7 +256,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2d_tfe: @@ -294,7 +272,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) 
; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2d_tfe: @@ -305,7 +282,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2d_tfe: @@ -323,7 +299,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) @@ -337,32 +312,27 @@ ; VERDE-LABEL: load_3d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_3d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_3d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_3d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part 
epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -385,7 +355,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_3d_tfe_lwe: @@ -403,7 +372,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_3d_tfe_lwe: @@ -421,7 +389,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_3d_tfe_lwe: @@ -432,7 +399,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_3d_tfe_lwe: @@ -451,7 +417,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) @@ -465,32 +430,27 @@ ; VERDE-LABEL: load_cube: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_cube: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], 
v[0:2], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_cube: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_cube: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -513,7 +473,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_cube_lwe: @@ -531,7 +490,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_cube_lwe: @@ -549,7 +507,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_cube_lwe: @@ -560,7 +517,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; 
; GFX10-LABEL: load_cube_lwe: @@ -579,7 +535,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) @@ -593,32 +548,27 @@ ; VERDE-LABEL: load_1darray: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1darray: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1darray: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1darray: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -640,7 +590,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; 
VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1darray_tfe: @@ -657,7 +606,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1darray_tfe: @@ -674,7 +622,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1darray_tfe: @@ -685,7 +632,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1darray_tfe: @@ -703,7 +649,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0) @@ -717,32 +662,27 @@ ; VERDE-LABEL: load_2darray: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2darray: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2darray: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:2], 
s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2darray: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -765,7 +705,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2darray_lwe: @@ -783,7 +722,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2darray_lwe: @@ -801,7 +739,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2darray_lwe: @@ -812,7 +749,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darray_lwe: @@ -831,7 +767,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; 
GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) @@ -845,32 +780,27 @@ ; VERDE-LABEL: load_2dmsaa: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2dmsaa: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2dmsaa: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2dmsaa: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2dmsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x30,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -893,7 +823,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2dmsaa_both: @@ -911,7 +840,6 @@ ; 
FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2dmsaa_both: @@ -929,7 +857,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2dmsaa_both: @@ -940,7 +867,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2dmsaa_both: @@ -959,7 +885,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) @@ -973,32 +898,27 @@ ; VERDE-LABEL: load_2darraymsaa: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2darraymsaa: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2darraymsaa: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2darraymsaa: ; NOPRT: ; %bb.0: ; %main_body ; 
NOPRT-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x38,0x1f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -1022,7 +942,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_2darraymsaa_tfe: @@ -1041,7 +960,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_2darraymsaa_tfe: @@ -1060,7 +978,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_2darraymsaa_tfe: @@ -1071,7 +988,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_2darraymsaa_tfe: @@ -1091,7 +1007,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -1105,32 +1020,27 @@ ; VERDE-LABEL: load_mip_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_1d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_1d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -1152,7 +1062,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_1d_lwe: @@ -1169,7 +1078,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, 
off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_1d_lwe: @@ -1186,7 +1094,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_1d_lwe: @@ -1197,7 +1104,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_1d_lwe: @@ -1215,7 +1121,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0) @@ -1229,32 +1134,27 @@ ; VERDE-LABEL: load_mip_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_2d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_2d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; 
return to shader part epilog ; ; GFX10-LABEL: load_mip_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -1277,7 +1177,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_2d_tfe: @@ -1295,7 +1194,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v4, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_2d_tfe: @@ -1313,7 +1211,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v6, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[5:6], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_2d_tfe: @@ -1324,7 +1221,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v6, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[5:6], v4, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2d_tfe: @@ -1343,7 +1239,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} 
@llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) @@ -1678,7 +1573,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v3, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_tfe_V4_dmask3: @@ -1693,7 +1587,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v3, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_tfe_V4_dmask3: @@ -1708,7 +1601,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v5, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[4:5], v3, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_tfe_V4_dmask3: @@ -1719,7 +1611,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v5, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[4:5], v3, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V4_dmask3: @@ -1735,7 +1626,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v4, s8 ; encoding: [0x08,0x02,0x08,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[4:5], v3, off ; encoding: [0x00,0x80,0x70,0xdc,0x04,0x03,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) @@ -1757,7 +1647,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v2, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_tfe_V4_dmask2: @@ -1771,7 +1660,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: 
s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v2, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_tfe_V4_dmask2: @@ -1785,7 +1673,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v4, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[3:4], v2, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_tfe_V4_dmask2: @@ -1796,7 +1683,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v4, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[3:4], v2, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V4_dmask2: @@ -1811,7 +1697,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s8 ; encoding: [0x08,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[3:4], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x03,0x02,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) @@ -1832,7 +1717,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_tfe_V4_dmask1: @@ -1845,7 +1729,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_tfe_V4_dmask1: @@ -1858,7 +1741,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v3, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[2:3], v1, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: 
load_1d_tfe_V4_dmask1: @@ -1869,7 +1751,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v3, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[2:3], v1, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V4_dmask1: @@ -1883,7 +1764,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) @@ -1904,7 +1784,6 @@ ; VERDE-NEXT: s_mov_b32 s10, -1 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_tfe_V2_dmask1: @@ -1917,7 +1796,6 @@ ; FIJI-NEXT: s_mov_b32 s10, -1 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_tfe_V2_dmask1: @@ -1930,7 +1808,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v3, s9 ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[2:3], v1, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_tfe_V2_dmask1: @@ -1941,7 +1818,6 @@ ; NOPRT-NEXT: v_mov_b32_e32 v3, s9 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: global_store_dword v[2:3], v1, off -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_tfe_V2_dmask1: @@ -1955,7 +1831,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, s8 ; encoding: [0x08,0x02,0x04,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; 
GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) @@ -1970,32 +1845,27 @@ ; VERDE-LABEL: load_mip_3d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_3d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_3d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_3d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2006,32 +1876,27 @@ ; VERDE-LABEL: load_mip_cube: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_cube: ; FIJI: ; %bb.0: ; 
%main_body ; FIJI-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_cube: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_cube: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2042,32 +1907,27 @@ ; VERDE-LABEL: load_mip_1darray: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_1darray: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_1darray: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_1darray: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; 
GFX10-LABEL: load_mip_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2078,32 +1938,27 @@ ; VERDE-LABEL: load_mip_2darray: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_mip_2darray: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_mip_2darray: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_mip_2darray: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_mip_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x04,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2548,32 +2403,27 @@ ; VERDE-LABEL: getresinfo_1d: ; VERDE: ; %bb.0: ; 
%main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_1d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_1d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2584,32 +2434,27 @@ ; VERDE-LABEL: getresinfo_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_2d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_2d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: 
image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2620,32 +2465,27 @@ ; VERDE-LABEL: getresinfo_3d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_3d: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_3d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_3d: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_3d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; encoding: [0x10,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2656,32 +2496,27 @@ ; VERDE-LABEL: 
getresinfo_cube: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_cube: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_cube: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_cube: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_cube: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; encoding: [0x18,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2692,32 +2527,27 @@ ; VERDE-LABEL: getresinfo_1darray: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_1darray: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_1darray: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part 
epilog ; ; NOPRT-LABEL: getresinfo_1darray: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_1darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; encoding: [0x20,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2728,32 +2558,27 @@ ; VERDE-LABEL: getresinfo_2darray: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_2darray: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_2darray: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_2darray: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darray: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; encoding: [0x28,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call 
<4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2764,32 +2589,27 @@ ; VERDE-LABEL: getresinfo_2dmsaa: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_2dmsaa: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_2dmsaa: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_2dmsaa: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2dmsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x30,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2800,32 +2620,27 @@ ; VERDE-LABEL: getresinfo_2darraymsaa: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: getresinfo_2darraymsaa: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: getresinfo_2darraymsaa: ; GFX6789: ; 
%bb.0: ; %main_body ; GFX6789-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: getresinfo_2darraymsaa: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: getresinfo_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x38,0x1f,0x38,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -2836,32 +2651,27 @@ ; VERDE-LABEL: load_1d_V1: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_V1: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_V1: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_V1: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v0, v0, s[0:7] dmask:0x8 unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x18,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: 
[0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2872,32 +2682,27 @@ ; VERDE-LABEL: load_1d_V2: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_V2: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_V2: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_V2: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x19,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -2970,32 +2775,27 @@ ; VERDE-LABEL: load_1d_glc: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_glc: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_glc: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v0, s[0:7] 
dmask:0xf unorm glc -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_glc: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_glc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ; encoding: [0x00,0x3f,0x00,0xf0,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -3006,32 +2806,27 @@ ; VERDE-LABEL: load_1d_slc: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_slc: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_slc: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_slc: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ; encoding: [0x00,0x1f,0x00,0xf2,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -3042,32 +2837,27 @@ ; VERDE-LABEL: load_1d_glc_slc: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: load_1d_glc_slc: ; FIJI: ; %bb.0: ; %main_body ; FIJI-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc -; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: load_1d_glc_slc: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: load_1d_glc_slc: ; NOPRT: ; %bb.0: ; %main_body ; NOPRT-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc -; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: load_1d_glc_slc: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ; encoding: [0x00,0x3f,0x00,0xf2,0x00,0x00,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -3254,7 +3044,6 @@ ; VERDE-NEXT: ds_write_b32 v0, v3 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: v_mov_b32_e32 v0, v1 -; VERDE-NEXT: s_waitcnt lgkmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; FIJI-LABEL: image_load_mmo: @@ -3265,7 +3054,6 @@ ; FIJI-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, v1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: image_load_mmo: @@ -3275,7 +3063,6 @@ ; GFX6789-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; 
GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: v_mov_b32_e32 v0, v1 -; GFX6789-NEXT: s_waitcnt lgkmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; NOPRT-LABEL: image_load_mmo: @@ -3285,7 +3072,6 @@ ; NOPRT-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; NOPRT-NEXT: s_waitcnt vmcnt(0) ; NOPRT-NEXT: v_mov_b32_e32 v0, v1 -; NOPRT-NEXT: s_waitcnt lgkmcnt(0) ; NOPRT-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: image_load_mmo: @@ -3296,7 +3082,6 @@ ; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog store float 0.000000e+00, float addrspace(3)* %lds %c0 = extractelement <2 x i32> %c, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.a16.dim.ll @@ -11,7 +11,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_2d: @@ -23,7 +22,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -39,7 +37,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; 
GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_cube: @@ -51,7 +48,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f16(i32 1, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -67,7 +63,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_2darray: @@ -79,7 +74,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4 v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f16(i32 1, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -95,7 +89,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_2d: @@ -107,7 +100,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -123,7 +115,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_cl_2d: @@ -135,7 +126,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f16(i32 1, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -153,7 +143,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_cl_2d: @@ -165,7 +154,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -181,7 +169,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_b_2d: @@ -193,7 +180,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 
16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -209,7 +195,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_b_2d: @@ -221,7 +206,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -239,7 +223,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_b_cl_2d: @@ -251,7 +234,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -270,7 +252,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, 
v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_b_cl_2d: @@ -282,7 +263,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f16(i32 1, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -295,7 +275,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_l_2d: @@ -304,7 +283,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: image_gather4_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f16(i32 1, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -319,7 +297,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: image_gather4_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_l_2d: @@ -328,7 +305,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) 
; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -341,7 +317,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_lz_2d: @@ -350,7 +325,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -363,7 +337,6 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: gather4_c_lz_2d: @@ -372,7 +345,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.dim.ll @@ -5,7 +5,6 @@ ; GCN-LABEL: {{^}}getlod_1d: ; PRE-GFX10: 
image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}} ; GFX10: image_get_lod v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D -; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @getlod_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -15,7 +14,6 @@ ; GCN-LABEL: {{^}}getlod_2d: ; PRE-GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3{{$}} ; GFX10: image_get_lod v[0:1], v[0:1], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_2D -; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <2 x float> @getlod_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { main_body: %r = call <2 x float> @llvm.amdgcn.image.getlod.2d.v2f32.f32(i32 3, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -9,7 +9,6 @@ ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d: @@ -19,7 +18,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -35,7 +33,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, 
s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_2d: @@ -47,7 +44,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -63,7 +59,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_3d: @@ -75,7 +70,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f16(i32 15, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -91,7 +85,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cube: @@ -103,7 +96,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f16(i32 15, half %s, half %t, half %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, 
i32 0, i32 0) @@ -119,7 +111,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1darray: @@ -131,7 +122,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f16(i32 15, half %s, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -147,7 +137,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_2darray: @@ -159,7 +148,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f16(i32 15, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -173,7 +161,6 @@ ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_1d: @@ -183,7 +170,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -199,7 +185,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_2d: @@ -211,7 +196,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -227,7 +211,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cl_1d: @@ -239,7 +222,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f16(i32 15, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -255,7 +237,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cl_2d: @@ -267,7 +248,6 @@ ; 
GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_cl v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f16(i32 15, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -283,7 +263,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cl_1d: @@ -295,7 +274,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -313,7 +291,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cl_2d: @@ -325,7 +302,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -339,7 +315,6 @@ ; GFX9-NEXT: s_wqm_b64 
exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_1d: @@ -349,7 +324,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f16(i32 15, float %bias, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -365,7 +339,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_2d: @@ -377,7 +350,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -391,7 +363,6 @@ ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_1d: @@ -401,7 +372,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = 
call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -417,7 +387,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_2d: @@ -429,7 +398,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -445,7 +413,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_cl_1d: @@ -457,7 +424,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -475,7 +441,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b_cl v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_cl_2d: @@ -487,7 +452,6 @@ 
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -503,7 +467,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_cl_1d: @@ -515,7 +478,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -534,7 +496,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 16, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_c_b_cl v[0:3], v[4:7], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_cl_2d: @@ -546,7 +507,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f16(i32 15, float %bias, float %zcompare, half %s, half %t, half %clamp, <8 x i32> 
%rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -557,14 +517,12 @@ ; GFX9-LABEL: sample_d_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -582,7 +540,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX9-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_2d: @@ -596,7 +553,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -617,7 +573,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX9-NEXT: image_sample_d v[0:3], v[7:14], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_3d: @@ -631,7 +586,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d v[0:3], [v0, v2, v3, v5, v6, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 -; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, half %s, half %t, half %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -642,14 +596,12 @@ ; GFX9-LABEL: sample_c_d_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -669,7 +621,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_2d: @@ -683,7 +634,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -696,7 +646,6 @@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf 
a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_1d: @@ -705,7 +654,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -723,7 +671,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX9-NEXT: image_sample_d_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_2d: @@ -737,7 +684,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -750,7 +696,6 @@ ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_1d: @@ -759,7 +704,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -779,7 +723,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 ; GFX9-NEXT: image_sample_c_d_cl v[0:3], v[7:14], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_2d: @@ -793,7 +736,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -804,14 +746,12 @@ ; GFX9-LABEL: sample_cd_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -829,7 +769,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX9-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_2d: @@ -843,7 +782,6 @@ ; 
GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_cd v[0:3], v[2:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -854,14 +792,12 @@ ; GFX9-LABEL: sample_c_cd_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -881,7 +817,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v1, v8, 16, v1 ; GFX9-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_2d: @@ -895,7 +830,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -908,7 +842,6 
@@ ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_1d: @@ -917,7 +850,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f16(i32 15, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -935,7 +867,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v4, v3, 16, v2 ; GFX9-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX9-NEXT: image_sample_cd_cl v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_2d: @@ -949,7 +880,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f16(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -962,7 +892,6 @@ ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_1d: @@ -971,7 +900,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], 
v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dsdv, half %s, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -991,7 +919,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v8, v2, 16, v0 ; GFX9-NEXT: image_sample_c_cd_cl v[0:3], v[7:14], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_2d: @@ -1005,7 +932,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f16(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1018,7 +944,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_1d: @@ -1027,7 +952,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_l v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f16(i32 15, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1040,7 +964,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 
; GFX9-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_2d: @@ -1049,7 +972,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX10-NEXT: image_sample_l v[0:3], v[1:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f16(i32 15, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1062,7 +984,6 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_1d: @@ -1071,7 +992,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f16(i32 15, float %zcompare, half %s, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1086,7 +1006,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v4, v2, 16, v0 ; GFX9-NEXT: image_sample_c_l v[0:3], v[3:5], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_2d: @@ -1095,7 +1014,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.l.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, half %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1106,14 +1024,12 @@ ; GFX9-LABEL: sample_lz_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_lz_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f16(i32 15, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1126,7 +1042,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_lz_2d: @@ -1135,7 +1050,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f16(i32 15, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1146,14 +1060,12 @@ ; GFX9-LABEL: sample_c_lz_1d: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_lz_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; 
return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f16(i32 15, float %zcompare, half %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1166,7 +1078,6 @@ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf a16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_lz_2d: @@ -1175,7 +1086,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f16(i32 15, float %zcompare, half %s, half %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1196,7 +1106,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 ; GFX9-NEXT: image_sample_c_d_o v0, v[8:15], s[0:7], s[8:11] dmask:0x4 a16 da -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_o_2darray_V1: @@ -1210,7 +1119,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f16.f16(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1231,7 +1139,6 @@ ; GFX9-NEXT: v_lshl_or_b32 v11, v5, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v10, v3, 16, v0 ; GFX9-NEXT: image_sample_c_d_o v[0:1], v[8:15], s[0:7], s[8:11] dmask:0x6 a16 da -; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_o_2darray_V2: @@ -1245,7 +1152,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f16(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %t, half %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -11,7 +11,6 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 -; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; ; GFX81-LABEL: image_sample_2d_f16: @@ -20,7 +19,6 @@ ; GFX81-NEXT: s_wqm_b64 exec, exec ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX81-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 -; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: image_sample_2d_f16: @@ -29,7 +27,6 @@ ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: image_sample_2d_f16: @@ -39,7 +36,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 -; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -60,7 +56,6 @@ ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_mov_b32_e32 v0, v2 ; TONGA-NEXT: flat_store_dword v[4:5], v3 -; TONGA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; TONGA-NEXT: ; return to shader part epilog ; ; GFX81-LABEL: image_sample_2d_f16_tfe: @@ -76,7 +71,6 @@ ; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: v_mov_b32_e32 v0, v2 ; GFX81-NEXT: flat_store_dword v[4:5], v3 -; GFX81-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: image_sample_2d_f16_tfe: @@ -92,7 +86,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: global_store_dword v[4:5], v3, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: image_sample_2d_f16_tfe: @@ -109,7 +102,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: global_store_dword v[4:5], v3, off -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: ; return to shader part epilog main_body: %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) @@ -131,20 +123,17 @@ ; GFX81-LABEL: image_sample_c_d_1d_v2f16: ; GFX81: ; %bb.0: ; %main_body ; GFX81-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16 -; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: image_sample_c_d_1d_v2f16: ; GFX9: ; %bb.0: ; %main_body ; GFX9-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: image_sample_c_d_1d_v2f16: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 
dim:SQ_RSRC_IMG_1D d16 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -193,7 +182,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) @@ -226,7 +214,6 @@ ; GFX81-NEXT: s_wqm_b64 exec, exec ; GFX81-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16 -; GFX81-NEXT: s_waitcnt vmcnt(0) ; GFX81-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: image_sample_b_2d_v4f16: @@ -235,7 +222,6 @@ ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: image_sample_b_2d_v4f16: @@ -245,7 +231,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) @@ -316,7 +301,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; GFX10-NEXT: 
image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -10,7 +10,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d: @@ -19,7 +18,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d: @@ -29,7 +27,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -53,7 +50,6 @@ ; VERDE-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[12:15], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; 
GFX6789-LABEL: sample_1d_tfe: @@ -72,7 +68,6 @@ ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf tfe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[6:7], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe: @@ -92,7 +87,6 @@ ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0f,0x81,0xf0,0x05,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -112,7 +106,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_1: @@ -124,7 +117,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_1: @@ -137,7 +129,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x01,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} 
@llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -160,7 +151,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_2: @@ -172,7 +162,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_2: @@ -185,7 +174,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x02,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -208,7 +196,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_3: @@ -220,7 +207,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_3: @@ -233,7 +219,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] 
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x04,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -256,7 +241,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v1, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_4: @@ -268,7 +252,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v1, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_4: @@ -281,7 +264,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x08,0x81,0xf0,0x02,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -305,7 +287,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_12: @@ -318,7 +299,6 @@ ; 
GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12: @@ -332,7 +312,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x03,0x81,0xf0,0x03,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -358,7 +337,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v2, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_24: @@ -371,7 +349,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24: @@ -385,7 +362,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0a,0x81,0xf0,0x03,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, 
<8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -412,7 +388,6 @@ ; VERDE-NEXT: v_mov_b32_e32 v3, v0 ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_tfe_adjust_writemask_134: @@ -426,7 +401,6 @@ ; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_tfe_adjust_writemask_134: @@ -441,7 +415,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd dim:SQ_RSRC_IMG_1D tfe ; encoding: [0x00,0x0d,0x81,0xf0,0x04,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -475,7 +448,6 @@ ; VERDE-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: buffer_store_dword v4, off, s[12:15], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_lwe: @@ -494,7 +466,6 @@ ; GFX6789-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf lwe ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[6:7], v4, off -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_lwe: @@ -514,7 +485,6 @@ ; GFX10-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; encoding: [0x00,0x0f,0x82,0xf0,0x05,0x00,0x40,0x00] ; 
GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0) @@ -531,7 +501,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_2d: @@ -540,7 +509,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_2d: @@ -550,7 +518,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -564,7 +531,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_3d: @@ -573,7 +539,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample 
v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_3d: @@ -583,7 +548,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x10,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -597,7 +561,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cube: @@ -606,7 +569,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cube: @@ -616,7 +578,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_CUBE ; encoding: [0x18,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32 15, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -630,7 +591,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: 
s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1darray: @@ -639,7 +599,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1darray: @@ -649,7 +608,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY ; encoding: [0x20,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32 15, float %s, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -663,7 +621,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_2darray: @@ -672,7 +629,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_2darray: @@ -682,7 +638,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: 
[0x28,0x0f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.2darray.v4f32.f32(i32 15, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -696,7 +651,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_1d: @@ -705,7 +659,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_1d: @@ -715,7 +668,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -729,7 +681,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_2d: @@ -738,7 +689,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; 
GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_2d: @@ -748,7 +698,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa0,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -762,7 +711,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cl_1d: @@ -771,7 +719,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cl_1d: @@ -781,7 +728,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32(i32 15, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -795,7 +741,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_cl v[0:3], v[0:2], 
s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cl_2d: @@ -804,7 +749,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cl_2d: @@ -814,7 +758,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x84,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32(i32 15, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -828,7 +771,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_cl_1d: @@ -837,7 +779,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cl_1d: @@ -847,7 +788,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: 
[0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -861,7 +801,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_cl_2d: @@ -870,7 +809,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cl_2d: @@ -880,7 +818,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cl.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -894,7 +831,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_b_1d: @@ -903,7 +839,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; 
; GFX10-LABEL: sample_b_1d: @@ -913,7 +848,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.1d.v4f32.f32.f32(i32 15, float %bias, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -927,7 +861,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_b_2d: @@ -936,7 +869,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_2d: @@ -946,7 +878,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x94,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -960,7 +891,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt 
vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_b_1d: @@ -969,7 +899,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_1d: @@ -979,7 +908,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -993,7 +921,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_b_2d: @@ -1002,7 +929,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_2d: @@ -1012,7 +938,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb4,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to 
shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1026,7 +951,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_b_cl_1d: @@ -1035,7 +959,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_cl_1d: @@ -1045,7 +968,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1059,7 +981,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_b_cl_2d: @@ -1068,7 +989,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_b_cl_2d: 
@@ -1078,7 +998,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x98,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1092,7 +1011,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_b_cl_1d: @@ -1101,7 +1019,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_cl_1d: @@ -1111,7 +1028,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.1d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1125,7 +1041,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample_c_b_cl 
v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_b_cl_2d: @@ -1134,7 +1049,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_b_cl_2d: @@ -1144,7 +1058,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample_c_b_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb8,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.2d.v4f32.f32.f32(i32 15, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1155,20 +1068,17 @@ ; VERDE-LABEL: sample_d_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_d_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f32.f32(i32 
15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1179,20 +1089,17 @@ ; VERDE-LABEL: sample_d_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_d_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1203,20 +1110,17 @@ ; VERDE-LABEL: sample_c_d_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> 
@llvm.amdgcn.image.sample.c.d.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1227,20 +1131,17 @@ ; VERDE-LABEL: sample_c_d_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1251,20 +1152,17 @@ ; VERDE-LABEL: sample_d_cl_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_d_cl_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; 
GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1275,20 +1173,17 @@ ; VERDE-LABEL: sample_d_cl_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_d_cl_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1299,20 +1194,17 @@ ; VERDE-LABEL: sample_c_d_cl_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_cl_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; 
implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1323,20 +1215,17 @@ ; VERDE-LABEL: sample_c_d_cl_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_cl_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1347,20 +1236,17 @@ ; VERDE-LABEL: sample_cd_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cd_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_cd v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd v[0:3], 
v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1371,20 +1257,17 @@ ; VERDE-LABEL: sample_cd_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cd_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1395,20 +1278,17 @@ ; VERDE-LABEL: sample_c_cd_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_cd_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body ; 
GFX10-NEXT: image_sample_c_cd v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1419,20 +1299,17 @@ ; VERDE-LABEL: sample_c_cd_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_cd_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1443,20 +1320,17 @@ ; VERDE-LABEL: sample_cd_cl_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cd_cl_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return 
to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1467,20 +1341,17 @@ ; VERDE-LABEL: sample_cd_cl_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_cd_cl_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1491,20 +1362,17 @@ ; VERDE-LABEL: sample_c_cd_cl_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_cd_cl_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_cd_cl 
v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1515,20 +1383,17 @@ ; VERDE-LABEL: sample_c_cd_cl_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_cd_cl_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd_cl v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f32.f32(i32 15, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1539,20 +1404,17 @@ ; VERDE-LABEL: sample_l_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; 
VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_l_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_l v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x90,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1563,20 +1425,17 @@ ; VERDE-LABEL: sample_l_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_l_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_l_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x90,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1587,20 +1446,17 @@ ; VERDE-LABEL: sample_c_l_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; 
GFX6789-LABEL: sample_c_l_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xb0,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1611,20 +1467,17 @@ ; VERDE-LABEL: sample_c_l_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_l_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_l_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xb0,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1635,20 +1488,17 @@ ; VERDE-LABEL: sample_lz_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; 
; GFX6789-LABEL: sample_lz_1d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_lz_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0x9c,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1659,20 +1509,17 @@ ; VERDE-LABEL: sample_lz_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_lz_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_lz_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0x9c,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1683,20 +1530,17 @@ ; VERDE-LABEL: sample_c_lz_1d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_lz_1d: ; GFX6789: ; %bb.0: ; %main_body ; 
GFX6789-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_lz_1d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0f,0xbc,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32 15, float %zcompare, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1707,20 +1551,17 @@ ; VERDE-LABEL: sample_c_lz_2d: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_lz_2d: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_lz_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x08,0x0f,0xbc,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1731,20 +1572,17 @@ ; VERDE-LABEL: sample_c_d_o_2darray_V1: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_o_2darray_V1: ; GFX6789: ; 
%bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_o v0, v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1762,7 +1600,6 @@ ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: v_mov_b32_e32 v0, v9 ; VERDE-NEXT: buffer_store_dword v10, off, s[12:15], 0 -; VERDE-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_o_2darray_V1_tfe: @@ -1775,7 +1612,6 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: global_store_dword v[0:1], v10, off ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_o_2darray_V1_tfe: @@ -1790,7 +1626,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 
0, i32 1, i32 0) @@ -1804,20 +1639,17 @@ ; VERDE-LABEL: sample_c_d_o_2darray_V2: ; VERDE: ; %bb.0: ; %main_body ; VERDE-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_c_d_o_2darray_V2: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x28,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -1859,7 +1691,6 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: image_sample_c_d_o v[0:2], [v11, v10, v9, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x06,0xe9,0xf0,0x0b,0x00,0x40,0x00,0x0a,0x09,0x03,0x04,0x05,0x06,0x07,0x08] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) @@ -1881,7 +1712,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; 
VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_unorm: @@ -1890,7 +1720,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_unorm: @@ -1900,7 +1729,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; encoding: [0x00,0x1f,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 1, i32 0, i32 0) @@ -1914,7 +1742,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_glc: @@ -1923,7 +1750,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_glc: @@ -1933,7 +1759,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; encoding: [0x00,0x2f,0x80,0xf0,0x00,0x00,0x40,0x00] 
-; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1) @@ -1947,7 +1772,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf slc -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_slc: @@ -1956,7 +1780,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf slc -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: sample_1d_slc: @@ -1966,7 +1789,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; encoding: [0x00,0x0f,0x80,0xf2,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 2) @@ -1980,7 +1802,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc slc -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: sample_1d_glc_slc: @@ -1989,7 +1810,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf glc slc -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
sample_1d_glc_slc: @@ -1999,7 +1819,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D glc slc ; encoding: [0x00,0x2f,0x80,0xf2,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 3) @@ -2013,7 +1832,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_0: @@ -2022,7 +1840,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_0: @@ -2032,7 +1849,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x01,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2047,7 +1863,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; 
; GFX6789-LABEL: adjust_writemask_sample_01: @@ -2056,7 +1871,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_01: @@ -2066,7 +1880,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x03,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2081,7 +1894,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_012: @@ -2090,7 +1902,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_012: @@ -2100,7 +1911,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x07,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> 
@llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2115,7 +1925,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_12: @@ -2124,7 +1933,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_12: @@ -2134,7 +1942,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2149,7 +1956,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_03: @@ -2158,7 +1964,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_03: @@ -2168,7 +1973,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 
exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x9 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x09,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2183,7 +1987,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_13: @@ -2192,7 +1995,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_13: @@ -2202,7 +2004,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2217,7 +2018,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_123: @@ -2226,7 +2026,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 
exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_123: @@ -2236,7 +2035,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:2], v0, s[0:7], s[8:11] dmask:0xe dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2269,7 +2067,6 @@ ; VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_123_to_12: @@ -2278,7 +2075,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_123_to_12: @@ -2288,7 +2084,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x06,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 14, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -2303,7 +2098,6 @@ ; 
VERDE-NEXT: s_wqm_b64 exec, exec ; VERDE-NEXT: s_and_b64 exec, exec, s[12:13] ; VERDE-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa -; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: ; return to shader part epilog ; ; GFX6789-LABEL: adjust_writemask_sample_013_to_13: @@ -2312,7 +2106,6 @@ ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: s_and_b64 exec, exec, s[12:13] ; GFX6789-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa -; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: adjust_writemask_sample_013_to_13: @@ -2322,7 +2115,6 @@ ; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo ; encoding: [0x7e,0x09,0xfe,0xbe] ; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 ; encoding: [0x7e,0x0c,0x7e,0x87] ; GFX10-NEXT: image_sample v[0:1], v0, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D ; encoding: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x40,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 11, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -6,7 +6,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -23,7 +22,6 @@ ; GFX10-NEXT: 
v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -40,7 +38,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x15,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x03,0x05,0x06,0x07,0x08,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -52,7 +49,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) 
@@ -69,7 +65,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -81,7 +76,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -98,7 +92,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, 
i32 0, i32 0) @@ -110,7 +103,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -127,7 +119,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -139,7 +130,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -156,7 +146,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: 
[0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -168,7 +157,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -185,7 +173,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -197,7 +184,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: 
image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -214,7 +200,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -226,7 +211,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; encoding: [0x01,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -243,7 +227,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: 
[0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0d,0x0f,0xac,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06,0x07,0x00,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -260,7 +243,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x04,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -277,7 +259,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; encoding: [0x04,0x00,0x6f,0xd7,0x05,0x21,0x11,0x04] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x2d,0x06,0xe8,0xf0,0x00,0x00,0x40,0x00,0x01,0x02,0x04,0x06,0x07,0x08,0x00,0x00] -; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> 
@llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -6,7 +6,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -23,7 +22,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -40,7 +38,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v3, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %drdh, half %dsdv, half %dtdv, half %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -52,7 +49,6 @@ ; GFX10: ; %bb.0: ; %main_body 
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -69,7 +65,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -81,7 +76,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -98,7 +92,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -110,7 +103,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], 
v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -127,7 +119,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -139,7 +130,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -156,7 +146,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v3, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -168,7 +157,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf 
dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -185,7 +173,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -197,7 +184,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.1d.v4f32.f16.f32(i32 15, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -214,7 +200,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.2d.v4f32.f16.f32(i32 15, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -226,7 +211,6 @@ ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: 
; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.1d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -243,7 +227,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], [v0, v1, v3, v5, v6, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.2d.v4f32.f16.f32(i32 15, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -260,7 +243,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call float @llvm.amdgcn.image.sample.c.d.o.2darray.f16.f32.f32(i32 4, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) @@ -277,7 +259,6 @@ ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], [v0, v1, v2, v4, v6, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: %v = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -5,7 +5,6 @@ ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return define amdgpu_ps i32 @test_ps() #1 { %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -60,7 +60,6 @@ ; GCN-LABEL: {{^}}func_implicitarg_ptr: ; GCN: s_waitcnt ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @func_implicitarg_ptr() #0 { %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() @@ -72,7 +71,6 @@ ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr: ; GCN: s_waitcnt ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @opencl_func_implicitarg_ptr() #0 { %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() @@ -158,7 +156,6 @@ ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; GCN: s_waitcnt lgkmcnt(0) define void @func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() @@ -174,7 +171,6 @@ ; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0 ; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; GCN: s_waitcnt lgkmcnt(0) define void 
@opencl_func_kernarg_implicitarg_ptr() #0 { %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.format.ll @@ -5,7 +5,6 @@ ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 ;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) @@ -19,7 +18,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 42, i32 0, i32 0) @@ -32,7 +30,6 @@ ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092 ;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 4092, i32 60, i32 0) @@ -45,7 +42,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) @@ -54,7 
+50,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 60 @@ -64,7 +59,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, off, s[0:3], 0 -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { main_body: %data = call float @llvm.amdgcn.raw.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) @@ -73,7 +67,6 @@ ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0 -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { main_body: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -6,7 +6,6 @@ ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0{{$}} ;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc{{$}} ;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc{{$}} -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0) @@ -25,7 +24,6 @@ ;GFX10: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 dlc{{$}} ;GFX10: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc dlc{{$}} ;GFX10: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc dlc{{$}} -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load_dlc(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 4) @@ -39,7 +37,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: 
buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 40, i32 0, i32 0) @@ -49,7 +46,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 4, i32 8188, i32 0) @@ -58,7 +54,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0) @@ -67,7 +62,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 60 @@ -77,7 +71,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v0, s[0:3], 0 offen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %ofs) { main_body: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) @@ -86,7 +79,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v0, s[0:3], 0 offen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %ofs) { main_body: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0, i32 0) @@ -123,7 +115,6 @@ ;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, 
s[0:3], 0 offen offset:28 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -149,7 +140,6 @@ ;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { main_body: %a = shl i32 %inp, 6 @@ -176,7 +166,6 @@ ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -199,7 +188,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged_and: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a) { main_body: %a1 = add i32 %a, 4 @@ -218,7 +206,6 @@ ;CHECK-NEXT: %bb. 
;CHECK-NEXT: v_lshlrev_b32_e32 v{{[0-9]}}, 4, v0 ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp) { main_body: %a = shl i32 %inp, 4 @@ -239,7 +226,6 @@ ;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { main_body: %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) @@ -256,7 +242,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged: ;CHECK-NEXT: %bb. ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK: s_waitcnt define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) { main_body: %vr1 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) @@ -273,7 +258,6 @@ ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 glc ;CHECK: buffer_load_dword v6, off, s[0:3], 0 slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { main_body: %data = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0) @@ -409,7 +393,6 @@ ;GFX10-NEXT: s_clause ;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 ;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 -;CHECK: s_waitcnt define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { main_body: %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) @@ -432,7 +415,6 @@ ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 ;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 
;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 -;CHECK: s_waitcnt define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { main_body: %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.ll @@ -11,7 +11,6 @@ ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] glc ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] slc ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] glc dlc -; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 78, i32 0) @@ -45,7 +44,6 @@ ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] offset:4095 ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] offset:73 ; GFX10-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] offset:1 -; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 4095, i32 61, i32 47, i32 0) %vdata_glc = call <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32> %0, i32 73, i32 %soffs, i32 62, i32 0) @@ -102,4 +100,3 @@ declare 
<2 x i32> @llvm.amdgcn.raw.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.raw.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) declare <4 x float> @llvm.amdgcn.raw.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32) - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -5,7 +5,6 @@ ;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_load_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) @@ -19,7 +18,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i32 0, i32 0) @@ -32,7 +30,6 @@ ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS1]] idxen offset:4092 ;CHECK-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc ;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[OFS2]] idxen offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4092, i32 60, i32 0) @@ -45,7 +42,6 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define 
amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) @@ -54,7 +50,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) @@ -63,7 +58,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 60 @@ -73,7 +67,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) @@ -83,7 +76,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both_reversed: ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) @@ -92,7 +84,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { main_body: %data = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -101,7 +92,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x_i32: ;CHECK: buffer_load_format_x v0, {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps float 
@buffer_load_x_i32(<4 x i32> inreg %rsrc) { main_body: %data = call i32 @llvm.amdgcn.struct.buffer.load.format.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -111,7 +101,6 @@ ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: buffer_load_format_xy v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { main_body: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.ll @@ -5,7 +5,6 @@ ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) @@ -19,7 +18,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i32 0, i32 0) @@ -29,7 +27,6 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 4, i32 8188, i32 0) @@ -38,7 +35,6 @@ ;CHECK-LABEL: 
{{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i32 0, i32 0) @@ -47,7 +43,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i32 0, i32 0) @@ -56,7 +51,6 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 60 @@ -66,7 +60,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i32 0, i32 0) @@ -76,7 +69,6 @@ ;CHECK-LABEL: {{^}}buffer_load_both_reversed: ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i32 0, i32 0) @@ -85,7 +77,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { main_body: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) @@ -94,7 +85,6 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 
v[0:1], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { main_body: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) @@ -129,7 +119,6 @@ ;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen ;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc ;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(<4 x i32> inreg) { main_body: %data = call <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.ll @@ -12,7 +12,6 @@ ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_32_SINT] idxen glc ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen slc ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_FMT_32_FLOAT] idxen glc dlc -; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 78, i32 0) @@ -48,7 +47,6 @@ ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_FMT_10_10_10_2_SSCALED] idxen offset:4095 ; GFX10: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_UINT] idxen offset:73 ; GFX10: tbuffer_load_format_xyzw 
{{v\[[0-9]+:[0-9]+\]}}, [[ZEROREG]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:1 -; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4095, i32 61, i32 47, i32 0) %vdata_glc = call <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 73, i32 %soffs, i32 62, i32 0) @@ -126,4 +124,3 @@ declare <2 x i32> @llvm.amdgcn.struct.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.struct.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32) declare <4 x float> @llvm.amdgcn.struct.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32) - diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -6,7 +6,6 @@ ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] -; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { main_body: %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) @@ -36,7 +35,6 @@ ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095 ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, 
{{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73 ; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1 -; GCN: s_waitcnt define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) { %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0) %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -12,7 +12,6 @@ ; GFX900: ds_write_b16 [[ZERO]], v2 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 { entry: @@ -34,7 +33,6 @@ ; GFX900-DAG: s_waitcnt lgkmcnt(0) ; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]] ; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]] -; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 { entry: @@ -56,7 +54,6 @@ ; GFX900-NEXT: ds_write_b16 v2, v0 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 { entry: @@ -73,7 +70,6 @@ ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: ; GCN: s_waitcnt ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0 -; GFX900-NEXT: s_waitcnt ; 
GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_u16 v @@ -106,7 +102,6 @@ ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_u16 v @@ -158,7 +153,6 @@ ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_u16 v @@ -176,7 +170,6 @@ ; GFX900-NEXT: ds_read_u8_d16_hi v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_u8 v @@ -195,7 +188,6 @@ ; GFX900-NEXT: ds_read_i8_d16_hi v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_i8 v @@ -214,7 +206,6 @@ ; GFX900-NEXT: ds_read_u8_d16_hi v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_u8 v @@ -235,7 +226,6 @@ ; GFX900-NEXT: ds_read_i8_d16_hi v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}} -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: ds_read_i8 v @@ -256,7 +246,6 @@ ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 { entry: @@ -273,7 +262,6 @@ ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 { entry: @@ -290,7 +278,6 @@ ; GFX900-NEXT: 
global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 { entry: @@ -308,7 +295,6 @@ ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 { entry: @@ -326,7 +312,6 @@ ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 { entry: @@ -345,7 +330,6 @@ ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 { entry: @@ -364,7 +348,6 @@ ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}} @@ -385,7 +368,6 @@ ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1] ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}} @@ -406,7 +388,6 @@ ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: flat_load_ubyte v{{[0-9]+}} @@ -428,7 +409,6 @@ ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 -; 
GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: flat_load_sbyte v{{[0-9]+}} @@ -450,7 +430,6 @@ ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1] ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: flat_load_ubyte v{{[0-9]+}} @@ -473,7 +452,6 @@ ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1] ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v[0:1], v2 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: flat_load_sbyte v{{[0-9]+}} @@ -496,7 +474,6 @@ ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} @@ -515,7 +492,6 @@ ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}} @@ -534,7 +510,6 @@ ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} @@ -552,7 +527,6 @@ ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} @@ -570,7 +544,6 @@ ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; 
GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} @@ -590,7 +563,6 @@ ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} @@ -611,7 +583,6 @@ ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} @@ -632,7 +603,6 @@ ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}} @@ -652,7 +622,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} @@ -671,7 +640,6 @@ ; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}} @@ -690,7 +658,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} @@ -710,7 +677,6 @@ ; GFX900-NEXT: global_load_short_d16_hi 
v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; GFX803: flat_load_ushort @@ -730,7 +696,6 @@ ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 ; GFX803: flat_load_ushort @@ -750,7 +715,6 @@ ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 { entry: @@ -769,7 +733,6 @@ ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 { entry: @@ -977,7 +940,6 @@ ; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { entry: @@ -998,7 +960,6 @@ ; GFX900-NEXT: ds_write_b16 v1, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]] -; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -8,14 +8,12 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16_d16 v0, v0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: 
load_local_lo_v2i16_undeflo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_undeflo: @@ -23,7 +21,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -76,7 +73,6 @@ ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg: @@ -87,7 +83,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg: @@ -99,7 +94,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -132,7 +126,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -181,7 +174,6 @@ ; GFX900-NEXT: ds_read_u16_d16 v1, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reghi_vreg: @@ -193,7 +185,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: 
global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2f16_reghi_vreg: @@ -205,7 +196,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -224,7 +214,6 @@ ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg: @@ -235,7 +224,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg: @@ -247,7 +235,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load half, half addrspace(3)* %in @@ -264,7 +251,6 @@ ; GFX900-NEXT: ds_read_u8_d16 v1, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8: @@ -275,7 +261,6 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_zexti8: @@ -288,7 +273,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: 
s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -308,7 +292,6 @@ ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: @@ -319,7 +302,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_zexti8: @@ -331,7 +313,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -349,7 +330,6 @@ ; GFX900-NEXT: ds_read_i8_d16 v1, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8: @@ -360,7 +340,6 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_sexti8: @@ -372,7 +351,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -392,7 +370,6 @@ ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: @@ -403,7 +380,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reglo_vreg_sexti8: @@ -415,7 +391,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -435,7 +410,6 @@ ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: @@ -446,7 +420,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_zexti8: @@ -458,7 +431,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -479,7 +451,6 @@ ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX900-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: @@ -490,7 +461,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: 
s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2f16_reglo_vreg_sexti8: @@ -502,7 +472,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i8, i8 addrspace(3)* %in @@ -525,7 +494,6 @@ ; GFX900-NEXT: ds_write_b16 v2, v0 ; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: @@ -538,7 +506,6 @@ ; GFX906-NEXT: ds_write_b16 v2, v0 ; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: @@ -552,7 +519,6 @@ ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -573,7 +539,6 @@ ; GFX900-NEXT: ds_write_b16 v0, v2 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi: @@ -587,7 +552,6 @@ ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi: @@ -602,7 +566,6 @@ ; GFX803-NEXT: s_waitcnt lgkmcnt(1) ; 
GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -625,7 +588,6 @@ ; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: @@ -639,7 +601,6 @@ ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi: @@ -654,7 +615,6 @@ ; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %load = load i16, i16 addrspace(3)* %in @@ -673,7 +633,6 @@ ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg: @@ -684,7 +643,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg: @@ -697,7 +655,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -715,7 +672,6 @@ 
; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg: @@ -727,7 +683,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg: @@ -740,7 +695,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -758,7 +712,6 @@ ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8: @@ -769,7 +722,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_zexti8: @@ -783,7 +735,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -802,7 +753,6 @@ ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8: @@ -813,7 +763,6 @@ ; 
GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_lo_v2i16_reglo_vreg_sexti8: @@ -826,7 +775,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -845,7 +793,6 @@ ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8: @@ -857,7 +804,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_zexti8: @@ -871,7 +817,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -891,7 +836,6 @@ ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8: @@ -903,7 +847,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX803-LABEL: load_global_lo_v2f16_reglo_vreg_sexti8: @@ -916,7 +859,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -936,7 +878,6 @@ ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_flat_lo_v2i16_reghi_vreg: @@ -947,7 +888,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_lo_v2i16_reghi_vreg: @@ -958,7 +898,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -975,7 +914,6 @@ ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_flat_lo_v2f16_reghi_vreg: @@ -987,7 +925,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_lo_v2f16_reghi_vreg: @@ -998,7 +935,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: 
s_setpc_b64 s[30:31] ; FIXME: the and above should be removable @@ -1017,7 +953,6 @@ ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8: @@ -1028,7 +963,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_zexti8: @@ -1040,7 +974,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1058,7 +991,6 @@ ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8: @@ -1069,7 +1001,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_lo_v2i16_reglo_vreg_sexti8: @@ -1080,7 +1011,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1098,7 +1028,6 @@ ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; 
GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8: @@ -1110,7 +1039,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_zexti8: @@ -1122,7 +1050,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1141,7 +1068,6 @@ ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1] ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8: @@ -1153,7 +1079,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_flat_lo_v2f16_reglo_vreg_sexti8: @@ -1164,7 +1089,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1183,7 +1107,6 @@ ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg: @@ -1194,7 +1117,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, 
v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg: @@ -1205,7 +1127,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1225,7 +1146,6 @@ ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg: @@ -1236,7 +1156,6 @@ ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg: @@ -1247,7 +1166,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 @@ -1265,7 +1183,6 @@ ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg: @@ -1277,7 +1194,6 @@ ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg: @@ -1288,7 +1204,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; 
GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1306,7 +1221,6 @@ ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: @@ -1317,7 +1231,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: @@ -1328,7 +1241,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1345,7 +1257,6 @@ ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: @@ -1356,7 +1267,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: @@ -1367,7 +1277,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1384,7 +1293,6 @@ ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: 
global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: @@ -1396,7 +1304,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: @@ -1407,7 +1314,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1424,7 +1330,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: @@ -1435,7 +1340,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: @@ -1447,7 +1351,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1466,7 +1369,6 @@ ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: @@ -1477,7 +1379,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: 
global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: @@ -1488,7 +1389,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1507,7 +1407,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: @@ -1518,7 +1417,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: @@ -1530,7 +1428,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1548,7 +1445,6 @@ ; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: @@ -1559,7 +1455,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: @@ -1570,7 +1465,6 @@ ; GFX803-NEXT: s_waitcnt 
vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> @@ -1588,7 +1482,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: @@ -1600,7 +1493,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: @@ -1612,7 +1504,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1631,7 +1522,6 @@ ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_constant_lo_v2i16_reglo_vreg: @@ -1642,7 +1532,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v1, v0, v2 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_constant_lo_v2i16_reglo_vreg: @@ -1655,7 +1544,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to 
<2 x i16> @@ -1673,7 +1561,6 @@ ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg: @@ -1685,7 +1572,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg: @@ -1698,7 +1584,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1716,7 +1601,6 @@ ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8: @@ -1728,7 +1612,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_zexti8: @@ -1742,7 +1625,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1762,7 +1644,6 @@ ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v2, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX906-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8: @@ -1774,7 +1655,6 @@ ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_constant_lo_v2f16_reglo_vreg_sexti8: @@ -1787,7 +1667,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> @@ -1809,7 +1688,6 @@ ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: @@ -1822,7 +1700,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: @@ -1835,7 +1712,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -1859,7 +1735,6 @@ ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: @@ -1872,7 +1747,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: 
global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: @@ -1885,7 +1759,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -1910,7 +1783,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: @@ -1923,7 +1795,6 @@ ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v1, v0 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: @@ -1937,7 +1808,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -1962,7 +1832,6 @@ ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: @@ -1976,7 +1845,6 @@ ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: 
load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: @@ -1989,7 +1857,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -2015,7 +1882,6 @@ ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: @@ -2029,7 +1895,6 @@ ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX906-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: @@ -2043,7 +1908,6 @@ ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 -; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/load-local.128.ll b/llvm/test/CodeGen/AMDGPU/load-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.128.ll @@ -8,7 +8,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32: @@ -16,7 +15,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b128 v[0:3], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v4i32: @@ -26,7 +24,6 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 
; GFX6-NEXT: ds_read_b64 v[2:3], v1 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr ret <4 x i32> %load @@ -289,7 +286,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align4: @@ -299,7 +295,6 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX7-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v4i32_align4: @@ -313,7 +308,6 @@ ; GFX6-NEXT: ds_read_b32 v3, v3 ; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4 ret <4 x i32> %load @@ -324,7 +318,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align8: @@ -332,7 +325,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v4i32_align8: @@ -342,7 +334,6 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[2:3], v1 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8 ret <4 x i32> %load @@ -353,7 +344,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b128 v[0:3], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v4i32_align16: @@ -361,7 +351,6 @@ 
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b128 v[0:3], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v4i32_align16: @@ -371,7 +360,6 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b64 v[2:3], v1 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16 ret <4 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll --- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll @@ -8,7 +8,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32: @@ -16,7 +15,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b96 v[0:2], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v3i32: @@ -26,7 +24,6 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr ret <3 x i32> %load @@ -240,7 +237,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align4: @@ -250,7 +246,6 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v3i32_align4: @@ -262,7 +257,6 @@ ; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: 
ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4 ret <3 x i32> %load @@ -273,7 +267,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align8: @@ -283,7 +276,6 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 ; GFX7-NEXT: ds_read_b32 v2, v2 offset:8 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v3i32_align8: @@ -295,7 +287,6 @@ ; GFX6-NEXT: ds_read_b32 v2, v2 ; GFX6-NEXT: ds_read_b32 v1, v1 ; GFX6-NEXT: ds_read_b32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8 ret <3 x i32> %load @@ -306,7 +297,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_read_b96 v[0:2], v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: load_lds_v3i32_align16: @@ -314,7 +304,6 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_read_b96 v[0:2], v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-LABEL: load_lds_v3i32_align16: @@ -324,7 +313,6 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: ds_read_b32 v2, v1 ; GFX6-NEXT: ds_read_b64 v[0:1], v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16 ret <3 x i32> %load diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -121,7 +121,6 @@ ; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]{{$}} ; GFX9-NEXT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, v3 
; GFX9-NEXT: v_mad_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp{{$}} -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { %src0.ext = fpext half %src0 to float diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -24,7 +24,6 @@ ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)* diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -157,7 +157,6 @@ ; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56 ; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -230,7 +229,6 @@ ; GCN-NEXT: global_store_dword v[3:4], v5, off ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: global_store_dword v[3:4], v2, off offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32 @@ -259,7 +257,6 @@ ; GCN-NEXT: global_store_dword v[3:4], v5, off ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: global_store_dword v[3:4], v2, off offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 32 diff --git 
a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -28,7 +28,6 @@ ; GFX9-NEXT: s_cbranch_execnz BB0_1 ; GFX9-NEXT: ; %bb.2: ; %.loopexit ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: %tmp22 = and i32 %arg6, 16777215 @@ -102,7 +101,6 @@ ; GFX9-NEXT: s_cbranch_execnz BB1_2 ; GFX9-NEXT: BB1_3: ; %Flow3 ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: %tmp = icmp ult i32 %arg, %arg1 @@ -158,7 +156,6 @@ ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %b = and i32 %b.arg, 16777215 %s = and i32 %s.arg, 16777215 @@ -209,11 +206,14 @@ ; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_readlane_b32 s4, v43, 2 ; GFX9-NEXT: v_readlane_b32 s5, v43, 3 ; GFX9-NEXT: v_readlane_b32 s35, v43, 1 @@ -226,7 +226,6 @@ ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] %b = and i32 %b.arg, 16777215 %s = and i32 %s.arg, 16777215 diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll 
b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -375,7 +375,6 @@ ; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock ; GCN-NEXT: s_or_b64 exec, exec -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN-NEXT: ; return define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -21,6 +21,7 @@ ; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: v_readlane_b32 s4, v40, 0 ; GCN: v_readlane_b32 s5, v40, 1 @@ -30,7 +31,6 @@ ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] define void @test_func_call_external_void_func_i32_imm() #0 { call void @external_void_func_i32(i32 42) @@ -43,6 +43,7 @@ ; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; GCN: s_swappc_b64 +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN: s_sub_u32 s32, s32, 0x1400{{$}} ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -182,7 +182,6 @@ ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: s_mov_b32 s33, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: @@ -244,7 +243,6 @@ ; GCN-NEXT: global_store_dword v[0:1], v0, 
off ; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 ; GCN-NEXT: s_mov_b32 s33, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -10,7 +10,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_1: @@ -21,7 +20,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 1 %load = load i8, i8* %gep, align 4 @@ -33,7 +31,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_11bit_max: @@ -44,7 +41,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 2047 %load = load i8, i8* %gep, align 4 @@ -56,7 +52,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_12bit_max: @@ -67,7 +62,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 4095 %load = load i8, i8* %gep, align 4 @@ -81,7 +75,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_13bit_max: @@ -92,7 +85,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8191 %load = load i8, i8* %gep, align 4 @@ -106,7 +98,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_neg_11bit_max: @@ -117,7 +108,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -2048 %load = load i8, i8* %gep, align 4 @@ -131,7 +121,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_neg_12bit_max: @@ -142,7 +131,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -4096 %load = load i8, i8* %gep, align 4 @@ -156,7 +144,6 @@ ; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_neg_13bit_max: @@ -167,7 +154,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -8192 %load = load i8, i8* %gep, align 4 @@ -179,7 +165,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_2x_11bit_max: @@ -190,7 +175,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 4095 %load = load i8, i8* %gep, align 4 @@ -204,7 +188,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_2x_12bit_max: @@ -215,7 +198,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8191 %load = load i8, i8* %gep, align 4 @@ -229,7 +211,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_2x_13bit_max: @@ -240,7 +221,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 16383 %load = load i8, i8* %gep, align 4 @@ -254,7 +234,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_11bit_max: @@ -265,7 +244,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -4096 %load = load i8, i8* %gep, align 4 @@ -279,7 +257,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_12bit_max: @@ -290,7 +267,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -8192 %load = load i8, i8* %gep, align 4 @@ -304,7 +280,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_2x_neg_13bit_max: @@ -315,7 +290,6 @@ ; GFX10-NEXT: ; 
implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -16384 %load = load i8, i8* %gep, align 4 @@ -330,7 +304,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split0: @@ -341,7 +314,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589936639 %load = load i8, i8* %gep, align 4 @@ -356,7 +328,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_split1: @@ -367,7 +338,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589936640 %load = load i8, i8* %gep, align 4 @@ -382,7 +352,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split0: @@ -393,7 +362,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, 
v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589938687 %load = load i8, i8* %gep, align 4 @@ -408,7 +376,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_split1: @@ -419,7 +386,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589938688 %load = load i8, i8* %gep, align 4 @@ -434,7 +400,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split0: @@ -445,7 +410,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 8589942783 %load = load i8, i8* %gep, align 4 @@ -460,7 +424,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_split1: @@ -471,7 +434,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 
8589942784 %load = load i8, i8* %gep, align 4 @@ -487,7 +449,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -498,7 +459,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854773761 %load = load i8, i8* %gep, align 4 @@ -514,7 +474,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -525,7 +484,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854773760 %load = load i8, i8* %gep, align 4 @@ -541,7 +499,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -552,7 +509,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854771713 %load = load i8, i8* %gep, align 4 @@ -568,7 +524,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 
v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -579,7 +534,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854771712 %load = load i8, i8* %gep, align 4 @@ -595,7 +549,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -606,7 +559,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854767617 %load = load i8, i8* %gep, align 4 @@ -622,7 +574,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: flat_load_ubyte v0, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -633,7 +584,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: flat_load_ubyte v0, v[0:1] -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8* %p, i64 -9223372036854767616 %load = load i8, i8* %gep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- 
a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -10,7 +10,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_1: @@ -19,7 +18,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 1 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -31,7 +29,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_11bit_max: @@ -40,7 +37,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 2047 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -52,7 +48,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_12bit_max: @@ -63,7 +58,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -77,7 +71,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, 
v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_13bit_max: @@ -88,7 +81,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -100,7 +92,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_neg_11bit_max: @@ -109,7 +100,6 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -2048 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -121,7 +111,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_neg_12bit_max: @@ -132,7 +121,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -146,7 +134,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX10-LABEL: global_inst_valu_offset_neg_13bit_max: @@ -157,7 +144,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -169,7 +155,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_2x_11bit_max: @@ -180,7 +165,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 4095 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -194,7 +178,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_2x_12bit_max: @@ -205,7 +188,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8191 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -219,7 +201,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_2x_13bit_max: @@ -230,7 +211,6 @@ ; 
GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 16383 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -242,7 +222,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_2x_neg_11bit_max: @@ -253,7 +232,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -4096 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -267,7 +245,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_2x_neg_12bit_max: @@ -278,7 +255,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -8192 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -292,7 +268,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_2x_neg_13bit_max: @@ -303,7 +278,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, 
vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -16384 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -318,7 +292,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split0: @@ -329,7 +302,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936639 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -344,7 +316,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_split1: @@ -355,7 +326,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589936640 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -370,7 +340,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split0: @@ -381,7 +350,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], 
off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938687 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -396,7 +364,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_split1: @@ -407,7 +374,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589938688 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -422,7 +388,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split0: @@ -433,7 +398,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942783 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -448,7 +412,6 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_split1: @@ -459,7 +422,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 8589942784 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -475,7 +437,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split0: @@ -486,7 +447,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773761 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -502,7 +462,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_11bit_neg_high_split1: @@ -513,7 +472,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854773760 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -529,7 +487,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split0: @@ -540,7 +497,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX10-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771713 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -556,7 +512,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_12bit_neg_high_split1: @@ -567,7 +522,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854771712 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -583,7 +537,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split0: @@ -594,7 +547,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767617 %load = load i8, i8 addrspace(1)* %gep, align 4 @@ -610,7 +562,6 @@ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_inst_valu_offset_64bit_13bit_neg_high_split1: @@ -621,7 +572,6 @@ ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i8, i8 addrspace(1)* %p, i64 -9223372036854767616 %load = load i8, i8 addrspace(1)* %gep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll --- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -61,7 +61,7 @@ ; GCN: ; %UnifiedReturnBlock ; GCN-NEXT: s_or_b64 exec, exec -; GCN-NEXT: s_waitcnt +; GCN-NOT: s_waitcnt ; GCN: BB{{[0-9]+_[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc diff --git a/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir b/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir --- a/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir +++ b/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir @@ -171,7 +171,6 @@ # CHECK-LABEL: name: si_return # CHECK: bb.0: # CHECK-NEXT: S_STORE_DWORD -# CHECK-NEXT: S_WAITCNT # CHECK-NEXT: S_DCACHE_WB # CHECK-NEXT: SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -1668,7 +1668,6 @@ ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: v_or_b32_e32 v2, 4, v2 ; GCN-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: shl_or_k: @@ -1701,7 +1700,6 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v5, 2, v4 ; GCN-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; EG-LABEL: shl_or_k_two_uses: diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -20,7 +20,7 @@ ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GCN: buffer_store_dword [[K]], off, s[0:3], s32 
offset:20 -; GCN: s_waitcnt vmcnt(0) +; GCN-NOT: s_waitcnt vmcnt(0) ; GCN: s_setpc_b64 ; GCN: ; ScratchSize: 68 define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -694,7 +694,7 @@ ; ; GCN-LABEL: {{^}}arg_divergence: ; GCN: buffer_load_dword v0, v0, -; GCN-NEXT: s_waitcnt +; GCN-NOT: s_waitcnt ; GCN-NEXT: ; return to shader part epilog define amdgpu_cs float @arg_divergence(i32 inreg %unused, <3 x i32> %arg4) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -37,6 +37,8 @@ ; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -239,7 +239,6 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %local_val = alloca i32, align 128, addrspace(5) store volatile i32 %b, i32 addrspace(5)* %local_val, align 128 diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -11,7 +11,6 @@ ; GFX803-NEXT: flat_store_short v[0:1], 
v2 ; GFX906-NEXT: global_store_short v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16(i16 addrspace(1)* %out, i32 %arg) #0 { entry: @@ -31,7 +30,6 @@ ; GFX803-NEXT: flat_store_short v[0:1], v2 ; GFX906-NEXT: global_store_short v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2f16(half addrspace(1)* %out, i32 %arg) #0 { entry: @@ -51,7 +49,6 @@ ; GFX803-NEXT: flat_store_short v[0:1], v2 ; GFX906-NEXT: global_store_short v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_i32_shift(i16 addrspace(1)* %out, i32 %value) #0 { entry: @@ -70,7 +67,6 @@ ; GFX803-NEXT: flat_store_byte v[0:1], v2 ; GFX906-NEXT: global_store_byte v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8(i8 addrspace(1)* %out, i32 %arg) #0 { entry: @@ -90,7 +86,6 @@ ; GFX803-NEXT: flat_store_byte v[0:1], v2 ; GFX906-NEXT: global_store_byte v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_i8_shift(i8 addrspace(1)* %out, i32 %value) #0 { entry: @@ -112,7 +107,6 @@ ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: global_store_short v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_max_offset(i16 addrspace(1)* %out, i32 %arg) #0 { entry: @@ -136,7 +130,6 @@ ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: global_store_short v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_min_offset(i16 addrspace(1)* %out, i32 %arg) #0 { entry: @@ -159,7 +152,6 @@ ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: global_store_byte v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8_max_offset(i8 addrspace(1)* %out, i32 %arg) #0 { entry: @@ -183,7 +175,6 @@ ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: 
global_store_byte v[0:1], v2, off -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_global_hi_v2i16_i8_min_offset(i8 addrspace(1)* %out, i32 %arg) #0 { entry: @@ -203,7 +194,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16(i16* %out, i32 %arg) #0 { entry: @@ -221,7 +211,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2f16(half* %out, i32 %arg) #0 { entry: @@ -239,7 +228,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_short v[0:1], v2 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_i32_shift(i16* %out, i32 %value) #0 { entry: @@ -257,7 +245,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16_i8(i8* %out, i32 %arg) #0 { entry: @@ -276,7 +263,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NO-D16-HI-NEXT: flat_store_byte v[0:1], v2 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_i8_shift(i8* %out, i32 %value) #0 { entry: @@ -298,7 +284,6 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_short v[0:1], v2{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 { entry: @@ -322,7 +307,6 @@ ; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}} ; GFX803: flat_store_short v[0:1], v2{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16_neg_offset(i16* %out, i32 %arg) #0 { entry: @@ -345,7 +329,6 @@ ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX906-NEXT: flat_store_byte v[0:1], v2 offset:4095{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void 
@store_flat_hi_v2i16_i8_max_offset(i8* %out, i32 %arg) #0 { entry: @@ -374,7 +357,6 @@ ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803: flat_store_byte v[0:1], v2{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_flat_hi_v2i16_i8_neg_offset(i8* %out, i32 %arg) #0 { entry: @@ -394,7 +376,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_v2i16(i16 addrspace(5)* %out, i32 %arg) #0 { entry: @@ -413,7 +394,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_v2f16(half addrspace(5)* %out, i32 %arg) #0 { entry: @@ -432,7 +412,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_i32_shift(i16 addrspace(5)* %out, i32 %value) #0 { entry: @@ -450,7 +429,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_v2i16_i8(i8 addrspace(5)* %out, i32 %arg) #0 { entry: @@ -469,7 +447,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_i8_shift(i8 addrspace(5)* %out, i32 %value) #0 { entry: @@ -486,7 +463,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_v2i16_max_offset(i16 addrspace(5)* byval %out, i32 %arg) #0 { entry: @@ -507,7 +483,6 @@ ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, 
s[0:3], 0{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_v2i16_nooff(i32 %arg) #0 { entry: @@ -527,7 +502,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_private_hi_v2i16_i8_nooff(i32 %arg) #0 { entry: @@ -546,7 +520,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_local_hi_v2i16(i16 addrspace(3)* %out, i32 %arg) #0 { entry: @@ -565,7 +538,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_local_hi_v2f16(half addrspace(3)* %out, i32 %arg) #0 { entry: @@ -584,7 +556,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_local_hi_i32_shift(i16 addrspace(3)* %out, i32 %value) #0 { entry: @@ -602,7 +573,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b8 v0, v1 -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_local_hi_v2i16_i8(i8 addrspace(3)* %out, i32 %arg) #0 { entry: @@ -620,7 +590,6 @@ ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: ds_write_b16 v0, v1 offset:65534{{$}} -; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @store_local_hi_v2i16_max_offset(i16 addrspace(3)* %out, i32 %arg) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -12,7 +12,6 @@ ; CIVI-NEXT: ds_write_b32 v0, v1 ; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: @@ -21,7 +20,6 @@ ; GFX9-NEXT: 
ds_write_b8_d16_hi v0, v2 offset:6 ; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 ; GFX9-NEXT: ds_write_b32 v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] store i56 %arg, i56 addrspace(3)* %ptr, align 8 ret void @@ -201,7 +199,6 @@ ; CIVI-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; CIVI-NEXT: s_mov_b32 m0, -1 ; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i13: @@ -209,7 +206,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] store i13 %arg, i13 addrspace(3)* %ptr, align 8 ret void @@ -223,7 +219,6 @@ ; CIVI-NEXT: ds_write_b16 v0, v1 ; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 ; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: @@ -232,7 +227,6 @@ ; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -59,7 +59,6 @@ ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; CHECK-NEXT: ; return to shader part epilog bb: %tmp = load volatile i32, i32 addrspace(1)* undef, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ 
b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -6,7 +6,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -22,7 +21,6 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -35,7 +33,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -62,7 +59,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -78,7 +74,6 @@ ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -94,7 +89,6 @@ ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -161,7 
+155,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -175,7 +168,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -189,7 +181,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -231,7 +222,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -245,7 +235,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -261,7 +250,6 @@ ; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = 
load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -277,7 +265,6 @@ ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -304,7 +291,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -320,7 +306,6 @@ ; GFX9-NEXT: global_load_dword v1, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -336,7 +321,6 @@ ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -382,7 +366,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -400,7 +383,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x 
half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -478,7 +460,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -492,7 +473,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -622,7 +602,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -636,7 +615,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -652,7 +630,6 @@ ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 @@ -670,7 +647,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 
16, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 %val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -111,6 +111,7 @@ ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[4:7] dmask:0x1 ; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload @@ -146,6 +147,8 @@ ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10: buffer_load_dword v44, off, s[0:3], s33 diff --git a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll --- a/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/visit-physreg-vgpr-imm-folding-bug.ll @@ -7,6 +7,7 @@ ; CHECK: v_mov_b32_e32 v0, 0{{$}} ; CHECK: v_mov_b32_e32 v1, 2.0{{$}} ; CHECK: s_swappc_b64 +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 ; CHECK: s_swappc_b64 define amdgpu_kernel void @vgpr_multi_use_imm_fold() { diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -218,8 +218,8 @@ ; GCN-LABEL: store_vscnt_private: ; GCN: 
buffer_store_dword -; GFX8_9: s_waitcnt vmcnt(0) -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX8_9-NOT: s_waitcnt vmcnt(0) +; GFX10-NOT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @store_vscnt_private(i32 addrspace(5)* %p) { store i32 0, i32 addrspace(5)* %p @@ -229,8 +229,8 @@ ; GCN-LABEL: store_vscnt_global: ; GFX8: flat_store_dword ; GFX9_10: global_store_dword -; GFX8_9: s_waitcnt vmcnt(0) -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX8_9-NOT: s_waitcnt vmcnt(0) +; GFX10-NOT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @store_vscnt_global(i32 addrspace(1)* %p) { store i32 0, i32 addrspace(1)* %p @@ -239,9 +239,9 @@ ; GCN-LABEL: store_vscnt_flat: ; GCN: flat_store_dword -; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0 +; GFX8_9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10-NOT: s_waitcnt lgkmcnt(0){{$}} +; GFX10-NOT: s_waitcnt_vscnt null, 0x0 ; GCN-NEXT: s_setpc_b64 define void @store_vscnt_flat(i32* %p) { store i32 0, i32* %p diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1085,7 +1085,6 @@ ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_with_call() #1 { call void @external_void_func_void()