diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1352,6 +1352,14 @@ // \returns the number of address arguments from which to enable MIMG NSA // on supported architectures. unsigned getNSAThreshold(const MachineFunction &MF) const; + + // \returns true if the subtarget has a hazard requiring an "s_nop 0" + // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". + bool requiresNopBeforeDeallocVGPRs() const { + // Currently all targets that support the dealloc VGPRs message also require + // the nop. + return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1967,6 +1967,10 @@ // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM // instructions. for (MachineInstr *MI : ReleaseVGPRInsts) { + if (ST->requiresNopBeforeDeallocVGPRs()) { + BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + } BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG)) .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); Modified = true; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -90,6 +90,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 @@ -174,6 +175,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 @@ -381,6 +383,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4 @@ -462,6 +465,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 @@ -691,6 +695,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -2035,6 +2040,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -2128,6 +2134,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8 @@ -2217,6 +2224,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 @@ -2439,6 +2447,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8 @@ -2525,6 +2534,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 @@ -2769,6 +2779,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -2950,6 +2961,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -90,6 +90,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 @@ -174,6 +175,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 @@ -381,6 +383,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4 @@ -462,6 +465,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 @@ -691,6 +695,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -862,6 +867,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] ; GFX11-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -955,6 +961,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8 @@ -1044,6 +1051,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 @@ -1266,6 +1274,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8 @@ -1352,6 +1361,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 @@ -1596,6 +1606,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -2335,6 +2346,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v3, v0, s[2:3] ; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -3063,6 +3075,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -480,6 +480,7 @@ ; GFX11-NEXT: s_movrels_b64 s[0:1], s[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -682,6 +683,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s14, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, s15, vcc_lo ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -799,6 +801,7 @@ ; GFX11-NEXT: v_movrels_b32_e32 v16, v0 ; GFX11-NEXT: v_movrels_b32_e32 v17, v1 ; GFX11-NEXT: global_store_b64 v[0:1], v[16:17], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -905,6 +908,7 @@ ; GFX11-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1857,6 +1861,7 @@ ; GFX11-NEXT: s_movrels_b64 s[0:1], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -3372,6 +3377,7 @@ ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -4361,6 +4367,7 @@ ; GFX11-NEXT: s_cselect_b32 s2, 4.0, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -4718,6 +4725,7 @@ ; GFX11-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3] ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -108,6 +108,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -230,6 +231,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -375,6 +377,7 @@ ; GFX11-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3 ; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -497,6 +500,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -624,6 +628,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -762,6 +767,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -901,6 +907,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1066,6 +1073,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -89,6 +89,7 @@ ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr @@ -179,6 +180,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1 ) %ptr @@ -265,6 +267,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr @@ -358,6 +361,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr @@ -448,6 +452,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %ptr @@ -541,6 +546,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1) %ptr @@ -630,6 +636,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1) %ptr @@ -720,6 +727,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(1) %ptr @@ -898,6 +906,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1 ) %ptr @@ -1029,6 +1038,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(4) %ptr @@ -1169,6 +1179,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(4) %ptr @@ -1306,6 +1317,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(4) %ptr @@ -1430,6 +1442,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1) %ptr @@ -1549,6 +1562,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1) %ptr @@ -1671,6 +1685,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i16>, ptr addrspace(1) %ptr @@ -1854,6 +1869,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr @@ -2009,6 +2025,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1 ) %ptr @@ -2191,6 +2208,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr @@ -2378,6 +2396,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr @@ -2565,6 +2584,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(4) %ptr @@ -2721,6 +2741,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1) %ptr @@ -2873,6 +2894,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1) %ptr @@ -3026,6 +3048,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i16>, ptr addrspace(1) %ptr @@ -3209,6 +3232,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr @@ -3363,6 +3387,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: global_store_b128 v[10:11], v[6:9], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1 ) %ptr @@ -3544,6 +3569,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr @@ -3838,6 +3864,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr @@ -4132,6 +4159,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: global_store_b128 v[10:11], v[4:7], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(4) %ptr @@ -4378,6 +4406,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[11:12], v[0:3], off ; GFX11-NEXT: global_store_b128 v[13:14], v[4:7], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr @@ -4529,6 +4558,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[0:1], v[3:6], off ; GFX11-NEXT: global_store_b128 v[11:12], v[7:10], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr @@ -4772,6 +4802,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[12:13], v[0:3], off ; GFX11-NEXT: global_store_b128 v[14:15], v[4:7], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i16>, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -105,6 +105,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr @@ -207,6 +208,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1 ) %ptr @@ -311,6 +313,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr @@ -419,6 +422,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr @@ -523,6 +527,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(4) %ptr @@ -625,6 +630,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1) %ptr @@ -724,6 +730,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1) %ptr @@ -823,6 +830,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i8>, ptr addrspace(1) %ptr @@ -972,6 +980,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1 ) %ptr @@ -1059,6 +1068,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(4) %ptr @@ -1153,6 +1163,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(4) %ptr @@ -1244,6 +1255,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(4) %ptr @@ -1339,6 +1351,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1) %ptr @@ -1430,6 +1443,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1) %ptr @@ -1522,6 +1536,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <4 x i8>, ptr addrspace(1) %ptr @@ -1656,6 +1671,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr @@ -1779,6 +1795,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1 ) %ptr @@ -1911,6 +1928,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr @@ -2052,6 +2070,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr @@ -2190,6 +2209,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(4) %ptr @@ -2316,6 +2336,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1) %ptr @@ -2437,6 +2458,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1) %ptr @@ -2561,6 +2583,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <8 x i8>, ptr addrspace(1) %ptr @@ -2744,6 +2767,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_mov_b32_e32 v3, s3 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr @@ -2899,6 +2923,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1 ) %ptr @@ -3081,6 +3106,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr @@ -3268,6 +3294,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr @@ -3455,6 +3482,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(4) %ptr @@ -3611,6 +3639,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1) %ptr @@ -3763,6 +3792,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1) %ptr @@ -3916,6 +3946,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <16 x i8>, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -149,6 +149,7 @@ ; GFX11-NEXT: global_store_b128 v64, v[56:59], s[2:3] offset:224 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v64, v[60:63], s[2:3] offset:240 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -1078,6 +1078,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[13:16], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1227,6 +1228,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1288,6 +1290,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1494,6 +1497,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[15:18], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1618,6 +1622,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1679,6 +1684,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1800,6 +1806,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -2407,6 +2414,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -2533,6 +2541,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b128 v[0:1], v[12:15], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -379,6 +379,7 @@ ; GFX11_W32-NEXT: v_div_fmas_f32 v0, s5, v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -398,6 +399,7 @@ ; GFX11_W64-NEXT: v_div_fmas_f32 v0, s5, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) @@ -488,6 +490,7 @@ ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX11_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -505,6 +508,7 @@ ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX11_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) @@ -595,6 +599,7 @@ ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -612,6 +617,7 @@ ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) @@ -702,6 +708,7 @@ ; GFX11_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2 ; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -719,6 +726,7 @@ ; GFX11_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) @@ -814,6 +822,7 @@ ; GFX11_W32-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] ; GFX11_W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX11_W32-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -832,6 +841,7 @@ ; GFX11_W64-NEXT: v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3] ; GFX11_W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX11_W64-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) @@ -926,6 +936,7 @@ ; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -944,6 +955,7 @@ ; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 @@ -1032,6 +1044,7 @@ ; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1049,6 +1062,7 @@ ; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) @@ -1136,6 +1150,7 @@ ; GFX11_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1153,6 +1168,7 @@ ; GFX11_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) @@ -1291,6 +1307,7 @@ ; GFX11_W32-NEXT: v_div_fmas_f32 v0, v2, v3, v1 ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[4:5] offset:8 +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1315,6 +1332,7 @@ ; GFX11_W64-NEXT: v_div_fmas_f32 v0, v2, v3, v1 ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[4:5] offset:8 +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1493,6 +1511,7 @@ ; GFX11_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W32-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 +; GFX11_W32-NEXT: s_nop 0 ; GFX11_W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W32-NEXT: s_endpgm ; @@ -1523,6 +1542,7 @@ ; GFX11_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11_W64-NEXT: global_store_b32 v1, v0, s[0:1] offset:8 +; GFX11_W64-NEXT: s_nop 0 ; GFX11_W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11_W64-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -72,6 +72,7 @@ ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -154,6 +155,7 @@ ; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -242,6 +244,7 @@ ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -330,6 +333,7 @@ ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -405,6 +409,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -478,6 +483,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -551,6 +557,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -624,6 +631,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -698,6 +706,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -772,6 +781,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -846,6 +856,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -920,6 +931,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -982,6 +994,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) @@ -1039,6 +1052,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) @@ -1098,6 +1112,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) @@ -1157,6 +1172,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) @@ -1221,6 +1237,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1289,6 +1306,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1373,6 +1391,7 @@ ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1462,6 +1481,7 @@ ; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1518,6 +1538,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) @@ -1565,6 +1586,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) @@ -1610,6 +1632,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) @@ -1663,6 +1686,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -40,6 +40,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -123,6 +123,7 @@ ; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i32.ll @@ -33,6 +33,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -55,6 +55,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.f32.i32(float %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -112,6 +113,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v2f32.i32(<2 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -169,6 +171,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v3f32.i32(<3 x float> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -226,6 +229,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -283,6 +287,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -340,6 +345,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 2, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -397,6 +403,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 4, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -454,6 +461,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 8, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -511,6 +519,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -568,6 +577,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %in, i32 6, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -632,6 +642,7 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: image_store v0, v[1:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm tail call void @llvm.amdgcn.image.store.2d.f32.i32(float %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -64,6 +64,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -131,6 +132,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.private(ptr %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -64,6 +64,7 @@ ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -131,6 +132,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB1_2: ; %bb1 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -40,6 +40,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01] ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 true) #0 @@ -81,6 +82,7 @@ ; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11] ; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] %tmp0 = call i64 @llvm.amdgcn.mov.dpp.i64(i64 %in1, i32 1, i32 1, i32 1, i1 false) #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -36,6 +36,7 @@ ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) @@ -86,6 +87,7 @@ ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll @@ -17,6 +17,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -34,6 +35,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -51,6 +53,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -66,6 +69,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -83,6 +87,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -98,6 +103,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -115,6 +121,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -130,6 +137,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -145,6 +153,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -160,6 +169,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -175,6 +185,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -190,6 +201,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -205,6 +217,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -220,6 +233,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -237,6 +251,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -252,6 +267,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -267,6 +283,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -282,6 +299,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -298,6 +316,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -313,6 +332,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -328,6 +348,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -343,6 +364,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll @@ -15,6 +15,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -30,6 +31,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -45,6 +47,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -58,6 +61,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -73,6 +77,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -86,6 +91,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -101,6 +107,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -115,6 +122,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -128,6 +136,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -141,6 +150,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -154,6 +164,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -167,6 +178,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -180,6 +192,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -193,6 +206,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -208,6 +222,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -221,6 +236,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -234,6 +250,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -247,6 +264,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -260,6 +278,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -273,6 +292,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -286,6 +306,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -299,6 +320,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-unaligned.ll @@ -326,6 +326,7 @@ ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <8 x i32>, ptr addrspace(4) %ptr, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -36,6 +36,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v5, v5, v0, v1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -85,6 +86,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -134,6 +136,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -179,6 +182,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -229,6 +233,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -276,6 +281,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -324,6 +330,7 @@ ; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -356,6 +363,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -415,6 +423,7 @@ ; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -447,6 +456,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -535,6 +545,7 @@ ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -305,6 +305,7 @@ ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX11-NEXT: global_store_b32 v[2:3], v1, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -405,6 +406,7 @@ ; GFX11-NEXT: v_mul_i32_i24_e32 v1, -7, v1 ; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -76,6 +76,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -151,6 +152,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 @@ -211,6 +213,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 @@ -264,6 +267,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <2 x i16> %a, %b @@ -326,6 +330,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -392,6 +397,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -456,6 +462,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -519,6 +526,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, v0, 32 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -584,6 +592,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_u16 v0, 0x3f80, v0 op_sel:[1,0] op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -672,6 +681,7 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -767,6 +777,7 @@ ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: global_store_b128 v1, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -860,6 +871,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -963,6 +975,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -181,6 +181,7 @@ ; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -211,6 +212,7 @@ ; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -395,6 +397,7 @@ ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -426,6 +429,7 @@ ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -661,6 +665,7 @@ ; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -702,6 +707,7 @@ ; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -953,6 +959,7 @@ ; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -997,6 +1004,7 @@ ; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1067,6 +1075,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1249,6 +1258,7 @@ ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1280,6 +1290,7 @@ ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1467,6 +1478,7 @@ ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1499,6 +1511,7 @@ ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1734,6 +1747,7 @@ ; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1776,6 +1790,7 @@ ; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1846,6 +1861,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -184,6 +184,7 @@ ; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -221,6 +222,7 @@ ; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -447,6 +449,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1] ; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -486,6 +489,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3] ; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -763,6 +767,7 @@ ; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -813,6 +818,7 @@ ; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1013,6 +1019,7 @@ ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1051,6 +1058,7 @@ ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1321,6 +1329,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1368,6 +1377,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1459,6 +1469,7 @@ ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1681,6 +1692,7 @@ ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1719,6 +1731,7 @@ ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1948,6 +1961,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1988,6 +2002,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2265,6 +2280,7 @@ ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -2315,6 +2331,7 @@ ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2564,6 +2581,7 @@ ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -2605,6 +2623,7 @@ ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2886,6 +2905,7 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -2935,6 +2955,7 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -3026,6 +3047,7 @@ ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -195,6 +195,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -227,6 +228,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -431,6 +433,7 @@ ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -465,6 +468,7 @@ ; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -714,6 +718,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -759,6 +764,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1180,6 +1186,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1214,6 +1221,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1460,6 +1468,7 @@ ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v3 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1501,6 +1510,7 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v3 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -1576,6 +1586,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1773,6 +1784,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -1806,6 +1818,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2013,6 +2026,7 @@ ; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -2048,6 +2062,7 @@ ; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2297,6 +2312,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -2342,6 +2358,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -2772,6 +2789,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -2809,6 +2827,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -3066,6 +3085,7 @@ ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -3109,6 +3129,7 @@ ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -3184,6 +3205,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -3435,6 +3457,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -3480,6 +3503,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -3730,6 +3754,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -3775,6 +3800,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -4025,6 +4051,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -4070,6 +4097,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -4320,6 +4348,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -4365,6 +4394,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -4583,6 +4613,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -4617,6 +4648,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -4866,6 +4898,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -4911,6 +4944,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -5129,6 +5163,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -5163,6 +5198,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -5412,6 +5448,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -5457,6 +5494,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -5670,6 +5708,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -5704,6 +5743,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -5953,6 +5993,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -5998,6 +6039,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -6211,6 +6253,7 @@ ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -6245,6 +6288,7 @@ ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -188,6 +188,7 @@ ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: .LBB0_6: ; %UnifiedReturnBlock +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -227,6 +228,7 @@ ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: .LBB0_6: ; %UnifiedReturnBlock +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: @@ -556,6 +558,7 @@ ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1164-NEXT: .LBB1_6: ; %UnifiedReturnBlock +; GFX1164-NEXT: s_nop 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; @@ -619,6 +622,7 @@ ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0 ; GFX1132-NEXT: .LBB1_6: ; %UnifiedReturnBlock +; GFX1132-NEXT: s_nop 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -180,6 +180,7 @@ ; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -210,6 +211,7 @@ ; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -394,6 +396,7 @@ ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -425,6 +428,7 @@ ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -660,6 +664,7 @@ ; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -701,6 +706,7 @@ ; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -771,6 +777,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -953,6 +960,7 @@ ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -984,6 +992,7 @@ ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1171,6 +1180,7 @@ ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1203,6 +1213,7 @@ ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1438,6 +1449,7 @@ ; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1480,6 +1492,7 @@ ; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1550,6 +1563,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -186,6 +186,7 @@ ; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -217,6 +218,7 @@ ; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -407,6 +409,7 @@ ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -439,6 +442,7 @@ ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -680,6 +684,7 @@ ; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -722,6 +727,7 @@ ; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -792,6 +798,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -878,6 +885,7 @@ ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -894,6 +902,7 @@ ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1082,6 +1091,7 @@ ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1114,6 +1124,7 @@ ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1307,6 +1318,7 @@ ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1340,6 +1352,7 @@ ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1581,6 +1594,7 @@ ; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1624,6 +1638,7 @@ ; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1694,6 +1709,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1780,6 +1796,7 @@ ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W64-NEXT: s_nop 0 ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1796,6 +1813,7 @@ ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11W32-NEXT: s_nop 0 ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -69,6 +69,7 @@ ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -86,6 +87,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 @@ -160,6 +162,7 @@ ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -172,6 +175,7 @@ ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr @@ -229,6 +233,7 @@ ; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -243,6 +248,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 @@ -314,6 +320,7 @@ ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -326,6 +333,7 @@ ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -393,6 +401,7 @@ ; GFX11-FLAT-NEXT: s_mov_b32 s4, s0 ; GFX11-FLAT-NEXT: s_mov_b32 s5, s1 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -406,6 +415,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 @@ -481,6 +491,7 @@ ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -495,6 +506,7 @@ ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -555,6 +567,7 @@ ; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 ; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -567,6 +580,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 @@ -642,6 +656,7 @@ ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-FLAT-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -656,6 +671,7 @@ ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v2, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -728,6 +744,7 @@ ; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 ; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -743,6 +760,7 @@ ; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-GISEL-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 @@ -826,6 +844,7 @@ ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-FLAT-NEXT: buffer_store_b128 v[1:4], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; @@ -842,6 +861,7 @@ ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v7, v2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b128 v0, v[4:7], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -82,10 +82,12 @@ ; GFX11-NEXT: s_cbranch_vccnz .LBB0_2 ; GFX11-NEXT: ; %bb.1: ; %one ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB0_2: ; %two ; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -171,6 +173,7 @@ ; GFX11-NEXT: s_mov_b32 s2, s6 ; GFX11-NEXT: s_mov_b32 s3, s7 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -256,6 +259,7 @@ ; GFX11-NEXT: s_mov_b32 s2, s6 ; GFX11-NEXT: s_mov_b32 s3, s7 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -57,6 +57,7 @@ ; GFX11-NEXT: v_perm_b32 v0, 0, s2, 0x10203 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -111,6 +112,7 @@ ; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203 ; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) %in, align 8 @@ -175,6 +177,7 @@ ; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203 ; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) %in, align 16 @@ -263,6 +266,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x i32>, ptr addrspace(1) %in, align 32 @@ -317,6 +321,7 @@ ; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203 ; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %in, align 8 @@ -381,6 +386,7 @@ ; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203 ; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) %in, align 16 @@ -469,6 +475,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x i64>, ptr addrspace(1) %in, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -4679,6 +4679,7 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -5102,6 +5103,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -32,6 +32,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -166,6 +167,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call float @coldcc(float 1.0) @@ -229,6 +231,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = call float @fastcc(float 1.0) @@ -430,6 +433,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <2 x i16> %arg0, @@ -466,6 +470,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_pk_sub_u16 v0, s0, -1 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <2 x i16> %arg0, @@ -589,6 +594,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <3 x i32> %arg0, @@ -622,6 +628,7 @@ ; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0 ; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <3 x float> %arg0, @@ -678,6 +685,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <5 x i32> %arg0, @@ -720,6 +728,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <5 x float> %arg0, @@ -753,6 +762,7 @@ ; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <3 x i32> %arg0, @@ -785,6 +795,7 @@ ; GFX11-NEXT: v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1 ; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX11-NEXT: global_store_b96 v[0:1], v[0:2], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <3 x float> %arg0, @@ -827,6 +838,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add <5 x i32> %arg0, @@ -867,6 +879,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <5 x float> %arg0, @@ -893,6 +906,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add i16 %arg0, %arg0 @@ -925,6 +939,7 @@ ; GFX11-NEXT: s_add_i32 s0, s0, s0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = add i16 %arg0, %arg0 diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -111,6 +111,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -210,6 +211,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -300,6 +302,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -389,6 +392,7 @@ ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -485,6 +489,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) @@ -606,6 +611,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) @@ -742,6 +748,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) @@ -874,6 +881,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -988,6 +996,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1087,6 +1096,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1177,6 +1187,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1266,6 +1277,7 @@ ; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1363,6 +1375,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) @@ -1484,6 +1497,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) @@ -1620,6 +1634,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) @@ -1752,6 +1767,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2911,6 +2927,7 @@ ; GFX11-NEXT: .LBB16_3: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -565,6 +565,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, off offset:6 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -58,6 +58,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -139,6 +140,7 @@ ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -207,6 +209,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -280,6 +283,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -353,6 +357,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e32 v1, 0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -419,6 +424,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -488,6 +494,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -557,6 +564,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -628,6 +636,7 @@ ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp ; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -695,6 +704,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -772,6 +782,7 @@ ; GFX11-NEXT: v_add_f32_e64 v2, v2, v1 clamp ; GFX11-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -853,6 +864,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -931,6 +943,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1022,6 +1035,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1106,6 +1120,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1192,6 +1207,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1276,6 +1292,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1358,6 +1375,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1443,6 +1461,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1529,6 +1548,7 @@ ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -58,6 +58,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -125,6 +126,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -193,6 +195,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -271,6 +274,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -349,6 +353,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -434,6 +439,7 @@ ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -503,6 +509,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -571,6 +578,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -640,6 +648,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -710,6 +719,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -777,6 +787,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -845,6 +856,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -918,6 +930,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -983,6 +996,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1048,6 +1062,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1113,6 +1128,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1178,6 +1194,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1243,6 +1260,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1308,6 +1326,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1359,6 +1378,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1407,6 +1427,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1456,6 +1477,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1505,6 +1527,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1553,6 +1576,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1601,6 +1625,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1673,6 +1698,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1741,6 +1767,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1814,6 +1841,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1886,6 +1914,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1954,6 +1983,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2019,6 +2049,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2084,6 +2115,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2149,6 +2181,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2214,6 +2247,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2279,6 +2313,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2330,6 +2365,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2379,6 +2415,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2450,6 +2487,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2536,6 +2574,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2621,6 +2660,7 @@ ; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0 ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2705,6 +2745,7 @@ ; GFX11-NEXT: v_pk_max_f16 v1, v1, 0 ; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2781,6 +2822,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2861,6 +2903,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2940,6 +2983,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3018,6 +3062,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3096,6 +3141,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3182,6 +3228,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3268,6 +3315,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3348,6 +3396,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm { diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -318,6 +318,7 @@ ; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v5, v5, v9 ; GFX11-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v3, v3, v7 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -373,6 +374,7 @@ ; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 ; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -462,6 +464,7 @@ ; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 ; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -96,6 +96,7 @@ ; GFX11-NEXT: s_min_u32 s2, s2, 32 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone @@ -198,6 +199,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -317,6 +319,7 @@ ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -463,6 +466,7 @@ ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -586,6 +590,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u16 v1, v1, -8 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr @@ -688,6 +693,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 ; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) @@ -780,6 +786,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) @@ -903,6 +910,7 @@ ; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1029,6 +1037,7 @@ ; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1135,6 +1144,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1241,6 +1251,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1363,6 +1374,7 @@ ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1484,6 +1496,7 @@ ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1597,6 +1610,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1724,6 +1738,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr @@ -1842,6 +1857,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 ; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -962,6 +962,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1047,6 +1048,7 @@ ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1137,6 +1139,7 @@ ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1232,6 +1235,7 @@ ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1368,6 +1372,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1538,6 +1543,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-NEXT: global_store_b32 v6, v4, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1730,6 +1736,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: global_store_b32 v4, v5, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -1916,6 +1923,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2037,6 +2045,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2122,6 +2131,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2203,6 +2213,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2282,6 +2293,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2415,6 +2427,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2495,6 +2508,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2575,6 +2589,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2656,6 +2671,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2737,6 +2753,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2831,6 +2848,7 @@ ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -2972,6 +2990,7 @@ ; GFX11-NEXT: global_store_b8 v[0:1], v3, off ; GFX11-NEXT: global_store_b8 v[0:1], v0, off ; GFX11-NEXT: global_store_b8 v[0:1], v1, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -49,6 +49,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x i16> undef, i16 0, i32 0 @@ -133,6 +134,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x i16> undef, i16 %a, i32 0 @@ -217,6 +219,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tmp = insertelement <2 x half> undef, half %a, i32 0 @@ -399,6 +402,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %shift = lshr i32 %b, 16 @@ -488,6 +492,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %shift_a = lshr i32 %a, 16 @@ -725,6 +730,7 @@ ; GFX11-NEXT: ds_load_u16_d16 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -118,6 +118,7 @@ ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -199,6 +200,7 @@ ; GFX11-NEXT: v_div_fmas_f32 v4, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -594,6 +596,7 @@ ; GFX11-NEXT: v_div_fmas_f32 v5, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v5, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -194,6 +194,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 10 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -47,6 +47,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr @@ -106,6 +107,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr @@ -174,6 +176,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -229,6 +232,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 0 @@ -284,6 +288,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 %idx @@ -338,6 +343,7 @@ ; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -411,6 +417,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2] ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -474,6 +481,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr @@ -534,6 +542,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <16 x half>, ptr addrspace(4) %ptr @@ -679,6 +688,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -931,6 +941,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -54,6 +54,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc= bitcast i16 %in to half @@ -108,6 +109,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in) @@ -161,6 +163,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) @@ -217,6 +220,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) @@ -277,6 +281,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in0) @@ -334,6 +339,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -391,6 +397,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> @@ -471,6 +478,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -558,6 +566,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -644,6 +653,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -723,6 +733,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -75,6 +75,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -143,6 +144,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -209,6 +211,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_f16_e32 v0, 2.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -297,6 +300,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -375,6 +379,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -450,6 +455,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, 0x3c004000, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -109,6 +109,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 @@ -244,6 +245,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 @@ -344,6 +346,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x20001 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep.r = getelementptr i16, ptr addrspace(1) %r, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -53,6 +53,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half undef) @@ -107,6 +108,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out @@ -159,6 +161,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = bitcast i16 %val.arg to half @@ -253,6 +256,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out @@ -309,6 +313,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out @@ -366,6 +371,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out @@ -422,6 +428,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out @@ -478,6 +485,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out @@ -523,6 +531,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0.0) @@ -566,6 +575,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -0.0) @@ -609,6 +619,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 1.0) @@ -652,6 +663,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half -1.0) @@ -695,6 +707,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 16.0) @@ -738,6 +751,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) @@ -781,6 +795,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) @@ -824,6 +839,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) @@ -867,6 +883,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) @@ -910,6 +927,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) @@ -953,6 +971,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) @@ -996,6 +1015,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) @@ -1039,6 +1059,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) @@ -1082,6 +1103,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) @@ -1125,6 +1147,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) @@ -1168,6 +1191,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) @@ -1240,6 +1264,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1318,6 +1343,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1398,6 +1424,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1476,6 +1503,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1541,6 +1569,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = bitcast i32 %val.arg to <2 x half> @@ -1584,6 +1613,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) @@ -1627,6 +1657,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1670,6 +1701,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1713,6 +1745,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1756,6 +1789,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1799,6 +1833,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1842,6 +1877,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1885,6 +1921,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1928,6 +1965,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -1971,6 +2009,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -2014,6 +2053,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) @@ -2057,6 +2097,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -2100,6 +2141,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -2143,6 +2185,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -2186,6 +2229,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -2229,6 +2273,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> ) @@ -2351,6 +2396,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef) @@ -2659,6 +2705,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef) diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -52,6 +52,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out @@ -102,6 +103,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float %val) @@ -142,6 +144,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out @@ -184,6 +187,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out @@ -227,6 +231,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %out @@ -261,6 +266,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float undef) @@ -293,6 +299,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 0.0) @@ -327,6 +334,7 @@ ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float -0.0) @@ -360,6 +368,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 1.0) @@ -393,6 +402,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float -1.0) @@ -426,6 +436,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 16.0) @@ -458,6 +469,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) @@ -494,6 +506,7 @@ ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) @@ -530,6 +543,7 @@ ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) @@ -566,6 +580,7 @@ ; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) @@ -599,6 +614,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) @@ -633,6 +649,7 @@ ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) @@ -666,6 +683,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x807fffff ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) @@ -699,6 +717,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) @@ -732,6 +751,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) @@ -765,6 +785,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) @@ -798,6 +819,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) @@ -831,6 +853,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) @@ -864,6 +887,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) @@ -897,6 +921,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) @@ -937,6 +962,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out @@ -982,6 +1008,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double %val) @@ -1022,6 +1049,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], |v[0:1]|, |v[0:1]| ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out @@ -1064,6 +1092,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out @@ -1107,6 +1136,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load double, ptr addrspace(1) %out @@ -1145,6 +1175,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0.0) @@ -1180,6 +1211,7 @@ ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double -0.0) @@ -1214,6 +1246,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 1.0) @@ -1248,6 +1281,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbff00000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double -1.0) @@ -1282,6 +1316,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40300000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 16.0) @@ -1318,6 +1353,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) @@ -1354,6 +1390,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) @@ -1389,6 +1426,7 @@ ; GFX11-NEXT: v_bfrev_b32_e32 v1, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) @@ -1425,6 +1463,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) @@ -1459,6 +1498,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) @@ -1493,6 +1533,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) @@ -1527,6 +1568,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) @@ -1561,6 +1603,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) @@ -1595,6 +1638,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) @@ -1629,6 +1673,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) @@ -1663,6 +1708,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7ff80000 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) @@ -1725,6 +1771,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1791,6 +1838,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1859,6 +1907,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1937,6 +1986,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2003,6 +2053,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2069,6 +2120,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2138,6 +2190,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2216,6 +2269,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2286,6 +2340,7 @@ ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -77,6 +77,7 @@ ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -166,6 +167,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -256,6 +258,7 @@ ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -344,6 +347,7 @@ ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -432,6 +436,7 @@ ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -520,6 +525,7 @@ ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -608,6 +614,7 @@ ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -696,6 +703,7 @@ ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -784,6 +792,7 @@ ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -872,6 +881,7 @@ ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -960,6 +970,7 @@ ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1048,6 +1059,7 @@ ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1136,6 +1148,7 @@ ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1224,6 +1237,7 @@ ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1312,6 +1326,7 @@ ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1415,6 +1430,7 @@ ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1519,6 +1535,7 @@ ; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1622,6 +1639,7 @@ ; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1725,6 +1743,7 @@ ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1829,6 +1848,7 @@ ; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1933,6 +1953,7 @@ ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2037,6 +2058,7 @@ ; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2141,6 +2163,7 @@ ; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2244,6 +2267,7 @@ ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2347,6 +2371,7 @@ ; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2451,6 +2476,7 @@ ; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2554,6 +2580,7 @@ ; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2657,6 +2684,7 @@ ; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -2760,6 +2788,7 @@ ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -70,6 +70,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call half @llvm.copysign.f16(half %mag, half %sign) @@ -123,6 +124,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 0.0) @@ -176,6 +178,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 1.0) @@ -229,6 +232,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half 10.0) @@ -282,6 +286,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half -1.0) @@ -335,6 +340,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half %mag, half -10.0) @@ -389,6 +395,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 0.0, half %sign) @@ -448,6 +455,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 1.0, half %sign) @@ -507,6 +515,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half 10.0, half %sign) @@ -565,6 +574,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half -1.0, half %sign) @@ -624,6 +634,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call half @llvm.copysign.f16(half -10.0, half %sign) @@ -895,6 +906,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -995,6 +1007,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1 ; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1093,6 +1106,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1193,6 +1207,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1293,6 +1308,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1390,6 +1406,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1492,6 +1509,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 ; GFX11-NEXT: global_store_b16 v2, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1757,6 +1775,7 @@ ; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 ; GFX11-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half @@ -1846,6 +1865,7 @@ ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) @@ -1957,6 +1977,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 ; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) @@ -2085,6 +2106,7 @@ ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -40,6 +40,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float %sign) @@ -82,6 +83,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 0.0) @@ -124,6 +126,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 1.0) @@ -166,6 +169,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float 10.0) @@ -208,6 +212,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float -1.0) @@ -250,6 +255,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float %mag, float -10.0) @@ -292,6 +298,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 0.0, float %sign) @@ -338,6 +345,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 1.0, float %sign) @@ -383,6 +391,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float 10.0, float %sign) @@ -428,6 +437,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float -1.0, float %sign) @@ -473,6 +483,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call float @llvm.copysign.f32(float -10.0, float %sign) @@ -527,6 +538,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) @@ -591,6 +603,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3 ; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign) @@ -661,6 +674,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) @@ -932,6 +946,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.trunc = fptrunc double %sign to float @@ -976,6 +991,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 1.0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.trunc = fptrunc double %sign to float @@ -1022,6 +1038,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.ext = fpext half %sign to float @@ -1071,6 +1088,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.ext = fpext half %sign to float @@ -1120,6 +1138,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.ext = fpext bfloat %sign to float diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -54,6 +54,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double %sign) @@ -99,6 +100,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double 0.0) @@ -144,6 +146,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double 1.0) @@ -189,6 +192,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double 10.0) @@ -234,6 +238,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double -1.0) @@ -279,6 +284,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double %mag, double -10.0) @@ -332,6 +338,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.ext = fpext float %sign to double @@ -386,6 +393,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sign.ext = fpext half %sign to double @@ -429,6 +437,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double 0.0, double %sign) @@ -474,6 +483,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double 1.0, double %sign) @@ -519,6 +529,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double 10.0, double %sign) @@ -564,6 +575,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double -1.0, double %sign) @@ -609,6 +621,7 @@ ; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call double @llvm.copysign.f64(double -10.0, double %sign) @@ -669,6 +682,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2 ; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) @@ -746,6 +760,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign) @@ -835,6 +850,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -125,6 +125,7 @@ ; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -223,6 +224,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -316,6 +318,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e64 v1, |v1| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -412,6 +415,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -492,6 +496,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -585,6 +590,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rcp_f16_e64 v1, -v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -679,6 +685,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rsq_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -779,6 +786,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_rcp_f16_e64 v1, -v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -888,6 +896,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -998,6 +1007,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1071,6 +1081,7 @@ ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = load half, ptr addrspace(1) undef @@ -1137,6 +1148,7 @@ ; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = load half, ptr addrspace(1) undef @@ -1203,6 +1215,7 @@ ; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = load half, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -157,6 +157,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -309,6 +310,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -393,6 +395,7 @@ ; GFX11-NEXT: v_mul_f32_e32 v1, s2, v1 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -546,6 +549,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -609,6 +613,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -672,6 +677,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -735,6 +741,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -899,6 +906,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -962,6 +970,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1215,6 +1224,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_div_fixup_f32 v1, v1, s7, s5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1325,6 +1335,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1402,6 +1413,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1479,6 +1491,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1912,6 +1925,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, s7, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2019,6 +2033,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2126,6 +2141,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_dual_mul_f32 v2, s2, v2 :: v_dual_mul_f32 v3, s3, v3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[8:9] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2299,6 +2315,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2444,6 +2461,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -52,6 +52,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -117,6 +118,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -175,6 +177,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -228,6 +231,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -293,6 +297,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -351,6 +356,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -416,6 +422,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -474,6 +481,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -541,6 +549,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -613,6 +622,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -711,6 +721,7 @@ ; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] ; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -733,6 +744,7 @@ ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -831,6 +843,7 @@ ; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] ; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] ; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -853,6 +866,7 @@ ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1] ; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 @@ -945,6 +959,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -961,6 +976,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1036,6 +1052,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1052,6 +1069,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1127,6 +1145,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1143,6 +1162,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1218,6 +1238,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1234,6 +1255,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1309,6 +1331,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1325,6 +1348,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1400,6 +1424,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1416,6 +1441,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1491,6 +1517,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1507,6 +1534,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1582,6 +1610,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1598,6 +1627,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1673,6 +1703,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1689,6 +1720,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1764,6 +1796,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1780,6 +1813,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1855,6 +1889,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1871,6 +1906,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -1946,6 +1982,7 @@ ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -1962,6 +1999,7 @@ ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -2058,6 +2096,7 @@ ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1 ; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -2076,6 +2115,7 @@ ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2 ; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -2140,6 +2180,7 @@ ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] ; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; @@ -2158,6 +2199,7 @@ ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm ptr addrspace(1) %in1, @@ -2203,6 +2245,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2248,6 +2291,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2302,6 +2346,7 @@ ; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1 ; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0 ; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -123,6 +123,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 @@ -254,6 +255,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 @@ -392,6 +394,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 @@ -531,6 +534,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -63,6 +63,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -136,6 +137,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -210,6 +212,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -284,6 +287,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -361,6 +365,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -451,6 +456,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -530,6 +536,7 @@ ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0 ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -599,6 +606,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -672,6 +680,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -772,6 +781,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, -v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -871,6 +881,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, -v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -970,6 +981,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, -v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1069,6 +1081,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1174,6 +1187,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3| ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1293,6 +1307,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1396,6 +1411,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1494,6 +1510,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1592,6 +1609,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1702,6 +1720,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1800,6 +1819,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1898,6 +1918,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1996,6 +2017,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2094,6 +2116,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2192,6 +2215,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2290,6 +2314,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2388,6 +2413,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2486,6 +2512,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2584,6 +2611,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2682,6 +2710,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2780,6 +2809,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2878,6 +2908,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2976,6 +3007,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3074,6 +3106,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3172,6 +3205,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3273,6 +3307,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3406,6 +3441,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3537,6 +3573,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3667,6 +3704,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3787,6 +3825,7 @@ ; GFX11-NEXT: v_maxmin_f32 v3, v1, v2, v3 ; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3897,6 +3936,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4012,6 +4052,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4127,6 +4168,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4240,6 +4282,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_minmax_f32 v1, -v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4343,6 +4386,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4421,6 +4465,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4535,6 +4580,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4621,6 +4667,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4711,6 +4758,7 @@ ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -4819,6 +4867,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -123,6 +123,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 @@ -254,6 +255,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 @@ -392,6 +394,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 @@ -531,6 +534,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s1 ; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 @@ -739,6 +743,7 @@ ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 @@ -885,6 +890,7 @@ ; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] ; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -76,6 +76,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s5 ; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -144,6 +145,7 @@ ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -210,6 +212,7 @@ ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -323,6 +326,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -419,6 +423,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -513,6 +518,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -643,6 +649,7 @@ ; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1 ; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -754,6 +761,7 @@ ; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1 ; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -59,6 +59,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %1 = call half @llvm.nearbyint.f16(half %in) @@ -98,6 +99,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f32_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -139,6 +141,7 @@ ; GFX11-NEXT: v_rndne_f32_e32 v1, s3 ; GFX11-NEXT: v_rndne_f32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -188,6 +191,7 @@ ; GFX11-NEXT: v_rndne_f32_e32 v1, s5 ; GFX11-NEXT: v_rndne_f32_e32 v0, s4 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -248,6 +252,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -323,6 +328,7 @@ ; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7] ; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5] ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -430,6 +436,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -57,6 +57,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) @@ -120,6 +121,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2| ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %x) @@ -178,6 +180,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half @@ -233,6 +236,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call half @llvm.fabs.f16(half %in) @@ -276,6 +280,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in, align 2 @@ -347,6 +352,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, 0x80008000, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <2 x half> %in, @@ -404,6 +410,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) @@ -448,6 +455,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) @@ -515,6 +523,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_pk_mul_f16 v1, s2, -4.0 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) @@ -587,6 +596,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) @@ -668,6 +678,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -1531,6 +1531,7 @@ ; GFX11-NEXT: v_cndmask_b32_e64 v1, s2, -v0, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v0, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = select i1 %z, double %x, double %y diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -51,6 +51,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub half -0.0, %in @@ -109,6 +110,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -166,6 +168,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i16 %in to half @@ -226,6 +229,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_f16_e64 v1, -v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in @@ -281,6 +285,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fneg = fsub <2 x half> , %in @@ -327,6 +332,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %in = call i32 asm sideeffect "; def $0", "=s"() @@ -385,6 +391,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -442,6 +449,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %bc = bitcast i32 %in to <2 x half> @@ -513,6 +521,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_mul_f16 v1, v1, v1 neg_lo:[1,0] neg_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in @@ -592,6 +601,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in @@ -649,6 +659,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -44,6 +44,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -91,6 +92,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -127,6 +129,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -174,6 +177,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 @@ -223,6 +227,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x.fabs = tail call float @llvm.fabs.f32(float %x) #3 @@ -269,6 +274,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 @@ -325,6 +331,7 @@ ; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 @@ -378,6 +385,7 @@ ; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 @@ -434,6 +442,7 @@ ; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp uno float %x, 0.000000e+00 @@ -483,6 +492,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 @@ -532,6 +542,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 @@ -592,6 +603,7 @@ ; GFX11-NEXT: s_and_b32 s2, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, %y @@ -642,6 +654,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fabs = tail call half @llvm.fabs.f16(half %x) #1 @@ -690,6 +703,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 @@ -740,6 +754,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -66,6 +66,7 @@ ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -119,6 +120,7 @@ ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -164,6 +166,7 @@ ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -204,6 +207,7 @@ ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -419,6 +423,7 @@ ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -472,6 +477,7 @@ ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -517,6 +523,7 @@ ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -557,6 +564,7 @@ ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -629,6 +637,7 @@ ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -693,6 +702,7 @@ ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -66,6 +66,7 @@ ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -119,6 +120,7 @@ ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -164,6 +166,7 @@ ; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -204,6 +207,7 @@ ; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -420,6 +424,7 @@ ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -473,6 +478,7 @@ ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -518,6 +524,7 @@ ; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -558,6 +565,7 @@ ; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen glc ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v[0:1], v0, off +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: @@ -630,6 +638,7 @@ ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) ; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; @@ -694,6 +703,7 @@ ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) ; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics.ll @@ -16,6 +16,7 @@ ; GFX11-LABEL: global_atomic_fmin_f32_noret: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_min_f32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data) @@ -31,6 +32,7 @@ ; GFX11-LABEL: global_atomic_fmax_f32_noret: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_max_f32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data) @@ -50,6 +52,7 @@ ; GFX11-NEXT: global_atomic_max_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[3:4], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmax.f32.p1.f32(ptr addrspace(1) %ptr, float %data) @@ -70,6 +73,7 @@ ; GFX11-NEXT: global_atomic_min_f32 v0, v[0:1], v2, off glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[3:4], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmin.f32.p1.f32(ptr addrspace(1) %ptr, float %data) diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -60,6 +60,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -62,6 +62,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -59,6 +59,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -57,6 +57,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -125,6 +126,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -195,6 +197,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -272,6 +275,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -327,6 +331,7 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -390,6 +395,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -455,6 +461,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0| ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -520,6 +527,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0| ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -600,6 +608,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -680,6 +689,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -759,6 +769,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -839,6 +850,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -918,6 +930,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -999,6 +1012,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -104,6 +104,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -117,6 +118,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -230,6 +232,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -245,6 +248,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -365,6 +369,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -380,6 +385,7 @@ ; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -513,6 +519,7 @@ ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -531,6 +538,7 @@ ; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -637,6 +645,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -650,6 +659,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2 ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -757,6 +767,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -770,6 +781,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -877,6 +889,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0| ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -890,6 +903,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2| ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1000,6 +1014,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -1015,6 +1030,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1125,6 +1141,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -1140,6 +1157,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1256,6 +1274,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -1271,6 +1290,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -76,6 +76,7 @@ ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -87,6 +88,7 @@ ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc double %in to float @@ -479,6 +481,7 @@ ; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-SAFE-SDAG-NEXT: s_nop 0 ; GFX11-SAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SAFE-SDAG-NEXT: s_endpgm ; @@ -537,6 +540,7 @@ ; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-SAFE-GISEL-NEXT: s_nop 0 ; GFX11-SAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SAFE-GISEL-NEXT: s_endpgm ; @@ -550,6 +554,7 @@ ; GFX11-UNSAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-SDAG-NEXT: s_nop 0 ; GFX11-UNSAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-UNSAFE-SDAG-NEXT: s_endpgm ; @@ -563,6 +568,7 @@ ; GFX11-UNSAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-GISEL-NEXT: s_nop 0 ; GFX11-UNSAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-UNSAFE-GISEL-NEXT: s_endpgm %result = fptrunc double %in to half @@ -645,6 +651,7 @@ ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -659,6 +666,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <2 x double> %in to <2 x float> @@ -751,6 +759,7 @@ ; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -766,6 +775,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] ; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <3 x double> %in to <3 x float> @@ -859,6 +869,7 @@ ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -875,6 +886,7 @@ ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9] ; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11] ; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <4 x double> %in to <4 x float> @@ -999,6 +1011,7 @@ ; GFX11-SDAG-NEXT: s_clause 0x1 ; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 ; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -1021,6 +1034,7 @@ ; GFX11-GISEL-NEXT: s_clause 0x1 ; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %result = fptrunc <8 x double> %in to <8 x float> diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -176,6 +176,7 @@ ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -321,6 +322,7 @@ ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -466,6 +468,7 @@ ; GFX11-NEXT: v_trunc_f16_e32 v3, v3 ; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { @@ -673,6 +676,7 @@ ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -810,6 +814,7 @@ ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -947,6 +952,7 @@ ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v1, -v3, v2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { @@ -1167,6 +1173,7 @@ ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX11-NEXT: global_store_b64 v12, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -1360,6 +1367,7 @@ ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -1553,6 +1561,7 @@ ; GFX11-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] ; GFX11-NEXT: global_store_b64 v10, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #1 { @@ -1820,6 +1829,7 @@ ; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -2240,6 +2250,7 @@ ; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0 ; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -2543,6 +2554,7 @@ ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 ; GFX11-NEXT: v_fma_f32 v0, -v3, v2, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -3038,6 +3050,7 @@ ; GFX11-NEXT: v_trunc_f32_e32 v5, v5 ; GFX11-NEXT: v_fma_f32 v0, -v5, v4, v0 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { @@ -3368,6 +3381,7 @@ ; GFX11-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %in2) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -101,6 +101,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -172,6 +173,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -303,6 +305,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3 ; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -389,6 +392,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 25 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -580,6 +584,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9 ; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -686,6 +691,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -778,6 +784,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %shl = shl i32 %a, 7 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -93,6 +93,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -164,6 +165,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -265,6 +267,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -351,6 +354,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -482,6 +486,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -586,6 +591,7 @@ ; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7 ; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -76,6 +76,7 @@ ; GFX11-NEXT: s_mov_b32 s9, s5 ; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -144,6 +145,7 @@ ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -210,6 +212,7 @@ ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -323,6 +326,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -419,6 +423,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -513,6 +518,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, 0xbc00c000, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -140,6 +140,7 @@ ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst @@ -251,6 +252,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst @@ -538,6 +540,7 @@ ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst @@ -680,6 +683,7 @@ ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") seq_cst @@ -977,6 +981,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load ptr, ptr addrspace(4) %arg diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -3408,6 +3408,7 @@ ; GFX11-LABEL: global_inc_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3425,6 +3426,7 @@ ; GFX11-LABEL: global_inc_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3482,6 +3484,7 @@ ; GFX11-LABEL: global_inc_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3499,6 +3502,7 @@ ; GFX11-LABEL: global_inc_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3561,6 +3565,7 @@ ; GFX11-LABEL: global_dec_saddr_i32_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3578,6 +3583,7 @@ ; GFX11-LABEL: global_dec_saddr_i32_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3635,6 +3641,7 @@ ; GFX11-LABEL: global_dec_saddr_i64_nortn: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -3652,6 +3659,7 @@ ; GFX11-LABEL: global_dec_saddr_i64_nortn_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -18,6 +18,7 @@ ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %voffset = load i32, ptr addrspace(1) %voffset.ptr @@ -41,6 +42,7 @@ ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] offset:2047 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %voffset = load i32, ptr addrspace(1) %voffset.ptr @@ -65,6 +67,7 @@ ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] offset:-2048 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %voffset = load i32, ptr addrspace(1) %voffset.ptr @@ -112,6 +115,7 @@ ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds @@ -152,6 +156,7 @@ ; GFX11-NEXT: v_readfirstlane_b32 s0, v2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v3 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] offset:-120 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sbase = load ptr addrspace(1), ptr addrspace(3) @ptr.in.lds @@ -175,6 +180,7 @@ ; GFX11-LABEL: global_store_saddr_i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -192,6 +198,7 @@ ; GFX11-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -210,6 +217,7 @@ ; GFX11-LABEL: global_store_saddr_f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -227,6 +235,7 @@ ; GFX11-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -245,6 +254,7 @@ ; GFX11-LABEL: global_store_saddr_i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -262,6 +272,7 @@ ; GFX11-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -280,6 +291,7 @@ ; GFX11-LABEL: global_store_saddr_f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -297,6 +309,7 @@ ; GFX11-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -315,6 +328,7 @@ ; GFX11-LABEL: global_store_saddr_p3_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -332,6 +346,7 @@ ; GFX11-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -350,6 +365,7 @@ ; GFX11-LABEL: global_store_saddr_i64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -367,6 +383,7 @@ ; GFX11-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -385,6 +402,7 @@ ; GFX11-LABEL: global_store_saddr_f64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -402,6 +420,7 @@ ; GFX11-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -420,6 +439,7 @@ ; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -437,6 +457,7 @@ ; GFX11-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -455,6 +476,7 @@ ; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -472,6 +494,7 @@ ; GFX11-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -490,6 +513,7 @@ ; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -507,6 +531,7 @@ ; GFX11-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -525,6 +550,7 @@ ; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -542,6 +568,7 @@ ; GFX11-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -560,6 +587,7 @@ ; GFX11-LABEL: global_store_saddr_p1_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -577,6 +605,7 @@ ; GFX11-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -595,6 +624,7 @@ ; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -612,6 +642,7 @@ ; GFX11-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -630,6 +661,7 @@ ; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -647,6 +679,7 @@ ; GFX11-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -665,6 +698,7 @@ ; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -682,6 +716,7 @@ ; GFX11-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -700,6 +735,7 @@ ; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -717,6 +753,7 @@ ; GFX11-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b96 v0, v[1:3], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -735,6 +772,7 @@ ; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -752,6 +790,7 @@ ; GFX11-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -770,6 +809,7 @@ ; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -787,6 +827,7 @@ ; GFX11-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -805,6 +846,7 @@ ; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -822,6 +864,7 @@ ; GFX11-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -840,6 +883,7 @@ ; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -857,6 +901,7 @@ ; GFX11-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -875,6 +920,7 @@ ; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -892,6 +938,7 @@ ; GFX11-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -910,6 +957,7 @@ ; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -927,6 +975,7 @@ ; GFX11-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -945,6 +994,7 @@ ; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -962,6 +1012,7 @@ ; GFX11-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -980,6 +1031,7 @@ ; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -997,6 +1049,7 @@ ; GFX11-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_b128 v0, v[1:4], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1029,6 +1082,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1056,6 +1110,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1084,6 +1139,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1111,6 +1167,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b64 v0, v[1:2], s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1133,6 +1190,7 @@ ; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1151,6 +1209,7 @@ ; GFX11-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1170,6 +1229,7 @@ ; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -1189,6 +1249,7 @@ ; GFX11-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: ; GFX11: ; %bb.0: ; GFX11-NEXT: global_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -36,6 +36,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store half %arg, ptr addrspace(1) %out @@ -73,6 +74,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <2 x half> %arg, ptr addrspace(1) %out @@ -105,6 +107,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <3 x half> %arg, ptr addrspace(1) %out @@ -132,6 +135,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <4 x half> %arg, ptr addrspace(1) %out @@ -177,6 +181,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm store <8 x half> %arg, ptr addrspace(1) %out @@ -221,6 +226,7 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fpext = fpext <2 x half> %in to <2 x float> @@ -260,6 +266,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext half %arg to float @@ -305,6 +312,7 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x float> @@ -349,6 +357,7 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> @@ -399,6 +408,7 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> @@ -485,6 +495,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x float> @@ -530,6 +541,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext half %arg to double @@ -584,6 +596,7 @@ ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <2 x half> %arg to <2 x double> @@ -651,6 +664,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x double> @@ -729,6 +743,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x double> @@ -863,6 +878,7 @@ ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext = fpext <8 x half> %arg to <8 x double> @@ -892,6 +908,7 @@ ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in @@ -921,6 +938,7 @@ ; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in @@ -950,6 +968,7 @@ ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in @@ -979,6 +998,7 @@ ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in @@ -1010,6 +1030,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in @@ -1062,6 +1083,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in @@ -1117,6 +1139,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v4 ; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) %in @@ -1177,6 +1200,7 @@ ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v5 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in @@ -1264,6 +1288,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v12, v[4:7], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in @@ -1420,6 +1445,7 @@ ; GFX11-NEXT: global_store_b128 v20, v[0:3], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v20, v[12:15], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v20, v[8:11], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in @@ -1455,6 +1481,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in @@ -1514,6 +1541,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x half>, ptr addrspace(1) %in @@ -1591,6 +1619,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) %in @@ -1677,6 +1706,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x half>, ptr addrspace(1) %in @@ -1810,6 +1840,7 @@ ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x half>, ptr addrspace(1) %in @@ -2059,6 +2090,7 @@ ; GFX11-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32 ; GFX11-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v32, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <16 x half>, ptr addrspace(1) %in @@ -2091,6 +2123,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in @@ -2145,6 +2178,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <2 x float>, ptr addrspace(1) %in @@ -2214,6 +2248,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4 ; GFX11-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <3 x float>, ptr addrspace(1) %in @@ -2278,6 +2313,7 @@ ; GFX11-NEXT: v_pack_b32_f16 v1, v2, v3 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v5 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <4 x float>, ptr addrspace(1) %in @@ -2377,6 +2413,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_pack_b32_f16 v0, v4, v5 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <8 x float>, ptr addrspace(1) %in @@ -2556,6 +2593,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16 ; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load <16 x float>, ptr addrspace(1) %in @@ -2606,6 +2644,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_add_f16_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd half %a, %b @@ -2659,6 +2698,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <2 x half> %a, %b @@ -2733,6 +2773,7 @@ ; GFX11-NEXT: v_pk_add_f16 v1, v1, v3 ; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1 @@ -2856,6 +2897,7 @@ ; GFX11-NEXT: v_pk_add_f16 v1, s5, s9 ; GFX11-NEXT: v_pk_add_f16 v0, s4, s8 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd <8 x half> %a, %b @@ -2885,6 +2927,7 @@ ; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load half, ptr addrspace(1) %in @@ -2915,6 +2958,7 @@ ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -148,6 +148,7 @@ ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB0_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -307,6 +308,7 @@ ; GFX11-NEXT: global_store_b32 v0, v1, s[10:11] ; GFX11-NEXT: s_cbranch_scc0 .LBB1_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -469,6 +471,7 @@ ; GFX11-NEXT: s_cmpk_eq_i32 s5, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -619,6 +622,7 @@ ; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -745,6 +749,7 @@ ; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] ; GFX11-NEXT: s_cbranch_vccz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -871,6 +876,7 @@ ; GFX11-NEXT: s_cbranch_vccz .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -1013,6 +1019,7 @@ ; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] ; GFX11-NEXT: s_cbranch_vccz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -1164,6 +1171,7 @@ ; GFX11-NEXT: s_cbranch_vccz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -63,6 +63,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -152,6 +153,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -241,6 +243,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -330,6 +333,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -419,6 +423,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -520,6 +525,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -618,6 +624,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -27,6 +27,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -74,6 +75,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -119,6 +121,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -164,6 +167,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -209,6 +213,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -254,6 +259,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -299,6 +305,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -344,6 +351,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -389,6 +397,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -434,6 +443,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -479,6 +489,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -524,6 +535,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -569,6 +581,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -614,6 +627,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -663,6 +677,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -717,6 +732,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -771,6 +787,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -825,6 +842,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -879,6 +897,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -933,6 +952,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -987,6 +1007,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1041,6 +1062,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1095,6 +1117,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1161,6 +1184,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1242,6 +1266,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1311,6 +1336,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1365,6 +1391,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1419,6 +1446,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1485,6 +1513,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1565,6 +1594,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1645,6 +1675,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] ; GFX11-NEXT: v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00] ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1713,6 +1744,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1767,6 +1799,7 @@ ; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] ; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -41,6 +41,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -108,6 +109,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -190,6 +192,7 @@ ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s1 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -257,6 +260,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -339,6 +343,7 @@ ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s0 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -437,6 +442,7 @@ ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s1 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -490,6 +496,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -556,6 +563,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr @@ -603,6 +611,7 @@ ; GFX11-NEXT: s_pack_ll_b32_b16 s2, 0x4500, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr @@ -648,6 +657,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %vec = load <2 x half>, ptr addrspace(4) %vec.ptr @@ -716,6 +726,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -790,6 +801,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, v1, s0, 0x7060302 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -862,6 +874,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -935,6 +948,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1005,6 +1019,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1077,6 +1092,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s2, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1147,6 +1163,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, 53, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1219,6 +1236,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1289,6 +1307,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1377,6 +1396,7 @@ ; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %idx = load volatile i32, ptr addrspace(4) %idx.ptr @@ -1457,6 +1477,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, s0, 0x3e703e7, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1558,6 +1579,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, v1, 0x12341234, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1636,6 +1658,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s0, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1714,6 +1737,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1792,6 +1816,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1870,6 +1895,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1948,6 +1974,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s0, v1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2057,6 +2084,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v0, v2, s0, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2161,6 +2189,7 @@ ; GFX11-NEXT: v_bfi_b32 v1, s1, s6, v1 ; GFX11-NEXT: v_bfi_b32 v0, s0, s6, v0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2241,6 +2270,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_perm_b32 v1, s0, v1, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2321,6 +2351,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s0, v3 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2547,6 +2578,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2647,6 +2679,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -2747,6 +2780,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -3147,6 +3181,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -56,6 +56,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -116,6 +117,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -226,6 +228,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -305,6 +308,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -382,6 +386,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -480,6 +485,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -581,6 +587,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -682,6 +689,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -784,6 +792,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.add.gs.reg.rtn.ll @@ -20,6 +20,7 @@ ; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:16 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b32 v[1:2], v3, off +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %arg, i32 16) @@ -42,6 +43,7 @@ ; CHECK-NEXT: ds_add_gs_reg_rtn v[3:4], v0 offset:32 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %arg, i32 32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bvh.stack.rtn.ll @@ -11,6 +11,7 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; CHECK-NEXT: global_store_b32 v[6:7], v0, off +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 0) @@ -28,6 +29,7 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; CHECK-NEXT: global_store_b32 v[6:7], v0, off +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.rtn(i32 %addr, i32 %data0, <4 x i32> %data1, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.sub.gs.reg.rtn.ll @@ -20,6 +20,7 @@ ; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:16 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b32 v[1:2], v3, off +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %arg, i32 16) @@ -42,6 +43,7 @@ ; CHECK-NEXT: ds_sub_gs_reg_rtn v[3:4], v0 offset:32 gds ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_store_b64 v[1:2], v[3:4], off +; CHECK-NEXT: s_nop 0 ; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; CHECK-NEXT: s_endpgm %res = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %arg, i32 32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -21,6 +21,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -43,6 +44,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -70,6 +72,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -92,6 +95,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -126,6 +130,7 @@ ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -152,6 +157,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -178,6 +184,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -208,6 +215,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -234,6 +242,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -264,6 +273,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -290,6 +300,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -320,6 +331,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -346,6 +358,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -376,6 +389,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -402,6 +416,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -432,6 +447,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -458,6 +474,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -489,6 +506,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -515,6 +533,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -545,6 +564,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -571,6 +591,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -601,6 +622,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -627,6 +649,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -657,6 +680,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -683,6 +707,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -713,6 +738,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -739,6 +765,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -769,6 +796,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -795,6 +823,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -825,6 +854,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -851,6 +881,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -881,6 +912,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -907,6 +939,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -937,6 +970,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -963,6 +997,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -993,6 +1028,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1019,6 +1055,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1049,6 +1086,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1075,6 +1113,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1105,6 +1144,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1131,6 +1171,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1161,6 +1202,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1187,6 +1229,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1217,6 +1260,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1243,6 +1287,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1273,6 +1318,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1299,6 +1345,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1329,6 +1376,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1355,6 +1403,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1385,6 +1434,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1411,6 +1461,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1441,6 +1492,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1467,6 +1519,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1499,6 +1552,7 @@ ; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1527,6 +1581,7 @@ ; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1561,6 +1616,7 @@ ; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1589,6 +1645,7 @@ ; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3| ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1626,6 +1683,7 @@ ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1653,6 +1711,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1679,6 +1738,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1710,6 +1770,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1736,6 +1797,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1767,6 +1829,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1793,6 +1856,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1824,6 +1888,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1850,6 +1915,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1881,6 +1947,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1907,6 +1974,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1938,6 +2006,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1964,6 +2033,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1995,6 +2065,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -2021,6 +2092,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -2052,6 +2124,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -2078,6 +2151,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -2109,6 +2183,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -2135,6 +2210,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -2166,6 +2242,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -2192,6 +2269,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -2223,6 +2301,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -2249,6 +2328,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -2280,6 +2360,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -2306,6 +2387,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -24,6 +24,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -81,6 +82,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -139,6 +141,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -184,6 +187,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -245,6 +249,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -306,6 +311,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -367,6 +373,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -428,6 +435,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -489,6 +497,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -551,6 +560,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -612,6 +622,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -673,6 +684,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -734,6 +746,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -795,6 +808,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -856,6 +870,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -917,6 +932,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -994,6 +1010,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1071,6 +1088,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1148,6 +1166,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1225,6 +1244,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1302,6 +1322,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1379,6 +1400,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1456,6 +1478,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1533,6 +1556,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1610,6 +1634,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1687,6 +1712,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1764,6 +1790,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1844,6 +1871,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1912,6 +1940,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1976,6 +2005,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2022,6 +2052,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2084,6 +2115,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2146,6 +2178,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2208,6 +2241,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2270,6 +2304,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2332,6 +2367,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2394,6 +2430,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2456,6 +2493,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2518,6 +2556,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2580,6 +2619,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2642,6 +2682,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2704,6 +2745,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -16,6 +16,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -16,6 +16,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -17,6 +17,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -46,6 +47,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -29,6 +29,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -55,6 +56,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -89,6 +91,7 @@ ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -115,6 +118,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -141,6 +145,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -171,6 +176,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -197,6 +203,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -227,6 +234,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -253,6 +261,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -283,6 +292,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -309,6 +319,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -339,6 +350,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -365,6 +377,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -395,6 +408,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -421,6 +435,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -451,6 +466,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -477,6 +493,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -507,6 +524,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -533,6 +551,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -563,6 +582,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -589,6 +609,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -618,6 +639,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -642,6 +664,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -670,6 +693,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -694,6 +718,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -722,6 +747,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -746,6 +772,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -774,6 +801,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -798,6 +826,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -826,6 +855,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -850,6 +880,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -878,6 +909,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -902,6 +934,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -930,6 +963,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -954,6 +988,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -982,6 +1017,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1006,6 +1042,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1034,6 +1071,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1058,6 +1096,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1086,6 +1125,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1110,6 +1150,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1139,6 +1180,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1165,6 +1207,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1199,6 +1242,7 @@ ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1225,6 +1269,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1251,6 +1296,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1281,6 +1327,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1307,6 +1354,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1337,6 +1385,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1363,6 +1412,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1393,6 +1443,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1419,6 +1470,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1449,6 +1501,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1475,6 +1528,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1505,6 +1559,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1531,6 +1586,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1561,6 +1617,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1587,6 +1644,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1617,6 +1675,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1643,6 +1702,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1673,6 +1733,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -1699,6 +1760,7 @@ ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1731,6 +1793,7 @@ ; GFX11-NEXT: s_and_b32 s2, s2, s3 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1767,6 +1830,7 @@ ; GISEL-GFX11-LABEL: test_intr_icmp_i32_invalid_cc: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -34,6 +34,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -101,6 +102,7 @@ ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -138,6 +140,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -199,6 +202,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -260,6 +264,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -321,6 +326,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -382,6 +388,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -443,6 +450,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -504,6 +512,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -565,6 +574,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -626,6 +636,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -686,6 +697,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -760,6 +772,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -834,6 +847,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -908,6 +922,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -982,6 +997,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1056,6 +1072,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1130,6 +1147,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1204,6 +1222,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1278,6 +1297,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1352,6 +1372,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1427,6 +1448,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1494,6 +1516,7 @@ ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; @@ -1531,6 +1554,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1592,6 +1616,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1653,6 +1678,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1714,6 +1740,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1775,6 +1802,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1836,6 +1864,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1897,6 +1926,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1958,6 +1988,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2019,6 +2050,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2082,6 +2114,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2139,6 +2172,7 @@ ; GISEL-GFX11-LABEL: test_intr_icmp_i32_invalid_cc: ; GISEL-GFX11: ; %bb.0: ; GISEL-GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll @@ -296,6 +296,7 @@ ; GFX11-LABEL: store_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -318,6 +319,7 @@ ; GFX11-LABEL: store_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -341,6 +343,7 @@ ; GFX11-LABEL: store_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -365,6 +368,7 @@ ; GFX11-LABEL: store_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -389,6 +393,7 @@ ; GFX11-LABEL: store_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -412,6 +417,7 @@ ; GFX11-LABEL: store_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -436,6 +442,7 @@ ; GFX11-LABEL: store_2dmsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -460,6 +467,7 @@ ; GFX11-LABEL: store_2darraymsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -485,6 +493,7 @@ ; GFX11-LABEL: store_mip_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -508,6 +517,7 @@ ; GFX11-LABEL: store_mip_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -532,6 +542,7 @@ ; GFX11-LABEL: store_mip_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -557,6 +568,7 @@ ; GFX11-LABEL: store_mip_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -582,6 +594,7 @@ ; GFX11-LABEL: store_mip_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -606,6 +619,7 @@ ; GFX11-LABEL: store_mip_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -811,6 +825,7 @@ ; GFX11-LABEL: store_1d_V1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -833,6 +848,7 @@ ; GFX11-LABEL: store_1d_V2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -909,6 +925,7 @@ ; GFX11-LABEL: store_1d_glc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -931,6 +948,7 @@ ; GFX11-LABEL: store_1d_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -953,6 +971,7 @@ ; GFX11-LABEL: store_1d_glc_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.encode.ll @@ -380,6 +380,7 @@ ; GFX11-LABEL: store_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -402,6 +403,7 @@ ; GFX11-LABEL: store_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -425,6 +427,7 @@ ; GFX11-LABEL: store_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -449,6 +452,7 @@ ; GFX11-LABEL: store_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -473,6 +477,7 @@ ; GFX11-LABEL: store_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -496,6 +501,7 @@ ; GFX11-LABEL: store_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -520,6 +526,7 @@ ; GFX11-LABEL: store_2dmsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x98,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -544,6 +551,7 @@ ; GFX11-LABEL: store_2darraymsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x9c,0x0f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -569,6 +577,7 @@ ; GFX11-LABEL: store_mip_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -592,6 +601,7 @@ ; GFX11-LABEL: store_mip_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 ; encoding: [0x84,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -616,6 +626,7 @@ ; GFX11-LABEL: store_mip_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 ; encoding: [0x88,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -641,6 +652,7 @@ ; GFX11-LABEL: store_mip_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm a16 ; encoding: [0x8c,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -666,6 +678,7 @@ ; GFX11-LABEL: store_mip_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm a16 ; encoding: [0x90,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -690,6 +703,7 @@ ; GFX11-LABEL: store_mip_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm a16 ; encoding: [0x94,0x0f,0x1d,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -955,6 +969,7 @@ ; GFX11-LABEL: store_1d_V1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x02,0x19,0xf0,0x01,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -977,6 +992,7 @@ ; GFX11-LABEL: store_1d_V2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm a16 ; encoding: [0x80,0x0c,0x19,0xf0,0x02,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -1071,6 +1087,7 @@ ; GFX11-LABEL: store_1d_glc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc a16 ; encoding: [0x80,0x4f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -1093,6 +1110,7 @@ ; GFX11-LABEL: store_1d_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc a16 ; encoding: [0x80,0x1f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: @@ -1115,6 +1133,7 @@ ; GFX11-LABEL: store_1d_glc_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc a16 ; encoding: [0x80,0x5f,0x19,0xf0,0x04,0x00,0x00,0x00] +; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf] ; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -2363,6 +2363,7 @@ ; GFX11-LABEL: store_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2399,6 +2400,7 @@ ; GFX11-LABEL: store_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2435,6 +2437,7 @@ ; GFX11-LABEL: store_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2471,6 +2474,7 @@ ; GFX11-LABEL: store_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2507,6 +2511,7 @@ ; GFX11-LABEL: store_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2543,6 +2548,7 @@ ; GFX11-LABEL: store_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2579,6 +2585,7 @@ ; GFX11-LABEL: store_2dmsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2615,6 +2622,7 @@ ; GFX11-LABEL: store_2darraymsaa: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2651,6 +2659,7 @@ ; GFX11-LABEL: store_mip_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2687,6 +2696,7 @@ ; GFX11-LABEL: store_mip_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2723,6 +2733,7 @@ ; GFX11-LABEL: store_mip_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2759,6 +2770,7 @@ ; GFX11-LABEL: store_mip_cube: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2795,6 +2807,7 @@ ; GFX11-LABEL: store_mip_1darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -2831,6 +2844,7 @@ ; GFX11-LABEL: store_mip_2darray: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -3217,6 +3231,7 @@ ; GFX11-LABEL: store_1d_V1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -3253,6 +3268,7 @@ ; GFX11-LABEL: store_1d_V2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -3394,6 +3410,7 @@ ; GFX11-LABEL: store_1d_glc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -3430,6 +3447,7 @@ ; GFX11-LABEL: store_1d_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -3466,6 +3484,7 @@ ; GFX11-LABEL: store_1d_glc_slc: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -3652,6 +3671,7 @@ ; GFX11-NEXT: image_load v[0:3], v4, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: image_store v[0:3], v4, s[16:23] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll @@ -17,6 +17,7 @@ ; GFX11-LABEL: store_f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -40,6 +41,7 @@ ; GFX11-LABEL: store_v2f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -63,6 +65,7 @@ ; GFX11-LABEL: store_v3f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -86,6 +89,7 @@ ; GFX11-LABEL: store_v4f16_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -109,6 +113,7 @@ ; GFX11-LABEL: store_f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -133,6 +138,7 @@ ; GFX11-LABEL: store_v2f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -157,6 +163,7 @@ ; GFX11-LABEL: store_v3f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -181,6 +188,7 @@ ; GFX11-LABEL: store_v4f16_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:2], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -205,6 +213,7 @@ ; GFX11-LABEL: store_f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -230,6 +239,7 @@ ; GFX11-LABEL: store_v2f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -255,6 +265,7 @@ ; GFX11-LABEL: store_v3f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -280,6 +291,7 @@ ; GFX11-LABEL: store_v4f16_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 d16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll @@ -17,6 +17,7 @@ ; GFX11-LABEL: store_f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -39,6 +40,7 @@ ; GFX11-LABEL: store_v2f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -61,6 +63,7 @@ ; GFX11-LABEL: store_v3f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -83,6 +86,7 @@ ; GFX11-LABEL: store_v4f32_1d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -105,6 +109,7 @@ ; GFX11-LABEL: store_f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -128,6 +133,7 @@ ; GFX11-LABEL: store_v2f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -151,6 +157,7 @@ ; GFX11-LABEL: store_v3f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -174,6 +181,7 @@ ; GFX11-LABEL: store_v4f32_2d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -197,6 +205,7 @@ ; GFX11-LABEL: store_f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -221,6 +230,7 @@ ; GFX11-LABEL: store_v2f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_3D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -245,6 +255,7 @@ ; GFX11-LABEL: store_v3f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -269,6 +280,7 @@ ; GFX11-LABEL: store_v4f32_3d: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: image_store v[2:5], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -11,6 +11,7 @@ ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -28,6 +29,7 @@ ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -43,6 +45,7 @@ ; GFX11-NEXT: s_movk_i32 s0, 0x1000 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -60,6 +63,7 @@ ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -74,6 +78,7 @@ ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -94,6 +99,7 @@ ; GFX11-NEXT: .LBB5_2: ; %endif ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -128,6 +134,7 @@ ; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -142,6 +149,7 @@ ; SDAG-NEXT: ; %bb.2: ; %endif ; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; SDAG-NEXT: global_store_b32 v[0:1], v2, off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -12,6 +12,7 @@ ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -22,6 +23,7 @@ ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: @@ -40,6 +42,7 @@ ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -50,6 +53,7 @@ ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: @@ -69,6 +73,7 @@ ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -80,6 +85,7 @@ ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: @@ -99,6 +105,7 @@ ; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -110,6 +117,7 @@ ; SDAG-NEXT: v_mov_b32_e32 v1, s2 ; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: @@ -125,6 +133,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -135,6 +144,7 @@ ; SDAG-NEXT: s_waitcnt_depctr 0xfffe ; SDAG-NEXT: v_mov_b32_e32 v3, s0 ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: @@ -157,6 +167,7 @@ ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; GISEL-NEXT: v_mov_b32_e32 v3, 0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -172,6 +183,7 @@ ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] ; SDAG-NEXT: v_mov_b32_e32 v3, s2 ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: @@ -208,6 +220,7 @@ ; GISEL-NEXT: v_mov_b32_e32 v3, s1 ; GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm ; @@ -225,6 +238,7 @@ ; SDAG-NEXT: ; %bb.2: ; %endif ; SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -83,6 +83,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f16_e32 v0, v1, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -156,6 +157,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f16_e32 v0, 2.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -219,6 +221,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_ldexp_f16_e64 v0, v0, 2 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -32,6 +32,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) @@ -62,6 +63,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) @@ -95,6 +97,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) @@ -134,6 +137,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -152,6 +156,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -194,6 +199,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -206,6 +212,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -248,6 +255,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -262,6 +270,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -293,6 +302,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) @@ -323,6 +333,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) @@ -353,6 +364,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) @@ -383,6 +395,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) @@ -413,6 +426,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) @@ -446,6 +460,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) @@ -485,6 +500,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -503,6 +519,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -545,6 +562,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -557,6 +575,7 @@ ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -599,6 +618,7 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -613,6 +633,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() @@ -644,6 +665,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) @@ -674,6 +696,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) @@ -704,6 +727,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s7, s0 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) @@ -732,6 +756,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -761,6 +786,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -805,6 +831,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -819,6 +846,7 @@ ; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -848,6 +876,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -878,6 +907,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -908,6 +938,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -938,6 +969,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -967,6 +999,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1011,6 +1044,7 @@ ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -1025,6 +1059,7 @@ ; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1054,6 +1089,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1084,6 +1120,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() @@ -1114,6 +1151,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1] ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -16,6 +16,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane64(i32 %src0) @@ -32,6 +33,7 @@ ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane64(i32 99) @@ -47,6 +49,7 @@ ; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -57,6 +60,7 @@ ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -17,6 +17,7 @@ ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 glc ; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -35,6 +36,7 @@ ; GFX11-LABEL: buffer_store_immoffs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -51,6 +53,7 @@ ; GFX11-LABEL: buffer_store_ofs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -83,6 +86,7 @@ ; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -101,6 +105,7 @@ ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -117,6 +122,7 @@ ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -136,6 +142,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 @@ -167,6 +174,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 6 @@ -199,6 +207,7 @@ ; GFX11-NEXT: buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4 ; GFX11-NEXT: buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc ; GFX11-NEXT: buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 @@ -225,6 +234,7 @@ ; GFX11-LABEL: buffer_store_x2_offen_merged_and: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a1 = add i32 %a, 4 @@ -245,6 +255,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = shl i32 %inp, 4 @@ -267,6 +278,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) @@ -287,6 +299,7 @@ ; GFX11-LABEL: buffer_store_x2_offset_merged: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) @@ -308,6 +321,7 @@ ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 glc ; GFX11-NEXT: buffer_store_b32 v6, off, s[0:3], 0 slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -328,6 +342,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -348,6 +363,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -366,6 +382,7 @@ ; GFX11-LABEL: raw_buffer_store_f16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -393,6 +410,7 @@ ; GFX11-LABEL: buffer_store_v2f16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -422,6 +440,7 @@ ; GFX11-LABEL: buffer_store_v4f16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -438,6 +457,7 @@ ; GFX11-LABEL: raw_buffer_store_i16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -463,6 +483,7 @@ ; GFX11-LABEL: buffer_store_v2i16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -490,6 +511,7 @@ ; GFX11-LABEL: buffer_store_v4i16: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -509,6 +531,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) @@ -540,6 +563,7 @@ ; GFX11-NEXT: buffer_store_b32 v3, off, s[0:3], 0 offset:16 ; GFX11-NEXT: buffer_store_b32 v4, off, s[0:3], 0 offset:28 ; GFX11-NEXT: buffer_store_b32 v5, off, s[0:3], 0 offset:32 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -42,6 +42,7 @@ ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -89,6 +90,7 @@ ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -144,6 +146,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -199,6 +202,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.ll @@ -28,6 +28,7 @@ ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -55,6 +56,7 @@ ; GFX11-LABEL: tbuffer_store_immoffs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -77,6 +79,7 @@ ; GFX11-LABEL: tbuffer_store_scalar_and_imm_offs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -99,6 +102,7 @@ ; GFX11-LABEL: buffer_store_ofs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -121,6 +125,7 @@ ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, off, s[0:3], 0 format:125 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -143,6 +148,7 @@ ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -165,6 +171,7 @@ ; GFX11-LABEL: buffer_store_voffset_large_12bit: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -189,6 +196,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0x1000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -213,6 +221,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0xf000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -237,6 +246,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0x7ff000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -261,6 +271,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0xfff000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -42,6 +42,7 @@ ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -89,6 +90,7 @@ ; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -144,6 +146,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -199,6 +202,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.ll @@ -28,6 +28,7 @@ ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:78 glc dlc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -55,6 +56,7 @@ ; GFX11-LABEL: tbuffer_store_immoffs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:117 offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -77,6 +79,7 @@ ; GFX11-LABEL: tbuffer_store_scalar_and_imm_offs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], s4 format:117 offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -99,6 +102,7 @@ ; GFX11-LABEL: buffer_store_ofs: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:115 offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -121,6 +125,7 @@ ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, off, s[0:3], 0 format:125 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -143,6 +148,7 @@ ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -165,6 +171,7 @@ ; GFX11-LABEL: buffer_store_voffset_large_12bit: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -189,6 +196,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0x1000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -213,6 +221,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0xf000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -237,6 +246,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0x7ff000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -261,6 +271,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0xfff000 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll @@ -591,6 +591,7 @@ ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: exp mrt0 v0, v0, v0, v0 done +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -10,6 +10,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -20,6 +21,7 @@ ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 128) @@ -35,6 +37,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -45,6 +48,7 @@ ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 129) @@ -61,6 +65,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 130) @@ -77,6 +82,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 131) @@ -92,6 +98,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -102,6 +109,7 @@ ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 132) @@ -118,6 +126,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 133) @@ -133,6 +142,7 @@ ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; @@ -143,6 +153,7 @@ ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm %ret = call i32 @llvm.amdgcn.s.sendmsg.rtn.i32(i32 0) @@ -159,6 +170,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ret = call i64 @llvm.amdgcn.s.sendmsg.rtn.i64(i32 99999) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll @@ -19,6 +19,7 @@ ; GFX11-NEXT: buffer_store_b128 v[0:3], v12, s[0:3], 0 idxen ; GFX11-NEXT: buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc ; GFX11-NEXT: buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -39,6 +40,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -55,6 +57,7 @@ ; GFX11-LABEL: buffer_store_idx: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -77,6 +80,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -93,6 +97,7 @@ ; GFX11-LABEL: buffer_store_both: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -111,6 +116,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v6, v4 ; GFX11-NEXT: buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -143,6 +149,7 @@ ; GFX11-NEXT: buffer_load_b128 v[0:3], v5, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -161,6 +168,7 @@ ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -177,6 +185,7 @@ ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -200,6 +209,7 @@ ; GFX11-NEXT: buffer_store_b128 v[0:3], v7, s[0:3], 0 idxen ; GFX11-NEXT: buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc ; GFX11-NEXT: buffer_store_b32 v6, v7, s[0:3], 0 idxen slc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -220,6 +230,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b8 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -240,6 +251,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %v2 = fptrunc float %v1 to half @@ -265,6 +277,7 @@ ; GFX11-LABEL: struct_buffer_store_v2f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -293,6 +306,7 @@ ; GFX11-LABEL: struct_buffer_store_v4f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -310,6 +324,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -336,6 +351,7 @@ ; GFX11-LABEL: struct_buffer_store_vif16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b32 v0, v1, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) @@ -362,6 +378,7 @@ ; GFX11-LABEL: struct_buffer_store_v4i16: ; GFX11: ; %bb.0: ; GFX11-NEXT: buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -46,6 +46,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -97,6 +98,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -160,6 +162,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -223,6 +226,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.ll @@ -40,6 +40,7 @@ ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], v12, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen glc dlc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -76,6 +77,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:117 idxen offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -107,6 +109,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -134,6 +137,7 @@ ; GFX11-LABEL: buffer_store_idx: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -173,6 +177,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:115 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -200,6 +205,7 @@ ; GFX11-LABEL: buffer_store_both: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -240,6 +246,7 @@ ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -270,6 +277,7 @@ ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -297,6 +305,7 @@ ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -328,6 +337,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -366,6 +376,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x1000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -404,6 +415,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xf000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -442,6 +454,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x7ff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -480,6 +493,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xfff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -46,6 +46,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -97,6 +98,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -160,6 +162,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: @@ -223,6 +226,7 @@ ; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-PACKED-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_nop 0 ; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-PACKED-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll @@ -40,6 +40,7 @@ ; GFX11-NEXT: tbuffer_store_format_xyzw v[4:7], v12, s[0:3], 0 format:[BUF_FMT_32_32_32_32_UINT] idxen glc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen slc ; GFX11-NEXT: tbuffer_store_format_xyzw v[8:11], v12, s[0:3], 0 format:78 idxen glc dlc +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -76,6 +77,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:117 idxen offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -107,6 +109,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], s4 format:117 idxen offset:42 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -134,6 +137,7 @@ ; GFX11-LABEL: buffer_store_idx: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_8_8_8_8_SINT] idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -173,6 +177,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:115 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -200,6 +205,7 @@ ; GFX11-LABEL: buffer_store_both: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:70 idxen offen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -240,6 +246,7 @@ ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_FMT_8_8_8_8_UINT] idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -270,6 +277,7 @@ ; GFX11-LABEL: buffer_store_x1: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_x v0, v1, s[0:3], 0 format:125 idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -297,6 +305,7 @@ ; GFX11-LABEL: buffer_store_x2: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -328,6 +337,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -366,6 +376,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x1000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -404,6 +415,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xf000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -442,6 +454,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x7ff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: @@ -480,6 +493,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0xfff000 :: v_dual_mov_b32 v4, s4 ; GFX11-NEXT: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_FMT_32_32_32_32_FLOAT] idxen offen offset:4092 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll @@ -17,6 +17,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -34,6 +35,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -51,6 +53,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -66,6 +69,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -83,6 +87,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -98,6 +103,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[24:25], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -115,6 +121,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -130,6 +137,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -145,6 +153,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -160,6 +169,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -175,6 +185,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -190,6 +201,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -205,6 +217,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -220,6 +233,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[16:17], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -237,6 +251,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -252,6 +267,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -267,6 +283,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -282,6 +299,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -298,6 +316,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -313,6 +332,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -328,6 +348,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -343,6 +364,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[12:13], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll @@ -15,6 +15,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -30,6 +31,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -45,6 +47,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -58,6 +61,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -73,6 +77,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -86,6 +91,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -101,6 +107,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -115,6 +122,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -128,6 +136,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -141,6 +150,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -154,6 +164,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -167,6 +178,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -180,6 +192,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -193,6 +206,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -208,6 +222,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -221,6 +236,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -234,6 +250,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -247,6 +264,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -260,6 +278,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -273,6 +292,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -286,6 +306,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -299,6 +320,7 @@ ; W64: ; %bb.0: ; %bb ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -61,6 +61,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_ceil_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -146,6 +147,7 @@ ; GFX11-NEXT: v_ceil_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -79,6 +79,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cos_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a @@ -188,6 +189,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -61,6 +61,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_floor_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -147,6 +148,7 @@ ; GFX11-NEXT: v_floor_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -86,6 +86,7 @@ ; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11CHECK-NEXT: s_nop 0 ; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11CHECK-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll @@ -81,6 +81,7 @@ ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11CHECK-NEXT: s_nop 0 ; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11CHECK-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan @@ -165,6 +166,7 @@ ; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 ; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11CHECK-NEXT: s_nop 0 ; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11CHECK-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -210,6 +210,7 @@ ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -237,6 +238,7 @@ ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -604,6 +606,7 @@ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -638,6 +641,7 @@ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1156,6 +1160,7 @@ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6 ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1211,6 +1216,7 @@ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1857,6 +1863,7 @@ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1918,6 +1925,7 @@ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -210,6 +210,7 @@ ; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -237,6 +238,7 @@ ; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -604,6 +606,7 @@ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 ; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -638,6 +641,7 @@ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1156,6 +1160,7 @@ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6 ; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1211,6 +1216,7 @@ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1857,6 +1863,7 @@ ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15 ; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1918,6 +1925,7 @@ ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -141,6 +141,7 @@ ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0 ; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -160,6 +161,7 @@ ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0 ; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -382,6 +384,7 @@ ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2 ; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -405,6 +408,7 @@ ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -696,6 +700,7 @@ ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1 ; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3 ; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -728,6 +733,7 @@ ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5 ; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; @@ -1083,6 +1089,7 @@ ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5 ; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; @@ -1120,6 +1127,7 @@ ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -140,6 +140,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -250,6 +251,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -358,6 +360,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -474,6 +477,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -567,6 +571,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -658,6 +663,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -799,6 +805,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -953,6 +960,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_pk_max_f16 v0, v3, v2 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1075,6 +1083,7 @@ ; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0 ; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -139,6 +139,7 @@ ; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 ; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -276,6 +277,7 @@ ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -383,6 +385,7 @@ ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -498,6 +501,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_min_f16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -625,6 +629,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -715,6 +720,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -855,6 +861,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1008,6 +1015,7 @@ ; GFX11-NEXT: v_pk_min_f16 v1, v1, v0 ; GFX11-NEXT: v_pk_min_f16 v0, v3, v2 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1129,6 +1137,7 @@ ; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0 ; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -372,6 +372,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: @@ -556,6 +557,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -62,6 +62,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_rndne_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -168,6 +169,7 @@ ; GFX11-NEXT: v_rndne_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -77,6 +77,7 @@ ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc_lo ; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -177,6 +178,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_add_f32 v1, v0, v1 :: v_dual_add_f32 v0, v2, v3 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -358,6 +360,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v6, v8 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -598,6 +601,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -752,6 +756,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -905,6 +910,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -79,6 +79,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sin_f16_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a @@ -188,6 +189,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -61,6 +61,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -147,6 +148,7 @@ ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -61,6 +61,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_trunc_f16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -147,6 +148,7 @@ ; GFX11-NEXT: v_trunc_f16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -68,6 +68,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = lshr <2 x i16> %lhs, %rhs @@ -148,6 +149,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -242,6 +244,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -334,6 +337,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -418,6 +422,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -498,6 +503,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -596,6 +602,7 @@ ; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -686,6 +693,7 @@ ; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -78,6 +78,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0 ; GFX11-NEXT: global_store_b16 v3, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -681,6 +681,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext0 = zext i32 %arg0 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -134,6 +135,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -255,6 +257,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -266,6 +269,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -399,6 +403,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -412,6 +417,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -551,6 +557,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -565,6 +572,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -678,6 +686,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -689,6 +698,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -801,6 +811,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -812,6 +823,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -939,6 +951,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -952,6 +965,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1079,6 +1093,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1092,6 +1107,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1204,6 +1220,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1215,6 +1232,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1488,6 +1506,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1501,6 +1520,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1971,6 +1991,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1986,6 +2007,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2146,6 +2168,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2163,6 +2186,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2323,6 +2347,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2340,6 +2365,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2452,6 +2478,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2462,6 +2489,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2734,6 +2762,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2746,6 +2775,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4790,6 +4820,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4802,6 +4833,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4950,6 +4982,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4964,6 +4997,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5115,6 +5149,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5129,6 +5164,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5292,6 +5328,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5308,6 +5345,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5471,6 +5509,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5487,6 +5526,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5635,6 +5675,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5649,6 +5690,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5797,6 +5839,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5811,6 +5854,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5974,6 +6018,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5990,6 +6035,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6153,6 +6199,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6169,6 +6216,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6332,6 +6380,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6348,6 +6397,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6511,6 +6561,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6527,6 +6578,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6690,6 +6742,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6706,6 +6759,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6869,6 +6923,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6885,6 +6940,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -7048,6 +7104,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7064,6 +7121,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -7227,6 +7285,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7243,6 +7302,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -7366,6 +7426,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7377,6 +7438,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7498,6 +7560,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7509,6 +7572,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7642,6 +7706,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7655,6 +7720,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7794,6 +7860,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7808,6 +7875,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7921,6 +7989,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7932,6 +8001,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -8044,6 +8114,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8055,6 +8126,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -8182,6 +8254,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8195,6 +8268,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -8322,6 +8396,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8335,6 +8410,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -8447,6 +8523,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8458,6 +8535,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8731,6 +8809,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8744,6 +8823,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -9214,6 +9294,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9229,6 +9310,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -9389,6 +9471,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9406,6 +9489,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -9566,6 +9650,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9583,6 +9668,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -9695,6 +9781,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9705,6 +9792,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9977,6 +10065,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9989,6 +10078,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12033,6 +12123,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12045,6 +12136,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12193,6 +12285,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12207,6 +12300,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12370,6 +12464,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12386,6 +12481,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12549,6 +12645,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12565,6 +12662,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12713,6 +12811,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12727,6 +12826,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12875,6 +12975,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12889,6 +12990,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13052,6 +13154,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13068,6 +13171,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13231,6 +13335,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13247,6 +13352,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13410,6 +13516,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13426,6 +13533,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13589,6 +13697,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13605,6 +13714,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13768,6 +13878,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13784,6 +13895,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13947,6 +14059,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13963,6 +14076,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14126,6 +14240,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -14142,6 +14257,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -14305,6 +14421,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -14321,6 +14438,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -126,6 +126,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -137,6 +138,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -270,6 +272,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -281,6 +284,7 @@ ; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -407,6 +411,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -418,6 +423,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -546,6 +552,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -557,6 +564,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -134,6 +135,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -255,6 +257,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -266,6 +269,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -387,6 +391,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -398,6 +403,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -519,6 +525,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -530,6 +537,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -643,6 +651,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -654,6 +663,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -766,6 +776,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -777,6 +788,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -889,6 +901,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -900,6 +913,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1012,6 +1026,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1023,6 +1038,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1135,6 +1151,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1146,6 +1163,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1258,6 +1276,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1269,6 +1288,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1381,6 +1401,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1392,6 +1413,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1504,6 +1526,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1515,6 +1538,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1627,6 +1651,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1638,6 +1663,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1770,6 +1796,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1783,6 +1810,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1916,6 +1944,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1929,6 +1958,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2062,6 +2092,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2075,6 +2106,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2187,6 +2219,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2197,6 +2230,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2309,6 +2343,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2319,6 +2354,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2431,6 +2467,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2441,6 +2478,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2553,6 +2591,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2563,6 +2602,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2675,6 +2715,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2685,6 +2726,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2797,6 +2839,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2807,6 +2850,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2919,6 +2963,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2929,6 +2974,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3041,6 +3087,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3051,6 +3098,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3163,6 +3211,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3173,6 +3222,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3285,6 +3335,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3295,6 +3346,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3407,6 +3459,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3417,6 +3470,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3529,6 +3583,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3539,6 +3594,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3651,6 +3707,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3661,6 +3718,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3773,6 +3831,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3783,6 +3842,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3895,6 +3955,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3905,6 +3966,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4039,6 +4101,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4051,6 +4114,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4187,6 +4251,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4199,6 +4264,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4335,6 +4401,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4347,6 +4414,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4483,6 +4551,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4495,6 +4564,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4631,6 +4701,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4643,6 +4714,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4779,6 +4851,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4791,6 +4864,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4927,6 +5001,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4939,6 +5014,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5075,6 +5151,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5087,6 +5164,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5223,6 +5301,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5235,6 +5314,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5371,6 +5451,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5383,6 +5464,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5519,6 +5601,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5531,6 +5614,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5667,6 +5751,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5679,6 +5764,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5815,6 +5901,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5827,6 +5914,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5963,6 +6051,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5975,6 +6064,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6111,6 +6201,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6123,6 +6214,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6246,6 +6338,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6257,6 +6350,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6378,6 +6472,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6389,6 +6484,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6510,6 +6606,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6521,6 +6618,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6642,6 +6740,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6653,6 +6752,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6766,6 +6866,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6777,6 +6878,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -6889,6 +6991,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6900,6 +7003,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7012,6 +7116,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7023,6 +7128,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7135,6 +7241,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7146,6 +7253,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7258,6 +7366,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7269,6 +7378,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7381,6 +7491,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7392,6 +7503,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7504,6 +7616,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7515,6 +7628,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7627,6 +7741,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7638,6 +7753,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7750,6 +7866,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7761,6 +7878,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7893,6 +8011,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7906,6 +8025,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8039,6 +8159,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8052,6 +8173,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8185,6 +8307,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8198,6 +8321,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8310,6 +8434,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8320,6 +8445,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8432,6 +8558,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8442,6 +8569,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8554,6 +8682,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8564,6 +8693,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8676,6 +8806,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8686,6 +8817,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8798,6 +8930,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8808,6 +8941,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8920,6 +9054,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8930,6 +9065,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9042,6 +9178,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9052,6 +9189,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9164,6 +9302,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9174,6 +9313,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9286,6 +9426,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9296,6 +9437,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9408,6 +9550,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9418,6 +9561,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9530,6 +9674,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9540,6 +9685,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9652,6 +9798,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9662,6 +9809,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9774,6 +9922,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9784,6 +9933,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9896,6 +10046,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9906,6 +10057,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10018,6 +10170,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10028,6 +10181,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10162,6 +10316,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10174,6 +10329,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10310,6 +10466,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10322,6 +10479,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10458,6 +10616,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10470,6 +10629,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10606,6 +10766,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10618,6 +10779,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10754,6 +10916,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10766,6 +10929,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10902,6 +11066,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10914,6 +11079,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11050,6 +11216,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11062,6 +11229,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11198,6 +11366,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11210,6 +11379,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11346,6 +11516,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11358,6 +11529,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11494,6 +11666,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11506,6 +11679,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11642,6 +11816,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11654,6 +11829,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11790,6 +11966,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11802,6 +11979,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11938,6 +12116,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11950,6 +12129,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12086,6 +12266,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12098,6 +12279,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12234,6 +12416,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12246,6 +12429,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -134,6 +135,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -255,6 +257,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -266,6 +269,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -401,6 +405,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -414,6 +419,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -555,6 +561,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -569,6 +576,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -682,6 +690,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -693,6 +702,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -805,6 +815,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -816,6 +827,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -945,6 +957,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -958,6 +971,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1087,6 +1101,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1100,6 +1115,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1212,6 +1228,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1223,6 +1240,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1500,6 +1518,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1513,6 +1532,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1993,6 +2013,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2008,6 +2029,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2172,6 +2194,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2189,6 +2212,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2353,6 +2377,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2370,6 +2395,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2482,6 +2508,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2492,6 +2519,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2768,6 +2796,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2780,6 +2809,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4204,6 +4234,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4216,6 +4247,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4366,6 +4398,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4380,6 +4413,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4547,6 +4581,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4563,6 +4598,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4730,6 +4766,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4746,6 +4783,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4896,6 +4934,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4910,6 +4949,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5060,6 +5100,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5074,6 +5115,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5241,6 +5283,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5257,6 +5300,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5424,6 +5468,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5440,6 +5485,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5607,6 +5653,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5623,6 +5670,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5790,6 +5838,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5806,6 +5855,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5973,6 +6023,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5989,6 +6040,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6156,6 +6208,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6172,6 +6225,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6339,6 +6393,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6355,6 +6410,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6522,6 +6578,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6538,6 +6595,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6661,6 +6719,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6672,6 +6731,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6793,6 +6853,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6804,6 +6865,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6939,6 +7001,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6952,6 +7015,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7093,6 +7157,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7107,6 +7172,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7220,6 +7286,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7231,6 +7298,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7343,6 +7411,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7354,6 +7423,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7483,6 +7553,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7496,6 +7567,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7625,6 +7697,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7638,6 +7711,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7750,6 +7824,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7761,6 +7836,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8038,6 +8114,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8051,6 +8128,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8531,6 +8609,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8546,6 +8625,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8710,6 +8790,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8727,6 +8808,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8891,6 +8973,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8908,6 +8991,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -9020,6 +9104,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9030,6 +9115,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9306,6 +9392,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9318,6 +9405,7 @@ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11406,6 +11494,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11418,6 +11507,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11568,6 +11658,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11582,6 +11673,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11735,6 +11827,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11749,6 +11842,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11916,6 +12010,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11932,6 +12027,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12099,6 +12195,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12115,6 +12212,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12265,6 +12363,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12279,6 +12378,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12429,6 +12529,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12443,6 +12544,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12610,6 +12712,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12626,6 +12729,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12793,6 +12897,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12809,6 +12914,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12976,6 +13082,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12992,6 +13099,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13159,6 +13267,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13175,6 +13284,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13342,6 +13452,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13358,6 +13469,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13525,6 +13637,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13541,6 +13654,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13708,6 +13822,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13724,6 +13839,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13891,6 +14007,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13907,6 +14024,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -79,6 +79,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -90,6 +91,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -179,6 +181,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -190,6 +193,7 @@ ; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -278,6 +282,7 @@ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -290,6 +295,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -380,6 +386,7 @@ ; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -392,6 +399,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -477,6 +485,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -488,6 +497,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -569,6 +579,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -581,6 +592,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -134,6 +135,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -255,6 +257,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -266,6 +269,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -387,6 +391,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -398,6 +403,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -519,6 +525,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -530,6 +537,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -643,6 +651,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -654,6 +663,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -766,6 +776,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -777,6 +788,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -889,6 +901,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -900,6 +913,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1012,6 +1026,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1023,6 +1038,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1135,6 +1151,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1146,6 +1163,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1258,6 +1276,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1269,6 +1288,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1381,6 +1401,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1392,6 +1413,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1504,6 +1526,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1515,6 +1538,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1627,6 +1651,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1638,6 +1663,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1770,6 +1796,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1783,6 +1810,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1916,6 +1944,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1929,6 +1958,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2062,6 +2092,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2075,6 +2106,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2187,6 +2219,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2197,6 +2230,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2309,6 +2343,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2319,6 +2354,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2431,6 +2467,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2441,6 +2478,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2553,6 +2591,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2563,6 +2602,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2675,6 +2715,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2685,6 +2726,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2797,6 +2839,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2807,6 +2850,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2919,6 +2963,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2929,6 +2974,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3041,6 +3087,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3051,6 +3098,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3163,6 +3211,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3173,6 +3222,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3285,6 +3335,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3295,6 +3346,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3407,6 +3459,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3417,6 +3470,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3529,6 +3583,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3539,6 +3594,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3651,6 +3707,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3661,6 +3718,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3773,6 +3831,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3783,6 +3842,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3895,6 +3955,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -3905,6 +3966,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4039,6 +4101,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4051,6 +4114,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4187,6 +4251,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4199,6 +4264,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4335,6 +4401,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4347,6 +4414,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4483,6 +4551,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4495,6 +4564,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4631,6 +4701,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4643,6 +4714,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4779,6 +4851,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4791,6 +4864,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4927,6 +5001,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4939,6 +5014,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5075,6 +5151,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5087,6 +5164,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5223,6 +5301,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5235,6 +5314,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5371,6 +5451,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5383,6 +5464,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5519,6 +5601,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5531,6 +5614,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5667,6 +5751,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5679,6 +5764,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5815,6 +5901,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5827,6 +5914,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5963,6 +6051,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5975,6 +6064,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6111,6 +6201,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6123,6 +6214,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6246,6 +6338,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6257,6 +6350,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6378,6 +6472,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6389,6 +6484,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6510,6 +6606,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6521,6 +6618,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6642,6 +6740,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6653,6 +6752,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6766,6 +6866,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6777,6 +6878,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -6889,6 +6991,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6900,6 +7003,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7012,6 +7116,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7023,6 +7128,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7135,6 +7241,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7146,6 +7253,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7258,6 +7366,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7269,6 +7378,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7381,6 +7491,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7392,6 +7503,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7504,6 +7616,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7515,6 +7628,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7627,6 +7741,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7638,6 +7753,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7750,6 +7866,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7761,6 +7878,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7893,6 +8011,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7906,6 +8025,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8039,6 +8159,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8052,6 +8173,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8185,6 +8307,7 @@ ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8198,6 +8321,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8310,6 +8434,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8320,6 +8445,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8432,6 +8558,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8442,6 +8569,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8554,6 +8682,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8564,6 +8693,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8676,6 +8806,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8686,6 +8817,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8798,6 +8930,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8808,6 +8941,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -8920,6 +9054,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8930,6 +9065,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9042,6 +9178,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9052,6 +9189,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9164,6 +9302,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9174,6 +9313,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9286,6 +9426,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9296,6 +9437,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9408,6 +9550,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9418,6 +9561,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9530,6 +9674,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9540,6 +9685,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9652,6 +9798,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9662,6 +9809,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9774,6 +9922,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9784,6 +9933,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9896,6 +10046,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9906,6 +10057,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10018,6 +10170,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10028,6 +10181,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10162,6 +10316,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10174,6 +10329,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10310,6 +10466,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10322,6 +10479,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10458,6 +10616,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10470,6 +10629,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10606,6 +10766,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10618,6 +10779,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10754,6 +10916,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10766,6 +10929,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10902,6 +11066,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10914,6 +11079,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11050,6 +11216,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11062,6 +11229,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11198,6 +11366,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11210,6 +11379,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11346,6 +11516,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11358,6 +11529,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11494,6 +11666,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11506,6 +11679,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11642,6 +11816,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11654,6 +11829,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11790,6 +11966,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11802,6 +11979,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11938,6 +12116,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11950,6 +12129,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12086,6 +12266,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12098,6 +12279,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12234,6 +12416,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12246,6 +12429,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -134,6 +135,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -255,6 +257,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -266,6 +269,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -391,6 +395,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -402,6 +407,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -532,6 +538,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -543,6 +550,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -656,6 +664,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -667,6 +676,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -779,6 +789,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -790,6 +801,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -914,6 +926,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -926,6 +939,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1050,6 +1064,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1062,6 +1077,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -1174,6 +1190,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1185,6 +1202,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1315,6 +1333,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1439,6 +1458,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1451,6 +1471,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1594,6 +1615,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1737,6 +1759,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -1873,6 +1896,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -1886,6 +1910,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2035,6 +2060,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2049,6 +2075,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2198,6 +2225,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2212,6 +2240,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -2324,6 +2353,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2334,6 +2364,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2463,6 +2494,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2587,6 +2619,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -2598,6 +2631,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2740,6 +2774,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -2882,6 +2917,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3011,6 +3047,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3140,6 +3177,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3282,6 +3320,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3424,6 +3463,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3566,6 +3606,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3708,6 +3749,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3850,6 +3892,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -3992,6 +4035,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4134,6 +4178,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4276,6 +4321,7 @@ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4410,6 +4456,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4422,6 +4469,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4562,6 +4610,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4574,6 +4623,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4722,6 +4772,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4735,6 +4786,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -4887,6 +4939,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -4900,6 +4953,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5052,6 +5106,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5065,6 +5120,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5205,6 +5261,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5217,6 +5274,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5357,6 +5415,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5369,6 +5428,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5521,6 +5581,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5534,6 +5595,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5686,6 +5748,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5699,6 +5762,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -5851,6 +5915,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -5864,6 +5929,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6016,6 +6082,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6029,6 +6096,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6181,6 +6249,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6194,6 +6263,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6346,6 +6416,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6359,6 +6430,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6511,6 +6583,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6524,6 +6597,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6676,6 +6750,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6689,6 +6764,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -6812,6 +6888,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6823,6 +6900,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -6944,6 +7022,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -6955,6 +7034,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7080,6 +7160,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7091,6 +7172,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7218,6 +7300,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7229,6 +7312,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { @@ -7342,6 +7426,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7353,6 +7438,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7465,6 +7551,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7476,6 +7563,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7594,6 +7682,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7605,6 +7694,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7723,6 +7813,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7734,6 +7825,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { @@ -7846,6 +7938,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,6 +7950,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -7987,6 +8081,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8105,6 +8200,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8116,6 +8212,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8252,6 +8349,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8388,6 +8486,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8524,6 +8623,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8537,6 +8637,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8680,6 +8781,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8693,6 +8795,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8836,6 +8939,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8849,6 +8953,7 @@ ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { @@ -8961,6 +9066,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -8971,6 +9077,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9100,6 +9207,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9218,6 +9326,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -9228,6 +9337,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9363,6 +9473,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9498,6 +9609,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9627,6 +9739,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9756,6 +9869,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -9891,6 +10005,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10026,6 +10141,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10161,6 +10277,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10296,6 +10413,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10431,6 +10549,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10566,6 +10685,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10701,6 +10821,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10836,6 +10957,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -10970,6 +11092,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -10982,6 +11105,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11122,6 +11246,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11134,6 +11259,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11276,6 +11402,7 @@ ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11288,6 +11415,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11434,6 +11562,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11446,6 +11575,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11592,6 +11722,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11604,6 +11735,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11744,6 +11876,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11756,6 +11889,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -11896,6 +12030,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -11908,6 +12043,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12054,6 +12190,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12066,6 +12203,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12212,6 +12350,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12224,6 +12363,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12370,6 +12510,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12382,6 +12523,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12528,6 +12670,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12540,6 +12683,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12686,6 +12830,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12698,6 +12843,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -12844,6 +12990,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -12856,6 +13003,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13002,6 +13150,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13014,6 +13163,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { @@ -13160,6 +13310,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -13172,6 +13323,7 @@ ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -138,6 +138,7 @@ ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -151,6 +152,7 @@ ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { @@ -291,6 +293,7 @@ ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -305,6 +308,7 @@ ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -86,6 +86,7 @@ ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -99,6 +100,7 @@ ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { @@ -191,6 +193,7 @@ ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -205,6 +208,7 @@ ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -162,6 +162,7 @@ ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -175,6 +176,7 @@ ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { @@ -341,6 +343,7 @@ ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -355,6 +358,7 @@ ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -106,6 +106,7 @@ ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -119,6 +120,7 @@ ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { @@ -231,6 +233,7 @@ ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_nop 0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; @@ -245,6 +248,7 @@ ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, off glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_nop 0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll --- a/llvm/test/CodeGen/AMDGPU/minmax.ll +++ b/llvm/test/CodeGen/AMDGPU/minmax.ll @@ -22,6 +22,7 @@ ; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; SDAG-NEXT: s_mov_b32 s4, s3 ; SDAG-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; @@ -33,6 +34,7 @@ ; GISEL-NEXT: s_mov_b32 s7, s4 ; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) @@ -109,6 +111,7 @@ ; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; SDAG-NEXT: s_mov_b32 s4, s3 ; SDAG-NEXT: global_store_b32 v0, v1, s[4:5] +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; @@ -120,6 +123,7 @@ ; GISEL-NEXT: s_mov_b32 s7, s4 ; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -205,6 +209,7 @@ ; SDAG-NEXT: s_mov_b32 s4, s3 ; SDAG-NEXT: v_maxmin_f32 v0, s0, s1, v0 ; SDAG-NEXT: global_store_b32 v1, v0, s[4:5] +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; @@ -215,6 +220,7 @@ ; GISEL-NEXT: s_mov_b32 s7, s4 ; GISEL-NEXT: v_maxmin_f32 v0, s0, s1, v0 ; GISEL-NEXT: global_store_b32 v1, v0, s[6:7] +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm %smax = call float @llvm.maxnum.f32(float %a, float %b) @@ -297,6 +303,7 @@ ; SDAG-NEXT: s_mov_b32 s4, s3 ; SDAG-NEXT: v_maxmin_f16 v0, s0, s1, v0 ; SDAG-NEXT: global_store_b16 v1, v0, s[4:5] +; SDAG-NEXT: s_nop 0 ; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-NEXT: s_endpgm ; @@ -307,6 +314,7 @@ ; GISEL-NEXT: s_mov_b32 s7, s4 ; GISEL-NEXT: v_maxmin_f16 v0, s0, s1, v0 ; GISEL-NEXT: global_store_b16 v1, v0, s[6:7] +; GISEL-NEXT: s_nop 0 ; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-NEXT: s_endpgm %smax = call half @llvm.maxnum.f16(half %a, half %b) diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -102,6 +102,7 @@ ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -242,6 +243,7 @@ ; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -345,6 +347,7 @@ ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -476,6 +479,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -579,6 +583,7 @@ ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -697,6 +702,7 @@ ; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0 ; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -819,6 +825,7 @@ ; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9 ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -914,6 +921,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1022,6 +1030,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1156,6 +1165,7 @@ ; GFX11-NEXT: s_mov_b32 s0, s4 ; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1316,6 +1326,7 @@ ; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 ; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1524,6 +1535,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1754,6 +1766,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2058,6 +2071,7 @@ ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2306,6 +2320,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo ; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll --- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll @@ -1583,6 +1583,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 1 @@ -1620,6 +1621,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 2047 @@ -1657,6 +1659,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095 @@ -1694,6 +1697,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191 @@ -1731,6 +1735,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -2048 @@ -1771,6 +1776,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -1827,6 +1833,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -1851,6 +1858,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 @@ -1888,6 +1896,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 4095 @@ -1925,6 +1934,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8191 @@ -1962,6 +1972,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 16383 @@ -2002,6 +2013,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; @@ -2058,6 +2070,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2082,6 +2095,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -8192 @@ -2127,6 +2141,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2151,6 +2166,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -16384 @@ -2197,6 +2213,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2221,6 +2238,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936639 @@ -2267,6 +2285,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2291,6 +2310,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589936640 @@ -2337,6 +2357,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2361,6 +2382,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938687 @@ -2407,6 +2429,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2431,6 +2454,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589938688 @@ -2477,6 +2501,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2501,6 +2526,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942783 @@ -2547,6 +2573,7 @@ ; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; @@ -2571,6 +2598,7 @@ ; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 8589942784 @@ -2615,6 +2643,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773761 @@ -2659,6 +2688,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854773760 @@ -2703,6 +2733,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771713 @@ -2747,6 +2778,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854771712 @@ -2791,6 +2823,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767617 @@ -2835,6 +2868,7 @@ ; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %gep = getelementptr i8, ptr addrspace(1) %p, i64 -9223372036854767616 diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -51,6 +51,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -111,6 +112,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -171,6 +173,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -231,6 +234,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -267,6 +271,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -299,6 +304,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd double %a, 1.0 @@ -326,6 +332,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -353,6 +360,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 div:2 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 @@ -380,6 +388,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -407,6 +416,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_med3_f32 v0, v0, v1, v2 mul:2 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z) @@ -434,6 +444,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:2 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 @@ -461,6 +472,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -488,6 +500,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 mul:4 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd nsz double %a, 1.0 @@ -526,6 +539,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v1, off ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -554,6 +568,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:4 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -583,6 +598,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp div:2 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -618,6 +634,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -651,6 +668,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e64 v0, |v0|, 0.5 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -679,6 +697,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f32_e64 v0, v0, v0 clamp ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, %a @@ -711,6 +730,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %max = call float @llvm.maxnum.f32(float %a, float 0.0) @@ -743,6 +763,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v0, |v0|, |v0| ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = fadd float %a, 1.0 @@ -775,6 +796,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v0, |v0|, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = fadd float %a, 1.0 @@ -807,6 +829,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e64 v0, v0, |v0| ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %x = fadd float %a, 1.0 @@ -840,6 +863,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -873,6 +897,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v0, 0.5, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -905,6 +930,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd double %a, 1.0 @@ -937,6 +963,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd float %a, 1.0 @@ -969,6 +996,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd double %a, 1.0 @@ -1003,6 +1031,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd half %a, 1.0 @@ -1037,6 +1066,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, v0, v0 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd half %a, 1.0 @@ -1067,6 +1097,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f16_e64 v0, v0, 1.0 div:2 ; GFX11-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = fadd half %a, 1.0 @@ -1099,6 +1130,7 @@ ; GFX11-NEXT: v_add_f32_e64 v1, v1, v0 mul:2 ; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %mul = fmul float %a, %a diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -374,6 +374,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1013,6 +1014,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[3:4], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1437,6 +1439,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0 ; GFX11-NEXT: global_store_b32 v6, v0, s[34:35] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1738,6 +1741,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo ; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1994,6 +1998,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3 ; GFX11-NEXT: global_store_b32 v6, v0, s[34:35] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -2257,6 +2262,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %buffer2) { @@ -2661,6 +2667,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -2911,6 +2918,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -113,6 +113,7 @@ ; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -219,6 +220,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] ; GFX11-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -334,6 +336,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %aptr, align 4 @@ -448,6 +451,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -572,6 +576,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] ; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %aptr, align 4 @@ -711,6 +716,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1] ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -116,6 +116,7 @@ ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -230,6 +231,7 @@ ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -342,6 +344,7 @@ ; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -455,6 +458,7 @@ ; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -568,6 +572,7 @@ ; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -726,6 +731,7 @@ ; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[12:15], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -868,6 +874,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1008,6 +1015,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1150,6 +1158,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -1292,6 +1301,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -79,6 +79,7 @@ ; GFX11-NEXT: s_mov_b32 s4, s0 ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs @@ -159,6 +160,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -253,6 +255,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, s0, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -345,6 +348,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, s0 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -430,6 +434,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -511,6 +516,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -609,6 +615,7 @@ ; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1 ; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -705,6 +712,7 @@ ; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -73,6 +73,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -180,6 +181,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -260,6 +262,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -337,6 +340,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -414,6 +418,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -491,6 +496,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v1, 16, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -568,6 +574,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -645,6 +652,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v1, 17, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -722,6 +730,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -854,6 +863,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -940,6 +950,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1048,6 +1059,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1134,6 +1146,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1218,6 +1231,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1302,6 +1316,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1384,6 +1399,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1464,6 +1480,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1544,6 +1561,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1624,6 +1642,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1708,6 +1727,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1787,6 +1807,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1868,6 +1889,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1952,6 +1974,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2031,6 +2054,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2112,6 +2136,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2196,6 +2221,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2280,6 +2306,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2364,6 +2391,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2448,6 +2476,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2527,6 +2556,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2605,6 +2635,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -76,6 +76,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp i64 %in to half @@ -170,6 +171,7 @@ ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -248,6 +250,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp i64 %in to float @@ -339,6 +342,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -452,6 +456,7 @@ ; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 ; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x float> @@ -660,6 +665,7 @@ ; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 ; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 ; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -785,6 +791,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x half> @@ -1016,6 +1023,7 @@ ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 ; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -57,6 +57,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -125,6 +126,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -204,6 +206,7 @@ ; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -283,6 +286,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -377,6 +381,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -688,6 +688,7 @@ ; GFX11-NEXT: s_cbranch_scc0 .LBB8_2 ; GFX11-NEXT: ; %bb.1: ; %exit ; GFX11-NEXT: global_store_b32 v[0:1], v9, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB8_2: ; %bb @@ -720,6 +721,7 @@ ; GFX11-NEXT: v_mov_b32_e64 v9, -2 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: global_store_b32 v[0:1], v9, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB8_4: @@ -1103,6 +1105,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v0, 8 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB10_4: @@ -1257,6 +1260,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB11_5: ; %end +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB11_6: @@ -1521,6 +1525,7 @@ ; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: .LBB13_5: ; %UnifiedReturnBlock +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB13_6: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -79,6 +79,7 @@ ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -161,6 +162,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 @@ -199,6 +201,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load <2 x i16>, ptr addrspace(4) %in0 @@ -263,6 +266,7 @@ ; GFX11-NEXT: s_mov_b32 s4, s0 ; GFX11-NEXT: s_mov_b32 s5, s1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = sub <2 x i16> %a, %b @@ -329,6 +333,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -400,6 +405,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -469,6 +475,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -537,6 +544,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -607,6 +615,7 @@ ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0] ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -699,6 +708,7 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -801,6 +811,7 @@ ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -898,6 +909,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1005,6 +1017,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -64,6 +64,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp i64 %in to half @@ -145,6 +146,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -211,6 +213,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_ldexp_f32 v0, v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp i64 %in to float @@ -288,6 +291,7 @@ ; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX11-NEXT: v_ldexp_f32 v1, v1, v2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -377,6 +381,7 @@ ; GFX11-NEXT: v_ldexp_f32 v1, v0, s2 ; GFX11-NEXT: v_ldexp_f32 v0, v2, s3 ; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x float> @@ -536,6 +541,7 @@ ; GFX11-NEXT: v_ldexp_f32 v1, v6, v11 ; GFX11-NEXT: v_ldexp_f32 v0, v4, v5 ; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -637,6 +643,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x half> @@ -819,6 +826,7 @@ ; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3 ; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -57,6 +57,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -125,6 +126,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -204,6 +206,7 @@ ; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -283,6 +286,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -377,6 +381,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -75,6 +75,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -200,6 +201,7 @@ ; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 ; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0 ; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r0, diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -126,6 +126,7 @@ ; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SDAG-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -192,6 +193,7 @@ ; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm %src0 = trunc i32 %src0ext to i16 @@ -441,6 +443,7 @@ ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] ; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; @@ -515,6 +518,7 @@ ; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm %src.max = call <2 x i16> @llvm.smax.v2i16(<2 x i16> %src, <2 x i16> ) diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1637,6 +1637,7 @@ ; GFX11-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] ; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -1748,6 +1749,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ld8 = load <8 x i32>, ptr addrspace(4) %in, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -46,6 +46,7 @@ ; GFX11-NEXT: s_or_b32 s0, s0, 4 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 @@ -102,6 +103,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 @@ -159,6 +161,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 @@ -231,6 +234,7 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v[0:1], v4, off ; GFX11-NEXT: global_store_d16_hi_b8 v[2:3], v5, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i17, ptr addrspace(4) %arg, align 4 @@ -279,6 +283,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load half, ptr addrspace(4) %arg, align 4 @@ -350,6 +355,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, 0x300, v2 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load <2 x i8>, ptr addrspace(4) %arg, align 4 @@ -409,6 +415,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v2, 4, v2 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -461,6 +468,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 @@ -516,6 +524,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(4) %arg, align 4 @@ -574,6 +583,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i1, ptr addrspace(4) %arg, align 4 @@ -628,6 +638,7 @@ ; GFX11-NEXT: s_or_b32 s0, s0, 4 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(6) %arg, align 4 @@ -680,6 +691,7 @@ ; GFX11-NEXT: s_or_b32 s0, s0, 1 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %load = load i16, ptr addrspace(1) %arg, align 4, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll @@ -31,6 +31,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -54,6 +55,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -77,6 +79,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -98,6 +101,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -121,6 +125,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -142,6 +147,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 ; W32-NEXT: global_store_b128 v[26:27], v[16:19], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -165,6 +171,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -186,6 +193,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -207,6 +215,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -228,6 +237,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -249,6 +259,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -270,6 +281,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -291,6 +303,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -312,6 +325,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[18:19], v[12:15], off offset:16 ; W32-NEXT: global_store_b128 v[18:19], v[8:11], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -335,6 +349,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -356,6 +371,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -377,6 +393,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -398,6 +415,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -420,6 +438,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -441,6 +460,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -462,6 +482,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: @@ -483,6 +504,7 @@ ; W32-NEXT: s_clause 0x1 ; W32-NEXT: global_store_b128 v[14:15], v[8:11], off offset:16 ; W32-NEXT: global_store_b128 v[14:15], v[4:7], off +; W32-NEXT: s_nop 0 ; W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W32-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll --- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll @@ -27,6 +27,7 @@ ; W64-NEXT: v_wmma_f32_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -46,6 +47,7 @@ ; W64-NEXT: v_wmma_f32_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -65,6 +67,7 @@ ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -82,6 +85,7 @@ ; W64-NEXT: v_wmma_f16_16x16x16_f16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -101,6 +105,7 @@ ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -118,6 +123,7 @@ ; W64-NEXT: v_wmma_bf16_16x16x16_bf16 v[16:19], v[8:15], v[8:15], v[16:19] op_sel:[0,0,1] ; W64-NEXT: global_store_b128 v[20:21], v[24:27], off ; W64-NEXT: global_store_b128 v[22:23], v[16:19], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -137,6 +143,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -155,6 +162,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -172,6 +180,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -189,6 +198,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -206,6 +216,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -223,6 +234,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -240,6 +252,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -257,6 +270,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu8 v[8:11], v[4:7], v[4:7], v[8:11] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[12:13], v[16:19], off ; W64-NEXT: global_store_b128 v[14:15], v[8:11], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -276,6 +290,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -293,6 +308,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -310,6 +326,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -327,6 +344,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -344,6 +362,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -361,6 +380,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[0,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -378,6 +398,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,0,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: @@ -395,6 +416,7 @@ ; W64-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v[2:3], v[2:3], v[4:7] neg_lo:[1,1,0] clamp ; W64-NEXT: global_store_b128 v[8:9], v[12:15], off ; W64-NEXT: global_store_b128 v[10:11], v[4:7], off +; W64-NEXT: s_nop 0 ; W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; W64-NEXT: s_endpgm bb: