Index: llvm/lib/CodeGen/MachineCSE.cpp =================================================================== --- llvm/lib/CodeGen/MachineCSE.cpp +++ llvm/lib/CodeGen/MachineCSE.cpp @@ -265,8 +265,10 @@ } static bool isCallerPreservedOrConstPhysReg(MCRegister Reg, + const MachineOperand &MO, const MachineFunction &MF, - const TargetRegisterInfo &TRI) { + const TargetRegisterInfo &TRI, + const TargetInstrInfo &TII) { // MachineRegisterInfo::isConstantPhysReg directly called by // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the // reserved registers to be frozen. That doesn't cause a problem post-ISel as @@ -275,7 +277,7 @@ // It does cause issues mid-GlobalISel, however, hence the additional // reservedRegsFrozen check. const MachineRegisterInfo &MRI = MF.getRegInfo(); - return TRI.isCallerPreservedPhysReg(Reg, MF) || + return TRI.isCallerPreservedPhysReg(Reg, MF) || TII.isIgnorableUse(MO) || (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg)); } @@ -298,7 +300,8 @@ if (Register::isVirtualRegister(Reg)) continue; // Reading either caller preserved or constant physregs is ok. - if (!isCallerPreservedOrConstPhysReg(Reg.asMCReg(), *MI->getMF(), *TRI)) + if (!isCallerPreservedOrConstPhysReg(Reg.asMCReg(), MO, *MI->getMF(), *TRI, + *TII)) for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) PhysRefs.insert(*AI); } Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -498,32 +498,31 @@ ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -537,8 +536,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -573,30 +571,30 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -618,31 +616,31 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -681,31 +679,31 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -732,33 +730,32 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -836,32 +833,31 @@ ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 @@ -870,15 +866,14 @@ ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -913,33 +908,33 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v4, s5 -; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W64-NEXT: v_mov_b32_e32 v5, s5 +; GFX10W64-NEXT: buffer_atomic_add v4, v5, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: @@ -961,12 +956,13 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 ; GFX10W32-NEXT: ; %bb.1: @@ -974,21 +970,20 @@ ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s8, s[0:1], 0x44 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v4, s8 -; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[4:7], 0 idxen glc +; GFX10W32-NEXT: v_mov_b32_e32 v5, s8 +; GFX10W32-NEXT: buffer_atomic_add v4, v5, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB3_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: @@ -1027,34 +1022,34 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v4, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: v_mov_b32_e32 v5, s5 +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1081,14 +1076,14 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 ; GFX11W32-NEXT: ; %bb.1: @@ -1096,21 +1091,19 @@ ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mov_b32_e32 v4, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[4:7], 0 idxen glc +; GFX11W32-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s8 +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v5, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB3_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1685,32 +1678,31 @@ ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 @@ -1724,8 +1716,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -1760,30 +1751,30 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1805,31 +1796,31 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB7_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1868,31 +1859,31 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1919,33 +1910,32 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB7_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -491,48 +491,46 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 @@ -541,47 +539,45 @@ ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 @@ -618,6 +614,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -627,12 +624,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB2_2: @@ -667,6 +663,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -676,11 +673,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB2_2: @@ -731,8 +727,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -742,12 +739,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB2_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_2: @@ -786,8 +782,9 @@ ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 @@ -797,11 +794,10 @@ ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_2: @@ -835,8 +831,9 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -856,23 +853,23 @@ ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB3_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_u32 v0, v2 +; GFX8-NEXT: ds_add_u32 v2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -892,14 +889,13 @@ ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_u32 v0, v2 +; GFX9-NEXT: ds_add_u32 v2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_endpgm @@ -924,13 +920,13 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 ; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -955,16 +951,16 @@ ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_u32 v3, v0 +; GFX1032-NEXT: ds_add_u32 v0, v3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB3_2: @@ -996,17 +992,17 @@ ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_u32 v3, v0 +; GFX1164-NEXT: ds_add_u32 v0, v3 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB3_2: @@ -1031,17 +1027,16 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_u32 v3, v0 +; GFX1132-NEXT: ds_add_u32 v0, v3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB3_2: @@ -1059,19 +1054,19 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s5, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1081,8 +1076,8 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc @@ -1093,19 +1088,19 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1115,7 +1110,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v3, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1127,18 +1122,18 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1148,7 +1143,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v3, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1159,20 +1154,20 @@ ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: @@ -1181,7 +1176,7 @@ ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v3, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1191,19 +1186,19 @@ ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: @@ -1212,7 +1207,7 @@ ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v3, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1222,22 +1217,22 @@ ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: @@ -1246,7 +1241,7 @@ ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v3, 5, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1257,21 +1252,21 @@ ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v2, v[1:2] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: @@ -1280,7 +1275,7 @@ ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v3, 5, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1302,13 +1297,13 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 @@ -1345,6 +1340,7 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1355,9 +1351,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 ; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1384,6 +1379,7 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1397,7 +1393,6 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1421,6 +1416,7 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1429,7 +1425,6 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1459,14 +1454,14 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -1496,8 +1491,9 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1505,7 +1501,6 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1538,15 +1533,15 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -2145,48 +2140,46 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 @@ -2195,47 +2188,45 @@ ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 @@ -2272,6 +2263,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -2281,12 +2273,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_2: @@ -2321,6 +2312,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -2330,11 +2322,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB9_2: @@ -2385,8 +2376,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -2396,12 +2388,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB9_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_2: @@ -2440,8 +2431,9 @@ ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 @@ -2451,11 +2443,10 @@ ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_2: @@ -2489,8 +2480,9 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -2510,23 +2502,23 @@ ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_u32 v0, v2 +; GFX8-NEXT: ds_sub_u32 v2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB10_2: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -2546,14 +2538,13 @@ ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_u32 v0, v2 +; GFX9-NEXT: ds_sub_u32 v2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: s_endpgm @@ -2578,13 +2569,13 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 ; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2609,16 +2600,16 @@ ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_u32 v3, v0 +; GFX1032-NEXT: ds_sub_u32 v0, v3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: @@ -2650,17 +2641,17 @@ ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_u32 v3, v0 +; GFX1164-NEXT: ds_sub_u32 v0, v3 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB10_2: @@ -2685,17 +2676,16 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_u32 v3, v0 +; GFX1132-NEXT: ds_sub_u32 v0, v3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB10_2: @@ -2713,19 +2703,19 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s5, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2735,8 +2725,8 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -2747,19 +2737,19 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2767,8 +2757,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -2781,18 +2771,18 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2800,8 +2790,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v3 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -2813,20 +2803,20 @@ ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: @@ -2834,9 +2824,9 @@ ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -2848,19 +2838,19 @@ ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: @@ -2868,9 +2858,9 @@ ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -2882,31 +2872,31 @@ ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1164-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: ; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc @@ -2920,30 +2910,30 @@ ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo @@ -2968,13 +2958,13 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 @@ -3011,6 +3001,7 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3021,9 +3012,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 ; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3051,6 +3041,7 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3064,7 +3055,6 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3090,6 +3080,7 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3098,7 +3089,6 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -3131,14 +3121,14 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -3171,8 +3161,9 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3180,7 +3171,6 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -3215,15 +3205,15 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -3704,48 +3694,46 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB15_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 @@ -3754,47 +3742,45 @@ ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB15_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 @@ -3831,6 +3817,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -3840,12 +3827,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB15_2: @@ -3880,6 +3866,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -3889,11 +3876,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB15_2: @@ -3944,8 +3930,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -3955,12 +3942,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB15_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB15_2: @@ -3999,8 +3985,9 @@ ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 @@ -4010,11 +3997,10 @@ ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB15_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB15_2: @@ -4054,48 +4040,46 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB16_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 @@ -4104,47 +4088,45 @@ ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 @@ -4181,6 +4163,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -4190,12 +4173,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB16_2: @@ -4230,6 +4212,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -4239,11 +4222,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB16_2: @@ -4294,8 +4276,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -4305,12 +4288,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB16_2: @@ -4349,8 +4331,9 @@ ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 @@ -4360,11 +4343,10 @@ ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB16_2: @@ -5606,48 +5588,46 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_max_u32_e32 v0, s4, v0 @@ -5656,47 +5636,45 @@ ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB21_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_max_u32_e32 v0, s4, v0 @@ -5733,6 +5711,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -5742,12 +5721,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB21_2: @@ -5782,6 +5760,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -5791,11 +5770,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB21_2: @@ -5846,8 +5824,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -5857,12 +5836,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB21_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB21_2: @@ -5901,8 +5879,9 @@ ; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 @@ -5912,11 +5891,10 @@ ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB21_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB21_2: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -497,32 +497,31 @@ ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -536,8 +535,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -572,30 +570,30 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -617,31 +615,31 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -680,31 +678,31 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -731,33 +729,32 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB2_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1332,32 +1329,31 @@ ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 @@ -1371,8 +1367,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -1407,30 +1402,30 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1452,31 +1447,31 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1515,31 +1510,31 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1566,33 +1561,32 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc ; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -463,39 +463,37 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_2: @@ -503,7 +501,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, s1 @@ -513,39 +511,37 @@ ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_2: @@ -553,8 +549,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -589,31 +584,30 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W64-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -635,32 +629,31 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[4:7], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_add v4, v0, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -699,32 +692,31 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -751,34 +743,32 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v0, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB2_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1421,39 +1411,37 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_2: @@ -1461,7 +1449,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, s1 @@ -1471,39 +1459,37 @@ ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_2: @@ -1511,8 +1497,7 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] @@ -1547,31 +1532,30 @@ ; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc +; GFX10W64-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1593,32 +1577,31 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_mov_b32 s3, s4 ; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[4:7], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_sub v4, v0, s[4:7], 0 idxen glc ; GFX10W32-NEXT: .LBB7_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1657,32 +1640,31 @@ ; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1709,34 +1691,32 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 ; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_mov_b32 s3, s4 ; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[4:7], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, v0, s[4:7], 0 idxen glc ; GFX11W32-NEXT: .LBB7_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -176,8 +176,8 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS1-NEXT: .LBB1_9: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: flat_load_dword v0, v[0:1] +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 +; GLOBALNESS1-NEXT: flat_load_dword v0, v[32:33] ; GLOBALNESS1-NEXT: s_mov_b32 s68, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s70, s93 ; GLOBALNESS1-NEXT: s_mov_b32 s71, s69 @@ -217,8 +217,7 @@ ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off ; GLOBALNESS1-NEXT: v_readlane_b32 s4, v41, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s5, v41, 1 ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -226,6 +225,7 @@ ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[44:45], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -281,7 +281,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] @@ -290,15 +290,14 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], a[32:33], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], a[32:33], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -562,8 +561,8 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24 ; GLOBALNESS0-NEXT: .LBB1_9: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: flat_load_dword v0, v[0:1] +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 +; GLOBALNESS0-NEXT: flat_load_dword v0, v[32:33] ; GLOBALNESS0-NEXT: s_mov_b32 s68, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s93 ; GLOBALNESS0-NEXT: s_mov_b32 s71, s69 @@ -603,8 +602,7 @@ ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off ; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 0 ; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 1 ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -612,6 +610,7 @@ ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[44:45], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -667,7 +666,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] @@ -676,15 +675,14 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], a[32:33], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], a[32:33], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1