Index: llvm/include/llvm/CodeGen/MachineInstrBuilder.h =================================================================== --- llvm/include/llvm/CodeGen/MachineInstrBuilder.h +++ llvm/include/llvm/CodeGen/MachineInstrBuilder.h @@ -195,7 +195,7 @@ } const MachineInstrBuilder &addRegMask(const uint32_t *Mask) const { - MI->addOperand(*MF, MachineOperand::CreateRegMask(Mask)); + MI->addOperand(*MF, MachineOperand::CreateRegMask(Mask, MF)); return *this; } Index: llvm/include/llvm/CodeGen/MachineOperand.h =================================================================== --- llvm/include/llvm/CodeGen/MachineOperand.h +++ llvm/include/llvm/CodeGen/MachineOperand.h @@ -27,6 +27,7 @@ class ConstantInt; class GlobalValue; class MachineBasicBlock; +class MachineFunction; class MachineInstr; class MachineRegisterInfo; class MCCFIInstruction; @@ -704,10 +705,10 @@ /// operand does not take ownership of the memory referenced by Mask, it must /// remain valid for the lifetime of the operand. See CreateRegMask(). /// Any physreg with a 0 bit in the mask is clobbered by the instruction. - void setRegMask(const uint32_t *RegMaskPtr) { - assert(isRegMask() && "Wrong MachineOperand mutator"); - Contents.RegMask = RegMaskPtr; - } + /// The clobbers are also recorded in the MachineRegisterInfo of the function + /// containing \p MI; when \p MI is null, this operand's parent instruction + /// is used instead. + void setRegMask(const uint32_t *RegMaskPtr, MachineInstr *MI = nullptr); void setIntrinsicID(Intrinsic::ID IID) { assert(isIntrinsicID() && "Wrong MachineOperand mutator"); @@ -892,12 +893,11 @@ /// /// Any physreg with a 0 bit in the mask is clobbered by the instruction. 
/// - static MachineOperand CreateRegMask(const uint32_t *Mask) { - assert(Mask && "Missing register mask"); - MachineOperand Op(MachineOperand::MO_RegisterMask); - Op.Contents.RegMask = Mask; - return Op; - } + /// The clobbers described by \p Mask are also recorded in \p MF's + /// MachineRegisterInfo, so \p MF must not be null. + static MachineOperand CreateRegMask(const uint32_t *Mask, + MachineFunction *MF); + static MachineOperand CreateRegLiveOut(const uint32_t *Mask) { assert(Mask && "Missing live-out register mask"); MachineOperand Op(MachineOperand::MO_RegisterLiveOut); Index: llvm/include/llvm/CodeGen/MachineRegisterInfo.h =================================================================== --- llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -882,6 +882,10 @@ UsedPhysRegMask.setBitsNotInMask(RegMask); } + /// Rebuild UsedPhysRegMask from scratch by scanning the register-mask + /// operands of every instruction in the function. + void recollectUsedPhysRegMask(); + const BitVector &getUsedPhysRegsMask() const { return UsedPhysRegMask; } //===--------------------------------------------------------------------===// Index: llvm/lib/CodeGen/MIRParser/MIParser.cpp =================================================================== --- llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -2757,7 +2757,7 @@ if (expectAndConsume(MIToken::rparen)) return true; - Dest = MachineOperand::CreateRegMask(Mask); + Dest = MachineOperand::CreateRegMask(Mask, &MF); return false; } @@ -2870,7 +2870,7 @@ return true; case MIToken::Identifier: if (const auto *RegMask = PFS.Target.getRegMask(Token.stringValue())) { - Dest = MachineOperand::CreateRegMask(RegMask); + Dest = MachineOperand::CreateRegMask(RegMask, &MF); lex(); break; } else if (Token.stringValue() == "CustomRegMask") { Index: llvm/lib/CodeGen/MachineCSE.cpp =================================================================== --- llvm/lib/CodeGen/MachineCSE.cpp +++ llvm/lib/CodeGen/MachineCSE.cpp @@ -265,8 +265,10 @@ } static bool isCallerPreservedOrConstPhysReg(MCRegister Reg, + const MachineOperand &MO, const MachineFunction &MF, - const 
TargetRegisterInfo &TRI, + const TargetInstrInfo &TII) { // MachineRegisterInfo::isConstantPhysReg directly called by // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the // reserved registers to be frozen. That doesn't cause a problem post-ISel as @@ -275,7 +277,7 @@ // It does cause issues mid-GlobalISel, however, hence the additional // reservedRegsFrozen check. const MachineRegisterInfo &MRI = MF.getRegInfo(); - return TRI.isCallerPreservedPhysReg(Reg, MF) || + return TRI.isCallerPreservedPhysReg(Reg, MF) || TII.isIgnorableUse(MO) || (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg)); } @@ -298,7 +300,8 @@ if (Register::isVirtualRegister(Reg)) continue; // Reading either caller preserved or constant physregs is ok. - if (!isCallerPreservedOrConstPhysReg(Reg.asMCReg(), *MI->getMF(), *TRI)) + if (!isCallerPreservedOrConstPhysReg(Reg.asMCReg(), MO, *MI->getMF(), *TRI, + *TII)) for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) PhysRefs.insert(*AI); } Index: llvm/lib/CodeGen/MachineOperand.cpp =================================================================== --- llvm/lib/CodeGen/MachineOperand.cpp +++ llvm/lib/CodeGen/MachineOperand.cpp @@ -150,6 +150,30 @@ MF->getRegInfo().removeRegOperandFromUseList(this); } +void MachineOperand::setRegMask(const uint32_t *RegMaskPtr, MachineInstr *MI) { + assert(isRegMask() && "Wrong MachineOperand mutator"); + Contents.RegMask = RegMaskPtr; + if (MI == nullptr) + MI = ParentMI; + // A detached operand, or an instruction that is not in a basic block yet, + // has no function whose register info could be updated. + if (MI == nullptr || MI->getParent() == nullptr) + return; + // Record the physregs clobbered by the new mask in the function's MRI. + MI->getMF()->getRegInfo().addPhysRegsUsedFromRegMask(RegMaskPtr); +} + +MachineOperand MachineOperand::CreateRegMask(const uint32_t *Mask, + MachineFunction *MF) { + assert(Mask && "Missing register mask"); + assert(MF && "Missing machine function"); + MachineOperand Op(MachineOperand::MO_RegisterMask); + Op.Contents.RegMask = Mask; + // Record the physregs clobbered by the mask in the function's MRI. + MF->getRegInfo().addPhysRegsUsedFromRegMask(Mask); + return Op; +} + /// ChangeToImmediate - Replace this operand with a new immediate operand of /// the specified value. 
If an operand is known to be an immediate already, /// the setImm method should be used. Index: llvm/lib/CodeGen/MachineRegisterInfo.cpp =================================================================== --- llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -523,8 +523,11 @@ // used later. for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) if (!def_empty(*AI) || isAllocatable(*AI)) return false; - return true; + + // A register clobbered by some register mask is not constant either. This + // does not depend on the alias being visited, so test it once here. + return !UsedPhysRegMask.test(PhysReg); } @@ -584,6 +587,18 @@ return false; } +void MachineRegisterInfo::recollectUsedPhysRegMask() { + UsedPhysRegMask.reset(); + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB.instrs()) { + for (MachineOperand &MO : MI.operands()) { + if (MO.isRegMask()) + addPhysRegsUsedFromRegMask(MO.getRegMask()); + } + } + } +} + bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest) const { if (!SkipRegMaskTest && UsedPhysRegMask.test(PhysReg)) Index: llvm/lib/CodeGen/RegUsageInfoPropagate.cpp =================================================================== --- llvm/lib/CodeGen/RegUsageInfoPropagate.cpp +++ llvm/lib/CodeGen/RegUsageInfoPropagate.cpp @@ -143,6 +143,10 @@ } } + // MRI.UsedPhysRegMask should reflect the updated RegMask. + if (Changed) + MF.getRegInfo().recollectUsedPhysRegMask(); + LLVM_DEBUG( dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" "++++++ \n"); Index: llvm/lib/CodeGen/SelectionDAG/FastISel.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -863,7 +863,7 @@ // Push the register mask info. Ops.push_back(MachineOperand::CreateRegMask( - TRI.getCallPreservedMask(*FuncInfo.MF, CC))); + TRI.getCallPreservedMask(*FuncInfo.MF, CC), FuncInfo.MF)); // Add scratch registers as implicit def and early clobber. 
const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC); Index: llvm/lib/CodeGen/TargetInstrInfo.cpp =================================================================== --- llvm/lib/CodeGen/TargetInstrInfo.cpp +++ llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -969,7 +969,7 @@ // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. Alternatively, if it's allocatable, // it could get allocated to something with a def during allocation. - if (!MRI.isConstantPhysReg(Reg)) + if (!MRI.isConstantPhysReg(Reg) && !isIgnorableUse(MO)) return false; } else { // A physreg def. We can't remat it. Index: llvm/lib/Target/X86/X86InstrInfo.h =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.h +++ llvm/lib/Target/X86/X86InstrInfo.h @@ -577,6 +577,13 @@ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI, Register Reg) const override; + /// Given that \p MO is a PhysReg use, return true if it can be ignored for + /// the purpose of instruction rematerialization or sinking. + bool isIgnorableUse(const MachineOperand &MO) const override { + // An RIP relative address is a constant. 
+ return MO.getReg() == X86::RIP; + } + protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -503,32 +503,31 @@ ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -541,9 +540,8 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -580,29 +578,29 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 
-; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -627,30 +625,30 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -692,30 +690,30 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -746,31 +744,30 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB2_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: 
v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -849,32 +846,31 @@ ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp 
v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 @@ -883,15 +879,14 @@ ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -928,32 +923,32 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; 
implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB3_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_clause 0x1 ; GFX10W64-NEXT: s_load_dword s7, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v4, s7 -; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W64-NEXT: v_mov_b32_e32 v5, s7 +; GFX10W64-NEXT: buffer_atomic_add v4, v5, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB3_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: @@ -978,12 +973,13 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: 
s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 ; GFX10W32-NEXT: ; %bb.1: @@ -991,20 +987,19 @@ ; GFX10W32-NEXT: s_clause 0x1 ; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44 ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s5 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s5 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 -; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W32-NEXT: v_mov_b32_e32 v5, s6 +; GFX10W32-NEXT: buffer_atomic_add v4, v5, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB3_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: @@ -1046,33 +1041,33 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; 
GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_clause 0x1 ; GFX11W64-NEXT: s_load_b32 s7, s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v4, s7 -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: v_mov_b32_e32 v5, s7 +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB3_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1103,13 +1098,13 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; 
implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 ; GFX11W32-NEXT: ; %bb.1: @@ -1117,20 +1112,18 @@ ; GFX11W32-NEXT: s_clause 0x1 ; GFX11W32-NEXT: s_load_b32 s6, s[0:1], 0x44 ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v5, s6 +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB3_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1710,32 +1703,31 @@ ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: 
s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 @@ -1748,9 +1740,8 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: 
v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -1787,29 +1778,29 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB7_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1834,30 +1825,30 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB7_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1899,30 +1890,30 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: 
s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB7_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1953,31 +1944,30 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB7_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -498,47 +498,45 @@ ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 
s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -549,46 +547,44 @@ ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_add_rtn_u32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -629,6 +625,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -638,12 +635,11 @@ ; 
GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB2_2: @@ -680,6 +676,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -689,11 +686,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB2_2: @@ -746,8 +742,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -757,12 +754,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB2_2 ; 
GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB2_2: @@ -805,21 +801,20 @@ ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB2_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB2_2: @@ -852,8 +847,9 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -873,23 +869,23 @@ ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB3_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_u32 v0, v2 +; GFX8-NEXT: ds_add_u32 v2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -909,14 +905,13 @@ ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_u32 v0, v2 +; GFX9-NEXT: ds_add_u32 v2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_endpgm @@ -941,13 +936,13 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 ; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-NEXT: ; %bb.1: -; 
GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -972,16 +967,16 @@ ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_u32 v3, v0 +; GFX1032-NEXT: ds_add_u32 v0, v3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB3_2: @@ -1013,17 +1008,17 @@ ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_u32 v3, v0 +; GFX1164-NEXT: ds_add_u32 v0, v3 ; GFX1164-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB3_2: @@ -1048,17 +1043,16 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_u32 v3, v0 +; GFX1132-NEXT: ds_add_u32 v0, v3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB3_2: @@ -1074,30 +1068,30 @@ ; ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX7LESS-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: 
v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v4 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v4 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -1109,30 +1103,30 @@ ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v4, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 2 @@ -1142,29 +1136,29 @@ ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v4, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 2 @@ -1174,29 +1168,29 @@ ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v4, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1064-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: 
v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v4, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1207,27 +1201,27 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, s3, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] +; GFX1032-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v3 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v4, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1237,31 +1231,31 @@ ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 
s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_mul_i32 s2, s2, 5 +; GFX1164-NEXT: v_mov_b32_e32 v0, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v4, 5, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1273,29 +1267,29 @@ ; GFX1132: ; %bb.0: ; %entry ; 
GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, s3, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 -; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1132-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1132-NEXT: v_mov_b32_e32 v0, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: ds_add_rtn_u64 v[2:3], v1, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1132-NEXT: v_readfirstlane_b32 s3, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v4, 5, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) @@ -1317,13 +1311,13 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz 
.LBB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 @@ -1360,6 +1354,7 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1370,9 +1365,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 ; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1399,6 +1393,7 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -1412,7 +1407,6 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1436,6 +1430,7 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1444,7 +1439,6 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 
v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1474,14 +1468,14 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -1511,8 +1505,9 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1520,7 +1515,6 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB5_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1553,15 +1547,15 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: 
s_cbranch_execz .LBB5_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -2167,47 +2161,45 @@ ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 
row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -2218,46 +2210,44 @@ ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; 
GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_sub_rtn_u32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -2298,6 +2288,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -2307,12 +2298,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB9_2: @@ -2349,6 +2339,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -2358,11 +2349,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: 
buffer_gl0_inv ; GFX1032-NEXT: .LBB9_2: @@ -2415,8 +2405,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -2426,12 +2417,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB9_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_sub_rtn_u32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB9_2: @@ -2474,21 +2464,20 @@ ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_sub_rtn_u32 v0, 
v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB9_2: @@ -2521,8 +2510,9 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -2542,23 +2532,23 @@ ; GFX8-NEXT: v_readlane_b32 s2, v1, 63 ; GFX8-NEXT: s_mov_b64 exec, s[0:1] ; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_u32 v0, v2 +; GFX8-NEXT: ds_sub_u32 v2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB10_2: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -2578,14 +2568,13 @@ ; GFX9-NEXT: v_readlane_b32 s2, v1, 63 ; GFX9-NEXT: s_mov_b64 exec, s[0:1] ; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: ds_sub_u32 v0, v2 +; GFX9-NEXT: ds_sub_u32 v2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: s_endpgm @@ -2610,13 +2599,13 @@ ; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 ; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: v_mov_b32_e32 v3, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2641,16 +2630,16 @@ ; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_u32 v3, v0 +; GFX1032-NEXT: ds_sub_u32 v0, v3 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: @@ -2682,17 +2671,17 @@ ; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1164-NEXT: s_mov_b64 exec, s[0:1] ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1164-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_u32 v3, v0 +; GFX1164-NEXT: ds_sub_u32 v0, v3 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB10_2: @@ -2717,17 +2706,16 @@ ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_u32 v3, v0 +; GFX1132-NEXT: ds_sub_u32 v0, v3 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB10_2: @@ -2743,30 +2731,30 @@ ; ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX7LESS-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[2:3], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v3 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v4 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v4 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -2778,29 +2766,29 @@ ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: 
; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_sub_rtn_u64 v[2:3], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_readfirstlane_b32 s3, v1 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v3 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v4 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v4 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -2812,28 +2800,28 @@ ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, s3, v0 +; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: ds_sub_rtn_u64 v[2:3], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v4 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc @@ -2845,30 +2833,30 @@ ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: 
s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mul_i32 s2, s2, 5 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -2881,28 +2869,28 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; 
GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -2914,31 +2902,31 @@ ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_mov_b64 s[2:3], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, s3, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1164-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_mul_i32 s2, s2, 5 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1164-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc @@ -2953,29 +2941,29 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v3, s3, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v3 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[1:2] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v3 ; GFX1132-NEXT: v_readfirstlane_b32 s3, 
v1 -; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v3 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo @@ -3000,13 +2988,13 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 @@ -3043,6 +3031,7 @@ ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3053,9 +3042,8 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 ; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3083,6 +3071,7 @@ ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -3096,7 +3085,6 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; 
GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3122,6 +3110,7 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3130,7 +3119,6 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -3163,14 +3151,14 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -3203,8 +3191,9 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[6:7], exec -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3212,7 +3201,6 @@ ; GFX1164-NEXT: 
s_cbranch_execz .LBB12_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -3247,15 +3235,15 @@ ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo -; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 @@ -3748,47 +3736,45 @@ ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; 
GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB15_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -3799,46 +3785,44 @@ ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: 
; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf 
+; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB15_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_or_rtn_b32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -3879,6 +3863,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -3888,12 +3873,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB15_2: @@ -3930,6 +3914,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -3939,11 +3924,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB15_2: @@ -3996,8 +3980,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -4007,12 +3992,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB15_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB15_2: @@ -4055,21 +4039,20 @@ ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: 
s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB15_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB15_2: @@ -4109,47 +4092,45 @@ ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB16_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -4160,46 +4141,44 @@ ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; 
implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB16_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_xor_rtn_b32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -4240,6 +4219,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -4249,12 +4229,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB16_2: @@ -4291,6 +4270,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -4300,11 +4280,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: 
s_cbranch_execz .LBB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB16_2: @@ -4357,8 +4336,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -4368,12 +4348,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB16_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB16_2: @@ -4416,21 +4395,20 @@ ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 
; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB16_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB16_2: @@ -5690,47 +5668,45 @@ ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 +; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s4, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB21_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 @@ -5741,46 +5717,44 @@ ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 
v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 +; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s4, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz .LBB21_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; 
GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_u32 v0, v3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -5821,6 +5795,7 @@ ; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] @@ -5830,12 +5805,11 @@ ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 +; GFX1064-NEXT: v_mov_b32_e32 v0, s7 ; GFX1064-NEXT: s_mov_b32 s3, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB21_2: @@ -5872,6 +5846,7 @@ ; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 @@ -5881,11 +5856,10 @@ ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB21_2: @@ -5938,8 +5912,9 @@ ; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 ; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 ; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1164-NEXT: s_mov_b64 exec, s[4:5] @@ -5949,12 +5924,11 @@ ; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1164-NEXT: s_cbranch_execz .LBB21_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 +; GFX1164-NEXT: v_mov_b32_e32 v0, s7 ; GFX1164-NEXT: s_mov_b32 s3, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB21_2: @@ -5997,21 +5971,20 @@ ; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 ; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 ; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr0 ; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB21_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB21_2: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -502,32 +502,31 @@ ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 @@ -540,9 +539,8 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -579,29 +577,29 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: 
s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -626,30 +624,30 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 
glc +; GFX10W32-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -691,30 +689,30 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB2_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: 
v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -745,31 +743,30 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB2_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1349,32 +1346,31 @@ ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, 
v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 @@ -1387,9 +1383,8 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -1426,29 +1421,29 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W64-NEXT: ; %bb.1: ; 
GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc ; GFX10W64-NEXT: .LBB6_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1473,30 +1468,30 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: buffer_atomic_sub v4, off, 
s[8:11], 0 glc ; GFX10W32-NEXT: .LBB6_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1538,30 +1533,30 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc ; GFX11W64-NEXT: .LBB6_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1592,31 +1587,30 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc ; GFX11W32-NEXT: .LBB6_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) 
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -468,46 +468,44 @@ ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB2_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 @@ -518,48 +516,45 @@ ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; 
GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; 
GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB2_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -596,30 +591,29 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W64-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB2_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: 
v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: @@ -644,31 +638,30 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB2_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 
v4, v3 +; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: @@ -710,31 +703,30 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB2_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -765,32 +757,30 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB2_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; 
GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1438,46 +1428,44 @@ ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 
row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s6, v1, 63 ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc ; GFX8-NEXT: .LBB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 @@ -1488,48 +1476,45 @@ ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v1, 63 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc ; GFX9-NEXT: .LBB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; 
GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v3, v0, s[2:3] ; GFX9-NEXT: s_endpgm @@ -1566,30 +1551,29 @@ ; GFX10W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX10W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX10W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX10W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX10W64-NEXT: ; implicit-def: $vgpr4 ; GFX10W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc +; GFX10W64-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc ; GFX10W64-NEXT: .LBB7_2: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W64-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: @@ -1614,31 +1598,30 @@ ; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX10W32-NEXT: 
v_readlane_b32 s5, v1, 15 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10W32-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX10W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX10W32-NEXT: ; implicit-def: $vgpr4 ; GFX10W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: ; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX10W32-NEXT: s_mov_b32 s5, s6 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc +; GFX10W32-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc ; GFX10W32-NEXT: .LBB7_2: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX10W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX10W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v4, v0, s[2:3] +; GFX10W32-NEXT: global_store_dword v0, v4, s[2:3] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: @@ -1680,31 +1663,30 @@ ; GFX11W64-NEXT: v_readlane_b32 s8, v1, 47 ; GFX11W64-NEXT: v_writelane_b32 v3, s7, 32 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX11W64-NEXT: v_writelane_b32 v3, s8, 48 ; GFX11W64-NEXT: s_mov_b64 exec, s[4:5] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX11W64-NEXT: ; implicit-def: $vgpr4 ; GFX11W64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc ; GFX11W64-NEXT: .LBB7_2: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W64-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W64-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W64-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; @@ -1735,32 +1717,30 @@ ; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX11W32-NEXT: v_readlane_b32 s5, v1, 15 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: s_or_saveexec_b32 s4, -1 -; GFX11W32-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11W32-NEXT: v_writelane_b32 v3, s5, 16 ; GFX11W32-NEXT: s_mov_b32 exec_lo, s4 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11W32-NEXT: ; implicit-def: $vgpr4 ; GFX11W32-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v0, s6 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v4, s6 ; GFX11W32-NEXT: s_mov_b32 s5, s6 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc +; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc ; GFX11W32-NEXT: .LBB7_2: ; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, v3 -; GFX11W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11W32-NEXT: v_readfirstlane_b32 s0, v4 +; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s0, v4 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v4, v0, s[2:3] +; GFX11W32-NEXT: global_store_b32 v0, v4, s[2:3] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -173,18 +173,24 @@ ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_mov_b32 
s18, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s12, 0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_mov_b32 s19, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccnz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %if.else ; GCN-NEXT: s_add_u32 s8, s8, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v3 ; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: v_or3_b32 v31, v4, v1, v0 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s14, s16 @@ -192,13 +198,7 @@ ; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_branch .LBB4_3 -; GCN-NEXT: .LBB4_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: .LBB4_3: ; %if.end +; GCN-NEXT: .LBB4_2: ; %if.end ; GCN-NEXT: global_store_short v[0:1], v1, off ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm @@ -226,18 +226,24 @@ ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GCN-NEXT: s_add_u32 s0, s0, s17 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b32 s32, 0 +; GCN-NEXT: s_mov_b32 s18, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bitcmp1_b32 s12, 0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 ; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_mov_b32 s19, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NEXT: s_and_b64 vcc, exec, s[12:13] +; GCN-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccnz .LBB5_2 ; GCN-NEXT: ; %bb.1: ; %if.else ; GCN-NEXT: s_add_u32 
s8, s8, 8 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v3 ; GCN-NEXT: s_addc_u32 s9, s9, 0 -; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 +; GCN-NEXT: v_or3_b32 v31, v4, v1, v0 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s14, s16 @@ -245,13 +251,7 @@ ; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] -; GCN-NEXT: s_branch .LBB5_3 -; GCN-NEXT: .LBB5_2: -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: .LBB5_3: ; %if.end +; GCN-NEXT: .LBB5_2: ; %if.end ; GCN-NEXT: global_store_short v[0:1], v1, off ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -31,12 +31,12 @@ ; GLOBALNESS1-LABEL: kernel: ; GLOBALNESS1: ; %bb.0: ; %bb ; GLOBALNESS1-NEXT: s_load_dwordx4 s[56:59], s[8:9], 0x0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v44, 0 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v44, off +; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; GLOBALNESS1-NEXT: global_store_dword v[46:47], v44, off ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: global_load_dword v0, v44, s[56:57] ; GLOBALNESS1-NEXT: s_mov_b32 s61, 0 @@ -274,7 +274,7 @@ ; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: 
buffer_store_dword v44, off, s[0:3], 0 -; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v56, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] @@ -433,8 +433,7 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS1-NEXT: ; %bb.11: ; %baz.exit.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: flat_load_dword v0, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v0, v[46:47] ; GLOBALNESS1-NEXT: v_readlane_b32 s60, v41, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s61, v41, 1 ; GLOBALNESS1-NEXT: v_readlane_b32 s62, v41, 2 @@ -573,8 +572,7 @@ ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS1-NEXT: ; %bb.12: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[46:47], off ; GLOBALNESS1-NEXT: v_readlane_b32 s4, v41, 36 ; GLOBALNESS1-NEXT: v_readlane_b32 s5, v41, 37 ; GLOBALNESS1-NEXT: s_mov_b32 s91, s59 @@ -583,10 +581,10 @@ ; GLOBALNESS1-NEXT: ; %bb.13: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: .LBB1_14: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS1-NEXT: v_readlane_b32 s62, v41, 32 ; GLOBALNESS1-NEXT: v_readlane_b32 s64, v41, 34 @@ -640,7 +638,6 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[52:53] -; GLOBALNESS1-NEXT: 
v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] @@ -649,15 +646,14 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s13, s45 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], a[32:33], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], a[32:33], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_15 ; GLOBALNESS1-NEXT: ; %bb.24: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_17 Depth=2 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[0:1], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_branch .LBB1_15 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -743,14 +739,12 @@ ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -793,12 +787,12 @@ ; GLOBALNESS0-LABEL: kernel: ; GLOBALNESS0: ; %bb.0: ; %bb ; GLOBALNESS0-NEXT: s_load_dwordx4 s[56:59], s[8:9], 
0x0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v44, 0 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v44, off +; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 +; GLOBALNESS0-NEXT: global_store_dword v[46:47], v44, off ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: global_load_dword v0, v44, s[56:57] ; GLOBALNESS0-NEXT: s_mov_b32 s61, 0 @@ -1036,7 +1030,7 @@ ; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: buffer_store_dword v44, off, s[0:3], 0 -; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v56, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] @@ -1195,8 +1189,7 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.11: ; %baz.exit.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: flat_load_dword v0, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v0, v[46:47] ; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 0 ; GLOBALNESS0-NEXT: v_readlane_b32 s61, v41, 1 ; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 2 @@ -1335,8 +1328,7 @@ ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26 ; GLOBALNESS0-NEXT: ; %bb.12: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[46:47], off ; GLOBALNESS0-NEXT: v_readlane_b32 s4, v41, 36 ; GLOBALNESS0-NEXT: v_readlane_b32 s5, v41, 37 ; GLOBALNESS0-NEXT: s_mov_b32 s91, s59 @@ -1345,10 +1337,10 @@ ; GLOBALNESS0-NEXT: ; %bb.13: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 
Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: .LBB1_14: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v56 ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc ; GLOBALNESS0-NEXT: v_readlane_b32 s60, v41, 34 ; GLOBALNESS0-NEXT: v_readlane_b32 s62, v41, 32 @@ -1402,7 +1394,6 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] @@ -1411,15 +1402,14 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s13, s45 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s44 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], a[32:33], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], a[32:33], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[100:101] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_15 ; GLOBALNESS0-NEXT: ; %bb.24: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_17 Depth=2 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[0:1], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_branch .LBB1_15 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -1505,14 +1495,12 @@ ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 
v[32:33], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v45, v44 -; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[44:45], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] Index: llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -333,15 +333,15 @@ ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_writelane_b32 v3, s33, 2 +; GFX9-O0-NEXT: v_writelane_b32 v4, s33, 2 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O0-NEXT: v_writelane_b32 v4, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v4, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 @@ -363,19 +363,19 @@ ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 
v0, v2 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v2, v3, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v4, 0 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; GFX9-O0-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s33, v4, 2 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -384,15 +384,15 @@ ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v3, s33, 2 -; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O3-NEXT: 
v_writelane_b32 v4, s33, 2 +; GFX9-O3-NEXT: v_writelane_b32 v4, s30, 0 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 -; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O3-NEXT: v_writelane_b32 v4, s31, 1 ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 @@ -403,19 +403,19 @@ ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-O3-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O3-NEXT: v_readlane_b32 s31, v4, 1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v4, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-O3-NEXT: v_readlane_b32 s33, v3, 2 +; GFX9-O3-NEXT: v_readlane_b32 s33, v4, 2 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -517,36 +517,36 @@ ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_writelane_b32 v10, s33, 8 +; GFX9-O0-NEXT: v_writelane_b32 v13, s33, 8 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 -; 
GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 +; GFX9-O0-NEXT: v_writelane_b32 v13, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v13, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 ; GFX9-O0-NEXT: s_mov_b32 s37, s5 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s39, s7 -; GFX9-O0-NEXT: v_writelane_b32 v10, s36, 2 -; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 -; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 -; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 +; GFX9-O0-NEXT: v_writelane_b32 v13, s36, 2 +; GFX9-O0-NEXT: v_writelane_b32 v13, s37, 3 +; GFX9-O0-NEXT: v_writelane_b32 v13, s38, 4 +; GFX9-O0-NEXT: v_writelane_b32 v13, s39, 5 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35 @@ -558,12 +558,12 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 -; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-O0-NEXT: v_writelane_b32 v13, s34, 6 +; GFX9-O0-NEXT: v_writelane_b32 v13, s35, 7 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9] +; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s34, v[8:9] ; GFX9-O0-NEXT: s_getpc_b64 s[34:35] ; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -572,55 +572,53 @@ ; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1] ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10 
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v11 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 -; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 -; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 -; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 -; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 -; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-O0-NEXT: v_readlane_b32 s34, v13, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v13, 7 +; GFX9-O0-NEXT: v_readlane_b32 s36, v13, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v13, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v13, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v13, 5 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec +; GFX9-O0-NEXT: v_add_co_u32_e64 v8, s[40:41], v10, v8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v9, s[40:41], v11, v9, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v9 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: v_readlane_b32 s31, v13, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v13, 0 ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 -; GFX9-O0-NEXT: v_readlane_b32 s33, v10, 8 +; GFX9-O0-NEXT: v_readlane_b32 s33, v13, 8 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; 
GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: 
s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -629,21 +627,18 @@ ; GFX9-O3: ; %bb.0: ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v8, s33, 2 -; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 +; GFX9-O3-NEXT: v_writelane_b32 v10, s33, 2 +; GFX9-O3-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 -; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 +; GFX9-O3-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4 @@ -661,31 +656,27 @@ ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; 
GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-O3-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O3-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 +; GFX9-O3-NEXT: v_readlane_b32 s31, v10, 1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v10, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-O3-NEXT: v_readlane_b32 s33, v8, 2 +; GFX9-O3-NEXT: v_readlane_b32 s33, v10, 2 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 ; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O3-NEXT: s_nop 0 -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/wwm-reserved.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -118,9 +118,9 @@ ; GFX9: v_mov_b32_e32 v0, v2 ; GFX9: s_swappc_b64 %tmp134 = call i32 @called(i32 %tmp107) -; GFX9: v_mov_b32_e32 v1, v0 -; GFX9-O3: v_add_u32_e32 v1, v1, v2 -; GFX9-O0: v_add_u32_e64 v1, v1, v2 +; GFX9: v_mov_b32_e32 v3, v0 +; GFX9-O3: v_add_u32_e32 v2, v3, v2 +; GFX9-O0: v_add_u32_e64 v2, v3, v2 %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) ; GFX9: buffer_store_dword v0 @@ -309,9 +309,9 @@ ; GFX9: v_mov_b32_e32 v0, v2 ; GFX9: s_swappc_b64 %tmp134 = call i32 @strict_wwm_called(i32 %tmp107) -; GFX9: v_mov_b32_e32 v1, v0 -; GFX9-O3: v_add_u32_e32 v1, v1, v2 -; GFX9-O0: v_add_u32_e64 v1, v1, v2 +; GFX9: v_mov_b32_e32 v3, v0 +; GFX9-O3: v_add_u32_e32 v2, v3, v2 +; GFX9-O0: v_add_u32_e64 v2, v3, v2 %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) ; GFX9: buffer_store_dword v0 Index: llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll =================================================================== --- llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll +++ llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll @@ -2,18 +2,18 @@ ; Without strictfp, CSE should be free to eliminate the repeated multiply ; and conversion instructions. 
; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 ; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ -; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2 +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 ; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \ -; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \ -; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2 +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4 @IndirectCallPtr = dso_local local_unnamed_addr global void (...)* null, align 8 define dso_local signext i32 @func1() local_unnamed_addr #0 { Index: llvm/test/CodeGen/PowerPC/cxx_tlscc64.ll =================================================================== --- llvm/test/CodeGen/PowerPC/cxx_tlscc64.ll +++ llvm/test/CodeGen/PowerPC/cxx_tlscc64.ll @@ -19,10 +19,11 @@ ; CHECK-NEXT: std 0, 16(1) ; CHECK-NEXT: stdu 1, -48(1) ; CHECK-NEXT: addis 3, 13, __tls_guard@tprel@ha -; CHECK-NEXT: lbz 4, __tls_guard@tprel@l(3) -; CHECK-NEXT: andi. 4, 4, 1 +; CHECK-NEXT: lbz 3, __tls_guard@tprel@l(3) +; CHECK-NEXT: andi. 
3, 3, 1 ; CHECK-NEXT: bc 12, 1, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %init.i +; CHECK-NEXT: addis 3, 13, __tls_guard@tprel@ha ; CHECK-NEXT: li 4, 1 ; CHECK-NEXT: stb 4, __tls_guard@tprel@l(3) ; CHECK-NEXT: addis 3, 13, sg@tprel@ha Index: llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll =================================================================== --- llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll +++ llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll @@ -1231,10 +1231,10 @@ ; P8-NEXT: bl __gnu_h2f_ieee ; P8-NEXT: nop ; P8-NEXT: xxlxor f0, f0, f0 +; P8-NEXT: addis r3, r2, .LCPI20_0@toc@ha ; P8-NEXT: fcmpu cr0, f1, f0 ; P8-NEXT: beq cr0, .LBB20_2 ; P8-NEXT: # %bb.1: -; P8-NEXT: addis r3, r2, .LCPI20_0@toc@ha ; P8-NEXT: lfs f0, .LCPI20_0@toc@l(r3) ; P8-NEXT: .LBB20_2: ; P8-NEXT: fmr f1, f0 Index: llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll =================================================================== --- llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll +++ llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll @@ -154,7 +154,7 @@ entry: ;CHECK-LABEL: restore_sethi: ;CHECK-NOT: sethi 3 -;CHECK: restore %g0, 3072, %o0 +;CHECK: restore %0 = tail call i32 @bar(i32 %a) nounwind %1 = icmp ne i32 %0, 0 %2 = select i1 %1, i32 3072, i32 0 Index: llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -15,7 +15,7 @@ ; SPARC-NEXT: mov %i3, %l7 ; SPARC-NEXT: mov %i2, %l5 ; SPARC-NEXT: mov %i1, %l4 -; SPARC-NEXT: mov %i0, %l6 +; SPARC-NEXT: mov %i0, %l0 ; SPARC-NEXT: sra %i0, 31, %o4 ; SPARC-NEXT: st %o4, [%sp+96] ; SPARC-NEXT: st %o4, [%sp+92] @@ -30,41 +30,41 @@ ; SPARC-NEXT: st %o2, [%fp+-20] ! 4-byte Folded Spill ; SPARC-NEXT: st %o3, [%fp+-24] ! 
4-byte Folded Spill ; SPARC-NEXT: st %i5, [%sp+96] -; SPARC-NEXT: mov %g0, %l0 +; SPARC-NEXT: mov %g0, %l6 ; SPARC-NEXT: st %i4, [%sp+92] -; SPARC-NEXT: mov %l0, %o0 -; SPARC-NEXT: mov %l0, %o1 +; SPARC-NEXT: mov %l6, %o0 +; SPARC-NEXT: mov %l6, %o1 ; SPARC-NEXT: mov %i2, %o2 ; SPARC-NEXT: mov %i3, %o3 -; SPARC-NEXT: mov %l0, %o4 +; SPARC-NEXT: mov %l6, %o4 ; SPARC-NEXT: call __multi3 -; SPARC-NEXT: mov %l0, %o5 +; SPARC-NEXT: mov %l6, %o5 ; SPARC-NEXT: st %o0, [%fp+-28] ! 4-byte Folded Spill ; SPARC-NEXT: st %o1, [%fp+-32] ! 4-byte Folded Spill ; SPARC-NEXT: st %o2, [%fp+-36] ! 4-byte Folded Spill ; SPARC-NEXT: mov %o3, %i3 ; SPARC-NEXT: st %l3, [%sp+96] ; SPARC-NEXT: st %l2, [%sp+92] -; SPARC-NEXT: mov %l0, %o0 -; SPARC-NEXT: mov %l0, %o1 +; SPARC-NEXT: mov %l6, %o0 +; SPARC-NEXT: mov %l6, %o1 ; SPARC-NEXT: mov %i2, %o2 ; SPARC-NEXT: mov %l7, %o3 -; SPARC-NEXT: mov %l0, %o4 +; SPARC-NEXT: mov %l6, %o4 ; SPARC-NEXT: call __multi3 -; SPARC-NEXT: mov %l0, %o5 +; SPARC-NEXT: mov %l6, %o5 ; SPARC-NEXT: mov %o0, %i0 ; SPARC-NEXT: mov %o1, %i1 ; SPARC-NEXT: st %o2, [%fp+-4] ! 4-byte Folded Spill ; SPARC-NEXT: st %o3, [%fp+-8] ! 
4-byte Folded Spill ; SPARC-NEXT: st %l3, [%sp+96] ; SPARC-NEXT: st %l2, [%sp+92] -; SPARC-NEXT: mov %l0, %o0 -; SPARC-NEXT: mov %l0, %o1 -; SPARC-NEXT: mov %l6, %o2 +; SPARC-NEXT: mov %l6, %o0 +; SPARC-NEXT: mov %l6, %o1 +; SPARC-NEXT: mov %l0, %o2 ; SPARC-NEXT: mov %l4, %o3 -; SPARC-NEXT: mov %l0, %o4 +; SPARC-NEXT: mov %l6, %o4 ; SPARC-NEXT: call __multi3 -; SPARC-NEXT: mov %l0, %o5 +; SPARC-NEXT: mov %l6, %o5 ; SPARC-NEXT: mov %o0, %l2 ; SPARC-NEXT: mov %o1, %l3 ; SPARC-NEXT: mov %o2, %l1 @@ -75,7 +75,7 @@ ; SPARC-NEXT: mov %o0, %o1 ; SPARC-NEXT: mov %o0, %o2 ; SPARC-NEXT: mov %o0, %o3 -; SPARC-NEXT: mov %l6, %o4 +; SPARC-NEXT: mov %l0, %o4 ; SPARC-NEXT: call __multi3 ; SPARC-NEXT: mov %l4, %o5 ; SPARC-NEXT: st %i5, [%sp+96] @@ -101,15 +101,15 @@ ; SPARC-NEXT: addxcc %g3, 0, %g3 ; SPARC-NEXT: addcc %i2, %i3, %i2 ; SPARC-NEXT: addxcc %g2, %g3, %i3 -; SPARC-NEXT: addxcc %l0, 0, %l1 -; SPARC-NEXT: addxcc %l0, 0, %l2 -; SPARC-NEXT: mov %l0, %o0 -; SPARC-NEXT: mov %l0, %o1 -; SPARC-NEXT: mov %l6, %o2 +; SPARC-NEXT: addxcc %l6, 0, %l1 +; SPARC-NEXT: addxcc %l6, 0, %l2 +; SPARC-NEXT: mov %l6, %o0 +; SPARC-NEXT: mov %l6, %o1 +; SPARC-NEXT: mov %l0, %o2 ; SPARC-NEXT: mov %l4, %o3 -; SPARC-NEXT: mov %l0, %o4 +; SPARC-NEXT: mov %l6, %o4 ; SPARC-NEXT: call __multi3 -; SPARC-NEXT: mov %l0, %o5 +; SPARC-NEXT: mov %l6, %o5 ; SPARC-NEXT: addcc %o3, %i2, %i2 ; SPARC-NEXT: addxcc %o2, %i3, %i3 ; SPARC-NEXT: addxcc %o1, %l1, %g2 @@ -127,15 +127,15 @@ ; SPARC-NEXT: or %i2, %i4, %i2 ; SPARC-NEXT: or %i2, %i3, %i2 ; SPARC-NEXT: cmp %i2, 0 -; SPARC-NEXT: be .LBB0_2 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.1: -; SPARC-NEXT: mov 1, %l0 +; SPARC-NEXT: bne .LBB0_2 +; SPARC-NEXT: mov 1, %i4 +; SPARC-NEXT: ! %bb.1: ! %start +; SPARC-NEXT: mov %l6, %i4 ; SPARC-NEXT: .LBB0_2: ! %start ; SPARC-NEXT: ld [%fp+-4], %i2 ! 4-byte Folded Reload ; SPARC-NEXT: ld [%fp+-8], %i3 ! 
4-byte Folded Reload ; SPARC-NEXT: ret -; SPARC-NEXT: restore %g0, %l0, %o4 +; SPARC-NEXT: restore ; ; SPARC64-LABEL: muloti_test: ; SPARC64: .cfi_startproc Index: llvm/test/CodeGen/X86/2008-10-27-CoalescerBug.ll =================================================================== --- llvm/test/CodeGen/X86/2008-10-27-CoalescerBug.ll +++ llvm/test/CodeGen/X86/2008-10-27-CoalescerBug.ll @@ -3,6 +3,12 @@ ; Now this test spills one register. But a reload in the loop is cheaper than ; the divsd so it's a win. +; FIXME: MachineLICM fails to move DIVSDrr out of the loop because it uses +; register $mxcsr, which is clobbered by the function call to sin. We need to +; model the volatile and non-volatile parts of $mxcsr separately, so that +; DIVSDrr uses the non-volatile part and the function call clobbers the +; volatile part; then we can safely move DIVSDrr out of the loop. + define fastcc void @fourn(ptr %data, i32 %isign) nounwind { ; CHECK: fourn entry: @@ -15,10 +21,9 @@ %1 = icmp sgt i32 %0, 2 ; [#uses=1] br i1 %1, label %bb30.loopexit, label %bb -; CHECK: %bb30.loopexit +; CHECK: %bb18 ; CHECK: divsd %xmm0 -; CHECK: movsd %xmm0, 16(%esp) -; CHECK: %bb3 +; CHECK: movsd %xmm0, (%esp) bb3: ; preds = %bb30.loopexit, %bb25, %bb3 %2 = load i32, ptr null, align 4 ; [#uses=1] %3 = mul i32 %2, 0 ; [#uses=1] Index: llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll =================================================================== --- llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll +++ llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll @@ -380,10 +380,12 @@ ; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: testq %rdi, %rdi ; SSE2-NEXT: cmovnsq %rdi, %rcx -; SSE2-NEXT: cvtsi2ss %rcx, %xmm0 -; SSE2-NEXT: jns .LBB9_2 +; SSE2-NEXT: cvtsi2ss %rcx, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: addss %xmm1, %xmm0 +; SSE2-NEXT: js .LBB9_2 ; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: .LBB9_2: ; SSE2-NEXT: pushq %rax ; SSE2-NEXT: 
callq __truncsfhf2@PLT Index: llvm/test/CodeGen/X86/half.ll =================================================================== --- llvm/test/CodeGen/X86/half.ll +++ llvm/test/CodeGen/X86/half.ll @@ -367,19 +367,19 @@ ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rsi, %rbx -; CHECK-LIBCALL-NEXT: testq %rdi, %rdi -; CHECK-LIBCALL-NEXT: js .LBB10_1 -; CHECK-LIBCALL-NEXT: # %bb.2: -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: jmp .LBB10_3 -; CHECK-LIBCALL-NEXT: .LBB10_1: ; CHECK-LIBCALL-NEXT: movq %rdi, %rax ; CHECK-LIBCALL-NEXT: shrq %rax -; CHECK-LIBCALL-NEXT: andl $1, %edi -; CHECK-LIBCALL-NEXT: orq %rax, %rdi -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 +; CHECK-LIBCALL-NEXT: movl %edi, %ecx +; CHECK-LIBCALL-NEXT: andl $1, %ecx +; CHECK-LIBCALL-NEXT: orq %rax, %rcx +; CHECK-LIBCALL-NEXT: cvtsi2ss %rcx, %xmm0 ; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: .LBB10_3: +; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm1 +; CHECK-LIBCALL-NEXT: testq %rdi, %rdi +; CHECK-LIBCALL-NEXT: js .LBB10_2 +; CHECK-LIBCALL-NEXT: # %bb.1: +; CHECK-LIBCALL-NEXT: movaps %xmm1, %xmm0 +; CHECK-LIBCALL-NEXT: .LBB10_2: ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) Index: llvm/test/CodeGen/X86/pr29112.ll =================================================================== --- llvm/test/CodeGen/X86/pr29112.ll +++ llvm/test/CodeGen/X86/pr29112.ll @@ -37,20 +37,20 @@ ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3] ; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3] ; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm8 ; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3] ; CHECK-NEXT: 
vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2] ; CHECK-NEXT: vaddps %xmm2, %xmm14, %xmm2 ; CHECK-NEXT: vmovaps %xmm13, %xmm1 ; CHECK-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm10, %xmm13, %xmm10 -; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm3 +; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm3 +; CHECK-NEXT: vaddps %xmm13, %xmm13, %xmm8 ; CHECK-NEXT: vaddps %xmm12, %xmm14, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm8, %xmm0 +; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm13, %xmm0 -; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm10, (%rsp) ; CHECK-NEXT: vmovaps %xmm9, %xmm3 ; CHECK-NEXT: vzeroupper Index: llvm/test/CodeGen/X86/sqrt-partial.ll =================================================================== --- llvm/test/CodeGen/X86/sqrt-partial.ll +++ llvm/test/CodeGen/X86/sqrt-partial.ll @@ -12,22 +12,24 @@ define float @f(float %val) nounwind { ; SSE-LABEL: f: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: ucomiss %xmm1, %xmm0 +; SSE-NEXT: sqrtss %xmm0, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: ucomiss %xmm2, %xmm0 ; SSE-NEXT: jb .LBB0_2 ; SSE-NEXT: # %bb.1: # %.split -; SSE-NEXT: sqrtss %xmm0, %xmm0 +; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; SSE-NEXT: .LBB0_2: # %call.sqrt ; SSE-NEXT: jmp sqrtf # TAILCALL ; ; AVX-LABEL: f: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vucomiss %xmm2, %xmm0 ; AVX-NEXT: jb .LBB0_2 ; AVX-NEXT: # %bb.1: # %.split -; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %xmm1, %xmm0 ; AVX-NEXT: retq ; AVX-NEXT: .LBB0_2: # %call.sqrt ; AVX-NEXT: jmp sqrtf # TAILCALL @@ -38,22 +40,24 @@ define double @d(double %val) nounwind { ; SSE-LABEL: d: ; SSE: # %bb.0: -; SSE-NEXT: xorpd %xmm1, %xmm1 -; SSE-NEXT: ucomisd %xmm1, %xmm0 +; SSE-NEXT: 
sqrtsd %xmm0, %xmm1 +; SSE-NEXT: xorpd %xmm2, %xmm2 +; SSE-NEXT: ucomisd %xmm2, %xmm0 ; SSE-NEXT: jb .LBB1_2 ; SSE-NEXT: # %bb.1: # %.split -; SSE-NEXT: sqrtsd %xmm0, %xmm0 +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; SSE-NEXT: .LBB1_2: # %call.sqrt ; SSE-NEXT: jmp sqrt # TAILCALL ; ; AVX-LABEL: d: ; AVX: # %bb.0: -; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vucomisd %xmm1, %xmm0 +; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vucomisd %xmm2, %xmm0 ; AVX-NEXT: jb .LBB1_2 ; AVX-NEXT: # %bb.1: # %.split -; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovapd %xmm1, %xmm0 ; AVX-NEXT: retq ; AVX-NEXT: .LBB1_2: # %call.sqrt ; AVX-NEXT: jmp sqrt # TAILCALL Index: llvm/test/CodeGen/X86/sse-intel-ocl.ll =================================================================== --- llvm/test/CodeGen/X86/sse-intel-ocl.ll +++ llvm/test/CodeGen/X86/sse-intel-ocl.ll @@ -16,14 +16,14 @@ ; WIN32-NEXT: subl $80, %esp ; WIN32-NEXT: movups 72(%ebp), %xmm4 ; WIN32-NEXT: movups 8(%ebp), %xmm3 -; WIN32-NEXT: addps %xmm4, %xmm3 -; WIN32-NEXT: movups 56(%ebp), %xmm4 -; WIN32-NEXT: movups 40(%ebp), %xmm5 -; WIN32-NEXT: movups 24(%ebp), %xmm6 +; WIN32-NEXT: movups 56(%ebp), %xmm5 +; WIN32-NEXT: movups 40(%ebp), %xmm6 +; WIN32-NEXT: movups 24(%ebp), %xmm7 ; WIN32-NEXT: movl %esp, %eax -; WIN32-NEXT: addps %xmm6, %xmm0 -; WIN32-NEXT: addps %xmm5, %xmm1 -; WIN32-NEXT: addps %xmm4, %xmm2 +; WIN32-NEXT: addps %xmm7, %xmm0 +; WIN32-NEXT: addps %xmm6, %xmm1 +; WIN32-NEXT: addps %xmm5, %xmm2 +; WIN32-NEXT: addps %xmm4, %xmm3 ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: calll _func_float16_ptr ; WIN32-NEXT: addl $4, %esp Index: llvm/test/CodeGen/X86/swifterror.ll =================================================================== --- llvm/test/CodeGen/X86/swifterror.ll +++ llvm/test/CodeGen/X86/swifterror.ll @@ -243,8 +243,6 @@ ; CHECK-i386-NEXT: .cfi_offset %edi, -8 ; CHECK-i386-NEXT: movl 32(%esp), %esi ; CHECK-i386-NEXT: leal 16(%esp), %edi 
-; CHECK-i386-NEXT: fld1 -; CHECK-i386-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-i386-NEXT: LBB2_1: ## %bb_loop ; CHECK-i386-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-i386-NEXT: movl $0, 16(%esp) @@ -255,7 +253,7 @@ ; CHECK-i386-NEXT: jne LBB2_4 ; CHECK-i386-NEXT: ## %bb.2: ## %cont ; CHECK-i386-NEXT: ## in Loop: Header=BB2_1 Depth=1 -; CHECK-i386-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload +; CHECK-i386-NEXT: fld1 ; CHECK-i386-NEXT: fxch %st(1) ; CHECK-i386-NEXT: fucompp ; CHECK-i386-NEXT: fnstsw %ax @@ -270,7 +268,7 @@ ; CHECK-i386-NEXT: fstp %st(0) ; CHECK-i386-NEXT: movl %ecx, (%esp) ; CHECK-i386-NEXT: calll _free -; CHECK-i386-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload +; CHECK-i386-NEXT: fld1 ; CHECK-i386-NEXT: addl $20, %esp ; CHECK-i386-NEXT: popl %esi ; CHECK-i386-NEXT: popl %edi @@ -470,8 +468,6 @@ ; CHECK-i386-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-i386-NEXT: movl 36(%esp), %esi ; CHECK-i386-NEXT: movl 32(%esp), %edi -; CHECK-i386-NEXT: fld1 -; CHECK-i386-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-i386-NEXT: LBB4_1: ## %bb_loop ; CHECK-i386-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-i386-NEXT: testl %esi, %esi @@ -485,9 +481,8 @@ ; CHECK-i386-NEXT: movb $1, 8(%eax) ; CHECK-i386-NEXT: LBB4_3: ## %bb_cont ; CHECK-i386-NEXT: ## in Loop: Header=BB4_1 Depth=1 +; CHECK-i386-NEXT: fld1 ; CHECK-i386-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload -; CHECK-i386-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload -; CHECK-i386-NEXT: fxch %st(1) ; CHECK-i386-NEXT: fucompp ; CHECK-i386-NEXT: fnstsw %ax ; CHECK-i386-NEXT: ## kill: def $ah killed $ah killed $ax Index: llvm/tools/llvm-reduce/ReducerWorkItem.cpp =================================================================== --- llvm/tools/llvm-reduce/ReducerWorkItem.cpp +++ llvm/tools/llvm-reduce/ReducerWorkItem.cpp @@ -325,7 +325,7 @@ std::memcpy(DstMask, 
SrcMO.getRegMask(), sizeof(*DstMask) * MachineOperand::getRegMaskSize(TRI->getNumRegs())); - DstMO.setRegMask(DstMask); + DstMO.setRegMask(DstMask, DstMI); } } Index: llvm/unittests/CodeGen/MachineOperandTest.cpp =================================================================== --- llvm/unittests/CodeGen/MachineOperandTest.cpp +++ llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -61,7 +61,7 @@ auto MF = createMachineFunction(Ctx, Mod); uint32_t *Dummy = MF->allocateRegMask(); - MachineOperand MO = MachineOperand::CreateRegMask(Dummy); + MachineOperand MO = MachineOperand::CreateRegMask(Dummy, MF.get()); // Checking some preconditions on the newly created // MachineOperand.